chiark / gitweb /
journald: add some additional checks before we divide by values read from journal...
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "journal-authenticate.h"
33 #include "lookup3.h"
34 #include "compress.h"
35 #include "fsprg.h"
36
/* Default sizes, in bytes, of the data and field hash tables
 * (2047 and 333 HashItem slots respectively) */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Data payloads of at least this many bytes are handed to the compressor */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL)           /* 4 MiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the file system
 * size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */

/* n_data was the first entry we added after the initial file format design,
 * hence every valid header is at least large enough to reach up to it */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20

/* How much to increase the journal file size at once each time we allocate something new. */
#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8 MiB */

/* The mmap context to use for the header: we pick the one right above the last defined object type */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX
72
73 static int journal_file_set_online(JournalFile *f) {
74         assert(f);
75
76         if (!f->writable)
77                 return -EPERM;
78
79         if (!(f->fd >= 0 && f->header))
80                 return -EINVAL;
81
82         if (mmap_cache_got_sigbus(f->mmap, f->fd))
83                 return -EIO;
84
85         switch(f->header->state) {
86                 case STATE_ONLINE:
87                         return 0;
88
89                 case STATE_OFFLINE:
90                         f->header->state = STATE_ONLINE;
91                         fsync(f->fd);
92                         return 0;
93
94                 default:
95                         return -EINVAL;
96         }
97 }
98
99 int journal_file_set_offline(JournalFile *f) {
100         assert(f);
101
102         if (!f->writable)
103                 return -EPERM;
104
105         if (!(f->fd >= 0 && f->header))
106                 return -EINVAL;
107
108         if (f->header->state != STATE_ONLINE)
109                 return 0;
110
111         fsync(f->fd);
112
113         if (mmap_cache_got_sigbus(f->mmap, f->fd))
114                 return -EIO;
115
116         f->header->state = STATE_OFFLINE;
117
118         if (mmap_cache_got_sigbus(f->mmap, f->fd))
119                 return -EIO;
120
121         fsync(f->fd);
122
123         return 0;
124 }
125
126 void journal_file_close(JournalFile *f) {
127         assert(f);
128
129 #ifdef HAVE_GCRYPT
130         /* Write the final tag */
131         if (f->seal && f->writable)
132                 journal_file_append_tag(f);
133 #endif
134
135         journal_file_set_offline(f);
136
137         if (f->mmap && f->fd >= 0)
138                 mmap_cache_close_fd(f->mmap, f->fd);
139
140         safe_close(f->fd);
141         free(f->path);
142
143         if (f->mmap)
144                 mmap_cache_unref(f->mmap);
145
146         ordered_hashmap_free_free(f->chain_cache);
147
148 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
149         free(f->compress_buffer);
150 #endif
151
152 #ifdef HAVE_GCRYPT
153         if (f->fss_file)
154                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
155         else if (f->fsprg_state)
156                 free(f->fsprg_state);
157
158         free(f->fsprg_seed);
159
160         if (f->hmac)
161                 gcry_md_close(f->hmac);
162 #endif
163
164         free(f);
165 }
166
167 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
168         Header h = {};
169         ssize_t k;
170         int r;
171
172         assert(f);
173
174         memcpy(h.signature, HEADER_SIGNATURE, 8);
175         h.header_size = htole64(ALIGN64(sizeof(h)));
176
177         h.incompatible_flags |= htole32(
178                 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
179                 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
180
181         h.compatible_flags = htole32(
182                 f->seal * HEADER_COMPATIBLE_SEALED);
183
184         r = sd_id128_randomize(&h.file_id);
185         if (r < 0)
186                 return r;
187
188         if (template) {
189                 h.seqnum_id = template->header->seqnum_id;
190                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
191         } else
192                 h.seqnum_id = h.file_id;
193
194         k = pwrite(f->fd, &h, sizeof(h), 0);
195         if (k < 0)
196                 return -errno;
197
198         if (k != sizeof(h))
199                 return -EIO;
200
201         return 0;
202 }
203
204 static int journal_file_refresh_header(JournalFile *f) {
205         sd_id128_t boot_id;
206         int r;
207
208         assert(f);
209
210         r = sd_id128_get_machine(&f->header->machine_id);
211         if (r < 0)
212                 return r;
213
214         r = sd_id128_get_boot(&boot_id);
215         if (r < 0)
216                 return r;
217
218         if (sd_id128_equal(boot_id, f->header->boot_id))
219                 f->tail_entry_monotonic_valid = true;
220
221         f->header->boot_id = boot_id;
222
223         r = journal_file_set_online(f);
224
225         /* Sync the online state to disk */
226         fsync(f->fd);
227
228         return r;
229 }
230
231 static int journal_file_verify_header(JournalFile *f) {
232         uint32_t flags;
233
234         assert(f);
235
236         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
237                 return -EBADMSG;
238
239         /* In both read and write mode we refuse to open files with
240          * incompatible flags we don't know */
241         flags = le32toh(f->header->incompatible_flags);
242         if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
243                 if (flags & ~HEADER_INCOMPATIBLE_ANY)
244                         log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
245                                   f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
246                 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
247                 if (flags)
248                         log_debug("Journal file %s uses incompatible flags %"PRIx32
249                                   " disabled at compilation time.", f->path, flags);
250                 return -EPROTONOSUPPORT;
251         }
252
253         /* When open for writing we refuse to open files with
254          * compatible flags, too */
255         flags = le32toh(f->header->compatible_flags);
256         if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
257                 if (flags & ~HEADER_COMPATIBLE_ANY)
258                         log_debug("Journal file %s has unknown compatible flags %"PRIx32,
259                                   f->path, flags & ~HEADER_COMPATIBLE_ANY);
260                 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
261                 if (flags)
262                         log_debug("Journal file %s uses compatible flags %"PRIx32
263                                   " disabled at compilation time.", f->path, flags);
264                 return -EPROTONOSUPPORT;
265         }
266
267         if (f->header->state >= _STATE_MAX)
268                 return -EBADMSG;
269
270         /* The first addition was n_data, so check that we are at least this large */
271         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
272                 return -EBADMSG;
273
274         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
275                 return -EBADMSG;
276
277         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
278                 return -ENODATA;
279
280         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
281                 return -ENODATA;
282
283         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
284             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
285             !VALID64(le64toh(f->header->tail_object_offset)) ||
286             !VALID64(le64toh(f->header->entry_array_offset)))
287                 return -ENODATA;
288
289         if (f->writable) {
290                 uint8_t state;
291                 sd_id128_t machine_id;
292                 int r;
293
294                 r = sd_id128_get_machine(&machine_id);
295                 if (r < 0)
296                         return r;
297
298                 if (!sd_id128_equal(machine_id, f->header->machine_id))
299                         return -EHOSTDOWN;
300
301                 state = f->header->state;
302
303                 if (state == STATE_ONLINE) {
304                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
305                         return -EBUSY;
306                 } else if (state == STATE_ARCHIVED)
307                         return -ESHUTDOWN;
308                 else if (state != STATE_OFFLINE) {
309                         log_debug("Journal file %s has unknown state %u.", f->path, state);
310                         return -EBUSY;
311                 }
312         }
313
314         f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
315         f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
316
317         f->seal = JOURNAL_HEADER_SEALED(f->header);
318
319         return 0;
320 }
321
322 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
323         uint64_t old_size, new_size;
324         int r;
325
326         assert(f);
327
328         /* We assume that this file is not sparse, and we know that
329          * for sure, since we always call posix_fallocate()
330          * ourselves */
331
332         if (mmap_cache_got_sigbus(f->mmap, f->fd))
333                 return -EIO;
334
335         old_size =
336                 le64toh(f->header->header_size) +
337                 le64toh(f->header->arena_size);
338
339         new_size = PAGE_ALIGN(offset + size);
340         if (new_size < le64toh(f->header->header_size))
341                 new_size = le64toh(f->header->header_size);
342
343         if (new_size <= old_size)
344                 return 0;
345
346         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
347                 return -E2BIG;
348
349         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
350                 struct statvfs svfs;
351
352                 if (fstatvfs(f->fd, &svfs) >= 0) {
353                         uint64_t available;
354
355                         available = svfs.f_bfree * svfs.f_bsize;
356
357                         if (available >= f->metrics.keep_free)
358                                 available -= f->metrics.keep_free;
359                         else
360                                 available = 0;
361
362                         if (new_size - old_size > available)
363                                 return -E2BIG;
364                 }
365         }
366
367         /* Increase by larger blocks at once */
368         new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
369         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
370                 new_size = f->metrics.max_size;
371
372         /* Note that the glibc fallocate() fallback is very
373            inefficient, hence we try to minimize the allocation area
374            as we can. */
375         r = posix_fallocate(f->fd, old_size, new_size - old_size);
376         if (r != 0)
377                 return -r;
378
379         if (fstat(f->fd, &f->last_stat) < 0)
380                 return -errno;
381
382         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
383
384         return 0;
385 }
386
387 static unsigned type_to_context(ObjectType type) {
388         /* One context for each type, plus one catch-all for the rest */
389         assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
390         assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
391         return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
392 }
393
394 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
395         assert(f);
396         assert(ret);
397
398         if (size <= 0)
399                 return -EINVAL;
400
401         /* Avoid SIGBUS on invalid accesses */
402         if (offset + size > (uint64_t) f->last_stat.st_size) {
403                 /* Hmm, out of range? Let's refresh the fstat() data
404                  * first, before we trust that check. */
405
406                 if (fstat(f->fd, &f->last_stat) < 0 ||
407                     offset + size > (uint64_t) f->last_stat.st_size)
408                         return -EADDRNOTAVAIL;
409         }
410
411         return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
412 }
413
414 static uint64_t minimum_header_size(Object *o) {
415
416         static const uint64_t table[] = {
417                 [OBJECT_DATA] = sizeof(DataObject),
418                 [OBJECT_FIELD] = sizeof(FieldObject),
419                 [OBJECT_ENTRY] = sizeof(EntryObject),
420                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
421                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
422                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
423                 [OBJECT_TAG] = sizeof(TagObject),
424         };
425
426         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
427                 return sizeof(ObjectHeader);
428
429         return table[o->object.type];
430 }
431
432 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
433         int r;
434         void *t;
435         Object *o;
436         uint64_t s;
437
438         assert(f);
439         assert(ret);
440
441         /* Objects may only be located at multiple of 64 bit */
442         if (!VALID64(offset))
443                 return -EFAULT;
444
445         r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
446         if (r < 0)
447                 return r;
448
449         o = (Object*) t;
450         s = le64toh(o->object.size);
451
452         if (s < sizeof(ObjectHeader))
453                 return -EBADMSG;
454
455         if (o->object.type <= OBJECT_UNUSED)
456                 return -EBADMSG;
457
458         if (s < minimum_header_size(o))
459                 return -EBADMSG;
460
461         if (type > OBJECT_UNUSED && o->object.type != type)
462                 return -EBADMSG;
463
464         if (s > sizeof(ObjectHeader)) {
465                 r = journal_file_move_to(f, type, false, offset, s, &t);
466                 if (r < 0)
467                         return r;
468
469                 o = (Object*) t;
470         }
471
472         *ret = o;
473         return 0;
474 }
475
476 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
477         uint64_t r;
478
479         assert(f);
480
481         r = le64toh(f->header->tail_entry_seqnum) + 1;
482
483         if (seqnum) {
484                 /* If an external seqnum counter was passed, we update
485                  * both the local and the external one, and set it to
486                  * the maximum of both */
487
488                 if (*seqnum + 1 > r)
489                         r = *seqnum + 1;
490
491                 *seqnum = r;
492         }
493
494         f->header->tail_entry_seqnum = htole64(r);
495
496         if (f->header->head_entry_seqnum == 0)
497                 f->header->head_entry_seqnum = htole64(r);
498
499         return r;
500 }
501
502 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
503         int r;
504         uint64_t p;
505         Object *tail, *o;
506         void *t;
507
508         assert(f);
509         assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
510         assert(size >= sizeof(ObjectHeader));
511         assert(offset);
512         assert(ret);
513
514         r = journal_file_set_online(f);
515         if (r < 0)
516                 return r;
517
518         p = le64toh(f->header->tail_object_offset);
519         if (p == 0)
520                 p = le64toh(f->header->header_size);
521         else {
522                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
523                 if (r < 0)
524                         return r;
525
526                 p += ALIGN64(le64toh(tail->object.size));
527         }
528
529         r = journal_file_allocate(f, p, size);
530         if (r < 0)
531                 return r;
532
533         r = journal_file_move_to(f, type, false, p, size, &t);
534         if (r < 0)
535                 return r;
536
537         o = (Object*) t;
538
539         zero(o->object);
540         o->object.type = type;
541         o->object.size = htole64(size);
542
543         f->header->tail_object_offset = htole64(p);
544         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
545
546         *ret = o;
547         *offset = p;
548
549         return 0;
550 }
551
552 static int journal_file_setup_data_hash_table(JournalFile *f) {
553         uint64_t s, p;
554         Object *o;
555         int r;
556
557         assert(f);
558
559         /* We estimate that we need 1 hash table entry per 768 of
560            journal file and we want to make sure we never get beyond
561            75% fill level. Calculate the hash table size for the
562            maximum file size based on these metrics. */
563
564         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
565         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
566                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
567
568         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
569
570         r = journal_file_append_object(f,
571                                        OBJECT_DATA_HASH_TABLE,
572                                        offsetof(Object, hash_table.items) + s,
573                                        &o, &p);
574         if (r < 0)
575                 return r;
576
577         memzero(o->hash_table.items, s);
578
579         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
580         f->header->data_hash_table_size = htole64(s);
581
582         return 0;
583 }
584
585 static int journal_file_setup_field_hash_table(JournalFile *f) {
586         uint64_t s, p;
587         Object *o;
588         int r;
589
590         assert(f);
591
592         /* We use a fixed size hash table for the fields as this
593          * number should grow very slowly only */
594
595         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
596         r = journal_file_append_object(f,
597                                        OBJECT_FIELD_HASH_TABLE,
598                                        offsetof(Object, hash_table.items) + s,
599                                        &o, &p);
600         if (r < 0)
601                 return r;
602
603         memzero(o->hash_table.items, s);
604
605         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
606         f->header->field_hash_table_size = htole64(s);
607
608         return 0;
609 }
610
611 static int journal_file_map_data_hash_table(JournalFile *f) {
612         uint64_t s, p;
613         void *t;
614         int r;
615
616         assert(f);
617
618         p = le64toh(f->header->data_hash_table_offset);
619         s = le64toh(f->header->data_hash_table_size);
620
621         r = journal_file_move_to(f,
622                                  OBJECT_DATA_HASH_TABLE,
623                                  true,
624                                  p, s,
625                                  &t);
626         if (r < 0)
627                 return r;
628
629         f->data_hash_table = t;
630         return 0;
631 }
632
633 static int journal_file_map_field_hash_table(JournalFile *f) {
634         uint64_t s, p;
635         void *t;
636         int r;
637
638         assert(f);
639
640         p = le64toh(f->header->field_hash_table_offset);
641         s = le64toh(f->header->field_hash_table_size);
642
643         r = journal_file_move_to(f,
644                                  OBJECT_FIELD_HASH_TABLE,
645                                  true,
646                                  p, s,
647                                  &t);
648         if (r < 0)
649                 return r;
650
651         f->field_hash_table = t;
652         return 0;
653 }
654
655 static int journal_file_link_field(
656                 JournalFile *f,
657                 Object *o,
658                 uint64_t offset,
659                 uint64_t hash) {
660
661         uint64_t p, h, m;
662         int r;
663
664         assert(f);
665         assert(o);
666         assert(offset > 0);
667
668         if (o->object.type != OBJECT_FIELD)
669                 return -EINVAL;
670
671         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
672         if (m <= 0)
673                 return -EBADMSG;
674
675         /* This might alter the window we are looking at */
676         o->field.next_hash_offset = o->field.head_data_offset = 0;
677
678         h = hash % m;
679         p = le64toh(f->field_hash_table[h].tail_hash_offset);
680         if (p == 0)
681                 f->field_hash_table[h].head_hash_offset = htole64(offset);
682         else {
683                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
684                 if (r < 0)
685                         return r;
686
687                 o->field.next_hash_offset = htole64(offset);
688         }
689
690         f->field_hash_table[h].tail_hash_offset = htole64(offset);
691
692         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
693                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
694
695         return 0;
696 }
697
698 static int journal_file_link_data(
699                 JournalFile *f,
700                 Object *o,
701                 uint64_t offset,
702                 uint64_t hash) {
703
704         uint64_t p, h, m;
705         int r;
706
707         assert(f);
708         assert(o);
709         assert(offset > 0);
710
711         if (o->object.type != OBJECT_DATA)
712                 return -EINVAL;
713
714         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
715         if (m <= 0)
716                 return -EBADMSG;
717
718         /* This might alter the window we are looking at */
719         o->data.next_hash_offset = o->data.next_field_offset = 0;
720         o->data.entry_offset = o->data.entry_array_offset = 0;
721         o->data.n_entries = 0;
722
723         h = hash % m;
724         p = le64toh(f->data_hash_table[h].tail_hash_offset);
725         if (p == 0)
726                 /* Only entry in the hash table is easy */
727                 f->data_hash_table[h].head_hash_offset = htole64(offset);
728         else {
729                 /* Move back to the previous data object, to patch in
730                  * pointer */
731
732                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
733                 if (r < 0)
734                         return r;
735
736                 o->data.next_hash_offset = htole64(offset);
737         }
738
739         f->data_hash_table[h].tail_hash_offset = htole64(offset);
740
741         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
742                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
743
744         return 0;
745 }
746
747 int journal_file_find_field_object_with_hash(
748                 JournalFile *f,
749                 const void *field, uint64_t size, uint64_t hash,
750                 Object **ret, uint64_t *offset) {
751
752         uint64_t p, osize, h, m;
753         int r;
754
755         assert(f);
756         assert(field && size > 0);
757
758         osize = offsetof(Object, field.payload) + size;
759
760         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
761
762         if (m <= 0)
763                 return -EBADMSG;
764
765         h = hash % m;
766         p = le64toh(f->field_hash_table[h].head_hash_offset);
767
768         while (p > 0) {
769                 Object *o;
770
771                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
772                 if (r < 0)
773                         return r;
774
775                 if (le64toh(o->field.hash) == hash &&
776                     le64toh(o->object.size) == osize &&
777                     memcmp(o->field.payload, field, size) == 0) {
778
779                         if (ret)
780                                 *ret = o;
781                         if (offset)
782                                 *offset = p;
783
784                         return 1;
785                 }
786
787                 p = le64toh(o->field.next_hash_offset);
788         }
789
790         return 0;
791 }
792
793 int journal_file_find_field_object(
794                 JournalFile *f,
795                 const void *field, uint64_t size,
796                 Object **ret, uint64_t *offset) {
797
798         uint64_t hash;
799
800         assert(f);
801         assert(field && size > 0);
802
803         hash = hash64(field, size);
804
805         return journal_file_find_field_object_with_hash(f,
806                                                         field, size, hash,
807                                                         ret, offset);
808 }
809
810 int journal_file_find_data_object_with_hash(
811                 JournalFile *f,
812                 const void *data, uint64_t size, uint64_t hash,
813                 Object **ret, uint64_t *offset) {
814
815         uint64_t p, osize, h, m;
816         int r;
817
818         assert(f);
819         assert(data || size == 0);
820
821         osize = offsetof(Object, data.payload) + size;
822
823         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
824         if (m <= 0)
825                 return -EBADMSG;
826
827         h = hash % m;
828         p = le64toh(f->data_hash_table[h].head_hash_offset);
829
830         while (p > 0) {
831                 Object *o;
832
833                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
834                 if (r < 0)
835                         return r;
836
837                 if (le64toh(o->data.hash) != hash)
838                         goto next;
839
840                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
841 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
842                         uint64_t l;
843                         size_t rsize;
844
845                         l = le64toh(o->object.size);
846                         if (l <= offsetof(Object, data.payload))
847                                 return -EBADMSG;
848
849                         l -= offsetof(Object, data.payload);
850
851                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
852                                             o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
853                         if (r < 0)
854                                 return r;
855
856                         if (rsize == size &&
857                             memcmp(f->compress_buffer, data, size) == 0) {
858
859                                 if (ret)
860                                         *ret = o;
861
862                                 if (offset)
863                                         *offset = p;
864
865                                 return 1;
866                         }
867 #else
868                         return -EPROTONOSUPPORT;
869 #endif
870                 } else if (le64toh(o->object.size) == osize &&
871                            memcmp(o->data.payload, data, size) == 0) {
872
873                         if (ret)
874                                 *ret = o;
875
876                         if (offset)
877                                 *offset = p;
878
879                         return 1;
880                 }
881
882         next:
883                 p = le64toh(o->data.next_hash_offset);
884         }
885
886         return 0;
887 }
888
889 int journal_file_find_data_object(
890                 JournalFile *f,
891                 const void *data, uint64_t size,
892                 Object **ret, uint64_t *offset) {
893
894         uint64_t hash;
895
896         assert(f);
897         assert(data || size == 0);
898
899         hash = hash64(data, size);
900
901         return journal_file_find_data_object_with_hash(f,
902                                                        data, size, hash,
903                                                        ret, offset);
904 }
905
906 static int journal_file_append_field(
907                 JournalFile *f,
908                 const void *field, uint64_t size,
909                 Object **ret, uint64_t *offset) {
910
911         uint64_t hash, p;
912         uint64_t osize;
913         Object *o;
914         int r;
915
916         assert(f);
917         assert(field && size > 0);
918
919         hash = hash64(field, size);
920
921         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
922         if (r < 0)
923                 return r;
924         else if (r > 0) {
925
926                 if (ret)
927                         *ret = o;
928
929                 if (offset)
930                         *offset = p;
931
932                 return 0;
933         }
934
935         osize = offsetof(Object, field.payload) + size;
936         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
937         if (r < 0)
938                 return r;
939
940         o->field.hash = htole64(hash);
941         memcpy(o->field.payload, field, size);
942
943         r = journal_file_link_field(f, o, p, hash);
944         if (r < 0)
945                 return r;
946
947         /* The linking might have altered the window, so let's
948          * refresh our pointer */
949         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
950         if (r < 0)
951                 return r;
952
953 #ifdef HAVE_GCRYPT
954         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
955         if (r < 0)
956                 return r;
957 #endif
958
959         if (ret)
960                 *ret = o;
961
962         if (offset)
963                 *offset = p;
964
965         return 0;
966 }
967
968 static int journal_file_append_data(
969                 JournalFile *f,
970                 const void *data, uint64_t size,
971                 Object **ret, uint64_t *offset) {
972
973         uint64_t hash, p;
974         uint64_t osize;
975         Object *o;
976         int r, compression = 0;
977         const void *eq;
978
979         assert(f);
980         assert(data || size == 0);
981
982         hash = hash64(data, size);
983
984         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
985         if (r < 0)
986                 return r;
987         else if (r > 0) {
988
989                 if (ret)
990                         *ret = o;
991
992                 if (offset)
993                         *offset = p;
994
995                 return 0;
996         }
997
998         osize = offsetof(Object, data.payload) + size;
999         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1000         if (r < 0)
1001                 return r;
1002
1003         o->data.hash = htole64(hash);
1004
1005 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1006         if (f->compress_xz &&
1007             size >= COMPRESSION_SIZE_THRESHOLD) {
1008                 size_t rsize;
1009
1010                 compression = compress_blob(data, size, o->data.payload, &rsize);
1011
1012                 if (compression) {
1013                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1014                         o->object.flags |= compression;
1015
1016                         log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1017                                   size, rsize, object_compressed_to_string(compression));
1018                 }
1019         }
1020 #endif
1021
1022         if (!compression && size > 0)
1023                 memcpy(o->data.payload, data, size);
1024
1025         r = journal_file_link_data(f, o, p, hash);
1026         if (r < 0)
1027                 return r;
1028
1029         /* The linking might have altered the window, so let's
1030          * refresh our pointer */
1031         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1032         if (r < 0)
1033                 return r;
1034
1035         if (!data)
1036                 eq = NULL;
1037         else
1038                 eq = memchr(data, '=', size);
1039         if (eq && eq > data) {
1040                 Object *fo = NULL;
1041                 uint64_t fp;
1042
1043                 /* Create field object ... */
1044                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1045                 if (r < 0)
1046                         return r;
1047
1048                 /* ... and link it in. */
1049                 o->data.next_field_offset = fo->field.head_data_offset;
1050                 fo->field.head_data_offset = le64toh(p);
1051         }
1052
1053 #ifdef HAVE_GCRYPT
1054         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1055         if (r < 0)
1056                 return r;
1057 #endif
1058
1059         if (ret)
1060                 *ret = o;
1061
1062         if (offset)
1063                 *offset = p;
1064
1065         return 0;
1066 }
1067
1068 uint64_t journal_file_entry_n_items(Object *o) {
1069         assert(o);
1070
1071         if (o->object.type != OBJECT_ENTRY)
1072                 return 0;
1073
1074         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1075 }
1076
1077 uint64_t journal_file_entry_array_n_items(Object *o) {
1078         assert(o);
1079
1080         if (o->object.type != OBJECT_ENTRY_ARRAY)
1081                 return 0;
1082
1083         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1084 }
1085
1086 uint64_t journal_file_hash_table_n_items(Object *o) {
1087         assert(o);
1088
1089         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1090             o->object.type != OBJECT_FIELD_HASH_TABLE)
1091                 return 0;
1092
1093         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1094 }
1095
1096 static int link_entry_into_array(JournalFile *f,
1097                                  le64_t *first,
1098                                  le64_t *idx,
1099                                  uint64_t p) {
1100         int r;
1101         uint64_t n = 0, ap = 0, q, i, a, hidx;
1102         Object *o;
1103
1104         assert(f);
1105         assert(first);
1106         assert(idx);
1107         assert(p > 0);
1108
1109         a = le64toh(*first);
1110         i = hidx = le64toh(*idx);
1111         while (a > 0) {
1112
1113                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1114                 if (r < 0)
1115                         return r;
1116
1117                 n = journal_file_entry_array_n_items(o);
1118                 if (i < n) {
1119                         o->entry_array.items[i] = htole64(p);
1120                         *idx = htole64(hidx + 1);
1121                         return 0;
1122                 }
1123
1124                 i -= n;
1125                 ap = a;
1126                 a = le64toh(o->entry_array.next_entry_array_offset);
1127         }
1128
1129         if (hidx > n)
1130                 n = (hidx+1) * 2;
1131         else
1132                 n = n * 2;
1133
1134         if (n < 4)
1135                 n = 4;
1136
1137         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1138                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1139                                        &o, &q);
1140         if (r < 0)
1141                 return r;
1142
1143 #ifdef HAVE_GCRYPT
1144         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1145         if (r < 0)
1146                 return r;
1147 #endif
1148
1149         o->entry_array.items[i] = htole64(p);
1150
1151         if (ap == 0)
1152                 *first = htole64(q);
1153         else {
1154                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1155                 if (r < 0)
1156                         return r;
1157
1158                 o->entry_array.next_entry_array_offset = htole64(q);
1159         }
1160
1161         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1162                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1163
1164         *idx = htole64(hidx + 1);
1165
1166         return 0;
1167 }
1168
1169 static int link_entry_into_array_plus_one(JournalFile *f,
1170                                           le64_t *extra,
1171                                           le64_t *first,
1172                                           le64_t *idx,
1173                                           uint64_t p) {
1174
1175         int r;
1176
1177         assert(f);
1178         assert(extra);
1179         assert(first);
1180         assert(idx);
1181         assert(p > 0);
1182
1183         if (*idx == 0)
1184                 *extra = htole64(p);
1185         else {
1186                 le64_t i;
1187
1188                 i = htole64(le64toh(*idx) - 1);
1189                 r = link_entry_into_array(f, first, &i, p);
1190                 if (r < 0)
1191                         return r;
1192         }
1193
1194         *idx = htole64(le64toh(*idx) + 1);
1195         return 0;
1196 }
1197
1198 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1199         uint64_t p;
1200         int r;
1201         assert(f);
1202         assert(o);
1203         assert(offset > 0);
1204
1205         p = le64toh(o->entry.items[i].object_offset);
1206         if (p == 0)
1207                 return -EINVAL;
1208
1209         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1210         if (r < 0)
1211                 return r;
1212
1213         return link_entry_into_array_plus_one(f,
1214                                               &o->data.entry_offset,
1215                                               &o->data.entry_array_offset,
1216                                               &o->data.n_entries,
1217                                               offset);
1218 }
1219
1220 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1221         uint64_t n, i;
1222         int r;
1223
1224         assert(f);
1225         assert(o);
1226         assert(offset > 0);
1227
1228         if (o->object.type != OBJECT_ENTRY)
1229                 return -EINVAL;
1230
1231         __sync_synchronize();
1232
1233         /* Link up the entry itself */
1234         r = link_entry_into_array(f,
1235                                   &f->header->entry_array_offset,
1236                                   &f->header->n_entries,
1237                                   offset);
1238         if (r < 0)
1239                 return r;
1240
1241         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1242
1243         if (f->header->head_entry_realtime == 0)
1244                 f->header->head_entry_realtime = o->entry.realtime;
1245
1246         f->header->tail_entry_realtime = o->entry.realtime;
1247         f->header->tail_entry_monotonic = o->entry.monotonic;
1248
1249         f->tail_entry_monotonic_valid = true;
1250
1251         /* Link up the items */
1252         n = journal_file_entry_n_items(o);
1253         for (i = 0; i < n; i++) {
1254                 r = journal_file_link_entry_item(f, o, offset, i);
1255                 if (r < 0)
1256                         return r;
1257         }
1258
1259         return 0;
1260 }
1261
1262 static int journal_file_append_entry_internal(
1263                 JournalFile *f,
1264                 const dual_timestamp *ts,
1265                 uint64_t xor_hash,
1266                 const EntryItem items[], unsigned n_items,
1267                 uint64_t *seqnum,
1268                 Object **ret, uint64_t *offset) {
1269         uint64_t np;
1270         uint64_t osize;
1271         Object *o;
1272         int r;
1273
1274         assert(f);
1275         assert(items || n_items == 0);
1276         assert(ts);
1277
1278         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1279
1280         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1281         if (r < 0)
1282                 return r;
1283
1284         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1285         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1286         o->entry.realtime = htole64(ts->realtime);
1287         o->entry.monotonic = htole64(ts->monotonic);
1288         o->entry.xor_hash = htole64(xor_hash);
1289         o->entry.boot_id = f->header->boot_id;
1290
1291 #ifdef HAVE_GCRYPT
1292         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1293         if (r < 0)
1294                 return r;
1295 #endif
1296
1297         r = journal_file_link_entry(f, o, np);
1298         if (r < 0)
1299                 return r;
1300
1301         if (ret)
1302                 *ret = o;
1303
1304         if (offset)
1305                 *offset = np;
1306
1307         return 0;
1308 }
1309
1310 void journal_file_post_change(JournalFile *f) {
1311         assert(f);
1312
1313         /* inotify() does not receive IN_MODIFY events from file
1314          * accesses done via mmap(). After each access we hence
1315          * trigger IN_MODIFY by truncating the journal file to its
1316          * current size which triggers IN_MODIFY. */
1317
1318         __sync_synchronize();
1319
1320         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1321                 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1322 }
1323
1324 static int entry_item_cmp(const void *_a, const void *_b) {
1325         const EntryItem *a = _a, *b = _b;
1326
1327         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1328                 return -1;
1329         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1330                 return 1;
1331         return 0;
1332 }
1333
1334 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1335         unsigned i;
1336         EntryItem *items;
1337         int r;
1338         uint64_t xor_hash = 0;
1339         struct dual_timestamp _ts;
1340
1341         assert(f);
1342         assert(iovec || n_iovec == 0);
1343
1344         if (!ts) {
1345                 dual_timestamp_get(&_ts);
1346                 ts = &_ts;
1347         }
1348
1349         if (f->tail_entry_monotonic_valid &&
1350             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1351                 return -EINVAL;
1352
1353 #ifdef HAVE_GCRYPT
1354         r = journal_file_maybe_append_tag(f, ts->realtime);
1355         if (r < 0)
1356                 return r;
1357 #endif
1358
1359         /* alloca() can't take 0, hence let's allocate at least one */
1360         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1361
1362         for (i = 0; i < n_iovec; i++) {
1363                 uint64_t p;
1364                 Object *o;
1365
1366                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1367                 if (r < 0)
1368                         return r;
1369
1370                 xor_hash ^= le64toh(o->data.hash);
1371                 items[i].object_offset = htole64(p);
1372                 items[i].hash = o->data.hash;
1373         }
1374
1375         /* Order by the position on disk, in order to improve seek
1376          * times for rotating media. */
1377         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1378
1379         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1380
1381         /* If the memory mapping triggered a SIGBUS then we return an
1382          * IO error and ignore the error code passed down to us, since
1383          * it is very likely just an effect of a nullified replacement
1384          * mapping page */
1385
1386         if (mmap_cache_got_sigbus(f->mmap, f->fd))
1387                 r = -EIO;
1388
1389         journal_file_post_change(f);
1390
1391         return r;
1392 }
1393
/* One cached position within a chain of entry arrays, keyed by the offset
 * of the chain's first array. */
typedef struct ChainCacheItem {
        /* offset of the array at the beginning of the chain */
        uint64_t first;

        /* offset of the cached array */
        uint64_t array;

        /* the first item stored in the cached array */
        uint64_t begin;

        /* number of items in all arrays of the chain before the cached one */
        uint64_t total;

        /* the index we looked at last, used to keep bisection local */
        uint64_t last_index;
} ChainCacheItem;
1401
1402 static void chain_cache_put(
1403                 OrderedHashmap *h,
1404                 ChainCacheItem *ci,
1405                 uint64_t first,
1406                 uint64_t array,
1407                 uint64_t begin,
1408                 uint64_t total,
1409                 uint64_t last_index) {
1410
1411         if (!ci) {
1412                 /* If the chain item to cache for this chain is the
1413                  * first one it's not worth caching anything */
1414                 if (array == first)
1415                         return;
1416
1417                 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1418                         ci = ordered_hashmap_steal_first(h);
1419                         assert(ci);
1420                 } else {
1421                         ci = new(ChainCacheItem, 1);
1422                         if (!ci)
1423                                 return;
1424                 }
1425
1426                 ci->first = first;
1427
1428                 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1429                         free(ci);
1430                         return;
1431                 }
1432         } else
1433                 assert(ci->first == first);
1434
1435         ci->array = array;
1436         ci->begin = begin;
1437         ci->total = total;
1438         ci->last_index = last_index;
1439 }
1440
1441 static int generic_array_get(
1442                 JournalFile *f,
1443                 uint64_t first,
1444                 uint64_t i,
1445                 Object **ret, uint64_t *offset) {
1446
1447         Object *o;
1448         uint64_t p = 0, a, t = 0;
1449         int r;
1450         ChainCacheItem *ci;
1451
1452         assert(f);
1453
1454         a = first;
1455
1456         /* Try the chain cache first */
1457         ci = ordered_hashmap_get(f->chain_cache, &first);
1458         if (ci && i > ci->total) {
1459                 a = ci->array;
1460                 i -= ci->total;
1461                 t = ci->total;
1462         }
1463
1464         while (a > 0) {
1465                 uint64_t k;
1466
1467                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1468                 if (r < 0)
1469                         return r;
1470
1471                 k = journal_file_entry_array_n_items(o);
1472                 if (i < k) {
1473                         p = le64toh(o->entry_array.items[i]);
1474                         goto found;
1475                 }
1476
1477                 i -= k;
1478                 t += k;
1479                 a = le64toh(o->entry_array.next_entry_array_offset);
1480         }
1481
1482         return 0;
1483
1484 found:
1485         /* Let's cache this item for the next invocation */
1486         chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1487
1488         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1489         if (r < 0)
1490                 return r;
1491
1492         if (ret)
1493                 *ret = o;
1494
1495         if (offset)
1496                 *offset = p;
1497
1498         return 1;
1499 }
1500
1501 static int generic_array_get_plus_one(
1502                 JournalFile *f,
1503                 uint64_t extra,
1504                 uint64_t first,
1505                 uint64_t i,
1506                 Object **ret, uint64_t *offset) {
1507
1508         Object *o;
1509
1510         assert(f);
1511
1512         if (i == 0) {
1513                 int r;
1514
1515                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1516                 if (r < 0)
1517                         return r;
1518
1519                 if (ret)
1520                         *ret = o;
1521
1522                 if (offset)
1523                         *offset = extra;
1524
1525                 return 1;
1526         }
1527
1528         return generic_array_get(f, first, i-1, ret, offset);
1529 }
1530
/* Results of the test_object callbacks used when bisecting: the tested
 * entry either matches the needle, or is smaller (TEST_LEFT) or larger
 * (TEST_RIGHT) than it. */
enum {
        TEST_FOUND = 0,
        TEST_LEFT  = 1,
        TEST_RIGHT = 2
};
1536
1537 static int generic_array_bisect(
1538                 JournalFile *f,
1539                 uint64_t first,
1540                 uint64_t n,
1541                 uint64_t needle,
1542                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1543                 direction_t direction,
1544                 Object **ret,
1545                 uint64_t *offset,
1546                 uint64_t *idx) {
1547
1548         uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1549         bool subtract_one = false;
1550         Object *o, *array = NULL;
1551         int r;
1552         ChainCacheItem *ci;
1553
1554         assert(f);
1555         assert(test_object);
1556
1557         /* Start with the first array in the chain */
1558         a = first;
1559
1560         ci = ordered_hashmap_get(f->chain_cache, &first);
1561         if (ci && n > ci->total) {
1562                 /* Ah, we have iterated this bisection array chain
1563                  * previously! Let's see if we can skip ahead in the
1564                  * chain, as far as the last time. But we can't jump
1565                  * backwards in the chain, so let's check that
1566                  * first. */
1567
1568                 r = test_object(f, ci->begin, needle);
1569                 if (r < 0)
1570                         return r;
1571
1572                 if (r == TEST_LEFT) {
1573                         /* OK, what we are looking for is right of the
1574                          * begin of this EntryArray, so let's jump
1575                          * straight to previously cached array in the
1576                          * chain */
1577
1578                         a = ci->array;
1579                         n -= ci->total;
1580                         t = ci->total;
1581                         last_index = ci->last_index;
1582                 }
1583         }
1584
1585         while (a > 0) {
1586                 uint64_t left, right, k, lp;
1587
1588                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1589                 if (r < 0)
1590                         return r;
1591
1592                 k = journal_file_entry_array_n_items(array);
1593                 right = MIN(k, n);
1594                 if (right <= 0)
1595                         return 0;
1596
1597                 i = right - 1;
1598                 lp = p = le64toh(array->entry_array.items[i]);
1599                 if (p <= 0)
1600                         return -EBADMSG;
1601
1602                 r = test_object(f, p, needle);
1603                 if (r < 0)
1604                         return r;
1605
1606                 if (r == TEST_FOUND)
1607                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1608
1609                 if (r == TEST_RIGHT) {
1610                         left = 0;
1611                         right -= 1;
1612
1613                         if (last_index != (uint64_t) -1) {
1614                                 assert(last_index <= right);
1615
1616                                 /* If we cached the last index we
1617                                  * looked at, let's try to not to jump
1618                                  * too wildly around and see if we can
1619                                  * limit the range to look at early to
1620                                  * the immediate neighbors of the last
1621                                  * index we looked at. */
1622
1623                                 if (last_index > 0) {
1624                                         uint64_t x = last_index - 1;
1625
1626                                         p = le64toh(array->entry_array.items[x]);
1627                                         if (p <= 0)
1628                                                 return -EBADMSG;
1629
1630                                         r = test_object(f, p, needle);
1631                                         if (r < 0)
1632                                                 return r;
1633
1634                                         if (r == TEST_FOUND)
1635                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1636
1637                                         if (r == TEST_RIGHT)
1638                                                 right = x;
1639                                         else
1640                                                 left = x + 1;
1641                                 }
1642
1643                                 if (last_index < right) {
1644                                         uint64_t y = last_index + 1;
1645
1646                                         p = le64toh(array->entry_array.items[y]);
1647                                         if (p <= 0)
1648                                                 return -EBADMSG;
1649
1650                                         r = test_object(f, p, needle);
1651                                         if (r < 0)
1652                                                 return r;
1653
1654                                         if (r == TEST_FOUND)
1655                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1656
1657                                         if (r == TEST_RIGHT)
1658                                                 right = y;
1659                                         else
1660                                                 left = y + 1;
1661                                 }
1662                         }
1663
1664                         for (;;) {
1665                                 if (left == right) {
1666                                         if (direction == DIRECTION_UP)
1667                                                 subtract_one = true;
1668
1669                                         i = left;
1670                                         goto found;
1671                                 }
1672
1673                                 assert(left < right);
1674                                 i = (left + right) / 2;
1675
1676                                 p = le64toh(array->entry_array.items[i]);
1677                                 if (p <= 0)
1678                                         return -EBADMSG;
1679
1680                                 r = test_object(f, p, needle);
1681                                 if (r < 0)
1682                                         return r;
1683
1684                                 if (r == TEST_FOUND)
1685                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1686
1687                                 if (r == TEST_RIGHT)
1688                                         right = i;
1689                                 else
1690                                         left = i + 1;
1691                         }
1692                 }
1693
1694                 if (k >= n) {
1695                         if (direction == DIRECTION_UP) {
1696                                 i = n;
1697                                 subtract_one = true;
1698                                 goto found;
1699                         }
1700
1701                         return 0;
1702                 }
1703
1704                 last_p = lp;
1705
1706                 n -= k;
1707                 t += k;
1708                 last_index = (uint64_t) -1;
1709                 a = le64toh(array->entry_array.next_entry_array_offset);
1710         }
1711
1712         return 0;
1713
1714 found:
1715         if (subtract_one && t == 0 && i == 0)
1716                 return 0;
1717
1718         /* Let's cache this item for the next invocation */
1719         chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1720
1721         if (subtract_one && i == 0)
1722                 p = last_p;
1723         else if (subtract_one)
1724                 p = le64toh(array->entry_array.items[i-1]);
1725         else
1726                 p = le64toh(array->entry_array.items[i]);
1727
1728         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1729         if (r < 0)
1730                 return r;
1731
1732         if (ret)
1733                 *ret = o;
1734
1735         if (offset)
1736                 *offset = p;
1737
1738         if (idx)
1739                 *idx = t + i + (subtract_one ? -1 : 0);
1740
1741         return 1;
1742 }
1743
1744 static int generic_array_bisect_plus_one(
1745                 JournalFile *f,
1746                 uint64_t extra,
1747                 uint64_t first,
1748                 uint64_t n,
1749                 uint64_t needle,
1750                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1751                 direction_t direction,
1752                 Object **ret,
1753                 uint64_t *offset,
1754                 uint64_t *idx) {
1755
1756         int r;
1757         bool step_back = false;
1758         Object *o;
1759
1760         assert(f);
1761         assert(test_object);
1762
1763         if (n <= 0)
1764                 return 0;
1765
1766         /* This bisects the array in object 'first', but first checks
1767          * an extra  */
1768         r = test_object(f, extra, needle);
1769         if (r < 0)
1770                 return r;
1771
1772         if (r == TEST_FOUND)
1773                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1774
1775         /* if we are looking with DIRECTION_UP then we need to first
1776            see if in the actual array there is a matching entry, and
1777            return the last one of that. But if there isn't any we need
1778            to return this one. Hence remember this, and return it
1779            below. */
1780         if (r == TEST_LEFT)
1781                 step_back = direction == DIRECTION_UP;
1782
1783         if (r == TEST_RIGHT) {
1784                 if (direction == DIRECTION_DOWN)
1785                         goto found;
1786                 else
1787                         return 0;
1788         }
1789
1790         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1791
1792         if (r == 0 && step_back)
1793                 goto found;
1794
1795         if (r > 0 && idx)
1796                 (*idx) ++;
1797
1798         return r;
1799
1800 found:
1801         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1802         if (r < 0)
1803                 return r;
1804
1805         if (ret)
1806                 *ret = o;
1807
1808         if (offset)
1809                 *offset = extra;
1810
1811         if (idx)
1812                 *idx = 0;
1813
1814         return 1;
1815 }
1816
1817 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1818         assert(f);
1819         assert(p > 0);
1820
1821         if (p == needle)
1822                 return TEST_FOUND;
1823         else if (p < needle)
1824                 return TEST_LEFT;
1825         else
1826                 return TEST_RIGHT;
1827 }
1828
1829 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1830         Object *o;
1831         int r;
1832
1833         assert(f);
1834         assert(p > 0);
1835
1836         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1837         if (r < 0)
1838                 return r;
1839
1840         if (le64toh(o->entry.seqnum) == needle)
1841                 return TEST_FOUND;
1842         else if (le64toh(o->entry.seqnum) < needle)
1843                 return TEST_LEFT;
1844         else
1845                 return TEST_RIGHT;
1846 }
1847
1848 int journal_file_move_to_entry_by_seqnum(
1849                 JournalFile *f,
1850                 uint64_t seqnum,
1851                 direction_t direction,
1852                 Object **ret,
1853                 uint64_t *offset) {
1854
1855         return generic_array_bisect(f,
1856                                     le64toh(f->header->entry_array_offset),
1857                                     le64toh(f->header->n_entries),
1858                                     seqnum,
1859                                     test_object_seqnum,
1860                                     direction,
1861                                     ret, offset, NULL);
1862 }
1863
1864 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1865         Object *o;
1866         int r;
1867
1868         assert(f);
1869         assert(p > 0);
1870
1871         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1872         if (r < 0)
1873                 return r;
1874
1875         if (le64toh(o->entry.realtime) == needle)
1876                 return TEST_FOUND;
1877         else if (le64toh(o->entry.realtime) < needle)
1878                 return TEST_LEFT;
1879         else
1880                 return TEST_RIGHT;
1881 }
1882
1883 int journal_file_move_to_entry_by_realtime(
1884                 JournalFile *f,
1885                 uint64_t realtime,
1886                 direction_t direction,
1887                 Object **ret,
1888                 uint64_t *offset) {
1889
1890         return generic_array_bisect(f,
1891                                     le64toh(f->header->entry_array_offset),
1892                                     le64toh(f->header->n_entries),
1893                                     realtime,
1894                                     test_object_realtime,
1895                                     direction,
1896                                     ret, offset, NULL);
1897 }
1898
1899 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1900         Object *o;
1901         int r;
1902
1903         assert(f);
1904         assert(p > 0);
1905
1906         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1907         if (r < 0)
1908                 return r;
1909
1910         if (le64toh(o->entry.monotonic) == needle)
1911                 return TEST_FOUND;
1912         else if (le64toh(o->entry.monotonic) < needle)
1913                 return TEST_LEFT;
1914         else
1915                 return TEST_RIGHT;
1916 }
1917
1918 static inline int find_data_object_by_boot_id(
1919                 JournalFile *f,
1920                 sd_id128_t boot_id,
1921                 Object **o,
1922                 uint64_t *b) {
1923         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1924
1925         sd_id128_to_string(boot_id, t + 9);
1926         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1927 }
1928
1929 int journal_file_move_to_entry_by_monotonic(
1930                 JournalFile *f,
1931                 sd_id128_t boot_id,
1932                 uint64_t monotonic,
1933                 direction_t direction,
1934                 Object **ret,
1935                 uint64_t *offset) {
1936
1937         Object *o;
1938         int r;
1939
1940         assert(f);
1941
1942         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1943         if (r < 0)
1944                 return r;
1945         if (r == 0)
1946                 return -ENOENT;
1947
1948         return generic_array_bisect_plus_one(f,
1949                                              le64toh(o->data.entry_offset),
1950                                              le64toh(o->data.entry_array_offset),
1951                                              le64toh(o->data.n_entries),
1952                                              monotonic,
1953                                              test_object_monotonic,
1954                                              direction,
1955                                              ret, offset, NULL);
1956 }
1957
1958 void journal_file_reset_location(JournalFile *f) {
1959         f->location_type = LOCATION_HEAD;
1960         f->current_offset = 0;
1961         f->current_seqnum = 0;
1962         f->current_realtime = 0;
1963         f->current_monotonic = 0;
1964         zero(f->current_boot_id);
1965         f->current_xor_hash = 0;
1966 }
1967
1968 void journal_file_save_location(JournalFile *f, direction_t direction, Object *o, uint64_t offset) {
1969         f->last_direction = direction;
1970         f->location_type = LOCATION_SEEK;
1971         f->current_offset = offset;
1972         f->current_seqnum = le64toh(o->entry.seqnum);
1973         f->current_realtime = le64toh(o->entry.realtime);
1974         f->current_monotonic = le64toh(o->entry.monotonic);
1975         f->current_boot_id = o->entry.boot_id;
1976         f->current_xor_hash = le64toh(o->entry.xor_hash);
1977 }
1978
1979 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
1980         assert(af);
1981         assert(bf);
1982         assert(af->location_type == LOCATION_SEEK);
1983         assert(bf->location_type == LOCATION_SEEK);
1984
1985         /* If contents and timestamps match, these entries are
1986          * identical, even if the seqnum does not match */
1987         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
1988             af->current_monotonic == bf->current_monotonic &&
1989             af->current_realtime == bf->current_realtime &&
1990             af->current_xor_hash == bf->current_xor_hash)
1991                 return 0;
1992
1993         if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
1994
1995                 /* If this is from the same seqnum source, compare
1996                  * seqnums */
1997                 if (af->current_seqnum < bf->current_seqnum)
1998                         return -1;
1999                 if (af->current_seqnum > bf->current_seqnum)
2000                         return 1;
2001
2002                 /* Wow! This is weird, different data but the same
2003                  * seqnums? Something is borked, but let's make the
2004                  * best of it and compare by time. */
2005         }
2006
2007         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2008
2009                 /* If the boot id matches, compare monotonic time */
2010                 if (af->current_monotonic < bf->current_monotonic)
2011                         return -1;
2012                 if (af->current_monotonic > bf->current_monotonic)
2013                         return 1;
2014         }
2015
2016         /* Otherwise, compare UTC time */
2017         if (af->current_realtime < bf->current_realtime)
2018                 return -1;
2019         if (af->current_realtime > bf->current_realtime)
2020                 return 1;
2021
2022         /* Finally, compare by contents */
2023         if (af->current_xor_hash < bf->current_xor_hash)
2024                 return -1;
2025         if (af->current_xor_hash > bf->current_xor_hash)
2026                 return 1;
2027
2028         return 0;
2029 }
2030
2031 int journal_file_next_entry(
2032                 JournalFile *f,
2033                 uint64_t p,
2034                 direction_t direction,
2035                 Object **ret, uint64_t *offset) {
2036
2037         uint64_t i, n, ofs;
2038         int r;
2039
2040         assert(f);
2041
2042         n = le64toh(f->header->n_entries);
2043         if (n <= 0)
2044                 return 0;
2045
2046         if (p == 0)
2047                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2048         else {
2049                 r = generic_array_bisect(f,
2050                                          le64toh(f->header->entry_array_offset),
2051                                          le64toh(f->header->n_entries),
2052                                          p,
2053                                          test_object_offset,
2054                                          DIRECTION_DOWN,
2055                                          NULL, NULL,
2056                                          &i);
2057                 if (r <= 0)
2058                         return r;
2059
2060                 if (direction == DIRECTION_DOWN) {
2061                         if (i >= n - 1)
2062                                 return 0;
2063
2064                         i++;
2065                 } else {
2066                         if (i <= 0)
2067                                 return 0;
2068
2069                         i--;
2070                 }
2071         }
2072
2073         /* And jump to it */
2074         r = generic_array_get(f,
2075                               le64toh(f->header->entry_array_offset),
2076                               i,
2077                               ret, &ofs);
2078         if (r <= 0)
2079                 return r;
2080
2081         if (p > 0 &&
2082             (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2083                 log_debug("%s: entry array corrupted at entry %"PRIu64,
2084                           f->path, i);
2085                 return -EBADMSG;
2086         }
2087
2088         if (offset)
2089                 *offset = ofs;
2090
2091         return 1;
2092 }
2093
2094 int journal_file_next_entry_for_data(
2095                 JournalFile *f,
2096                 Object *o, uint64_t p,
2097                 uint64_t data_offset,
2098                 direction_t direction,
2099                 Object **ret, uint64_t *offset) {
2100
2101         uint64_t n, i;
2102         int r;
2103         Object *d;
2104
2105         assert(f);
2106         assert(p > 0 || !o);
2107
2108         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2109         if (r < 0)
2110                 return r;
2111
2112         n = le64toh(d->data.n_entries);
2113         if (n <= 0)
2114                 return n;
2115
2116         if (!o)
2117                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2118         else {
2119                 if (o->object.type != OBJECT_ENTRY)
2120                         return -EINVAL;
2121
2122                 r = generic_array_bisect_plus_one(f,
2123                                                   le64toh(d->data.entry_offset),
2124                                                   le64toh(d->data.entry_array_offset),
2125                                                   le64toh(d->data.n_entries),
2126                                                   p,
2127                                                   test_object_offset,
2128                                                   DIRECTION_DOWN,
2129                                                   NULL, NULL,
2130                                                   &i);
2131
2132                 if (r <= 0)
2133                         return r;
2134
2135                 if (direction == DIRECTION_DOWN) {
2136                         if (i >= n - 1)
2137                                 return 0;
2138
2139                         i++;
2140                 } else {
2141                         if (i <= 0)
2142                                 return 0;
2143
2144                         i--;
2145                 }
2146
2147         }
2148
2149         return generic_array_get_plus_one(f,
2150                                           le64toh(d->data.entry_offset),
2151                                           le64toh(d->data.entry_array_offset),
2152                                           i,
2153                                           ret, offset);
2154 }
2155
2156 int journal_file_move_to_entry_by_offset_for_data(
2157                 JournalFile *f,
2158                 uint64_t data_offset,
2159                 uint64_t p,
2160                 direction_t direction,
2161                 Object **ret, uint64_t *offset) {
2162
2163         int r;
2164         Object *d;
2165
2166         assert(f);
2167
2168         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2169         if (r < 0)
2170                 return r;
2171
2172         return generic_array_bisect_plus_one(f,
2173                                              le64toh(d->data.entry_offset),
2174                                              le64toh(d->data.entry_array_offset),
2175                                              le64toh(d->data.n_entries),
2176                                              p,
2177                                              test_object_offset,
2178                                              direction,
2179                                              ret, offset, NULL);
2180 }
2181
2182 int journal_file_move_to_entry_by_monotonic_for_data(
2183                 JournalFile *f,
2184                 uint64_t data_offset,
2185                 sd_id128_t boot_id,
2186                 uint64_t monotonic,
2187                 direction_t direction,
2188                 Object **ret, uint64_t *offset) {
2189
2190         Object *o, *d;
2191         int r;
2192         uint64_t b, z;
2193
2194         assert(f);
2195
2196         /* First, seek by time */
2197         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2198         if (r < 0)
2199                 return r;
2200         if (r == 0)
2201                 return -ENOENT;
2202
2203         r = generic_array_bisect_plus_one(f,
2204                                           le64toh(o->data.entry_offset),
2205                                           le64toh(o->data.entry_array_offset),
2206                                           le64toh(o->data.n_entries),
2207                                           monotonic,
2208                                           test_object_monotonic,
2209                                           direction,
2210                                           NULL, &z, NULL);
2211         if (r <= 0)
2212                 return r;
2213
2214         /* And now, continue seeking until we find an entry that
2215          * exists in both bisection arrays */
2216
2217         for (;;) {
2218                 Object *qo;
2219                 uint64_t p, q;
2220
2221                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2222                 if (r < 0)
2223                         return r;
2224
2225                 r = generic_array_bisect_plus_one(f,
2226                                                   le64toh(d->data.entry_offset),
2227                                                   le64toh(d->data.entry_array_offset),
2228                                                   le64toh(d->data.n_entries),
2229                                                   z,
2230                                                   test_object_offset,
2231                                                   direction,
2232                                                   NULL, &p, NULL);
2233                 if (r <= 0)
2234                         return r;
2235
2236                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2237                 if (r < 0)
2238                         return r;
2239
2240                 r = generic_array_bisect_plus_one(f,
2241                                                   le64toh(o->data.entry_offset),
2242                                                   le64toh(o->data.entry_array_offset),
2243                                                   le64toh(o->data.n_entries),
2244                                                   p,
2245                                                   test_object_offset,
2246                                                   direction,
2247                                                   &qo, &q, NULL);
2248
2249                 if (r <= 0)
2250                         return r;
2251
2252                 if (p == q) {
2253                         if (ret)
2254                                 *ret = qo;
2255                         if (offset)
2256                                 *offset = q;
2257
2258                         return 1;
2259                 }
2260
2261                 z = q;
2262         }
2263 }
2264
2265 int journal_file_move_to_entry_by_seqnum_for_data(
2266                 JournalFile *f,
2267                 uint64_t data_offset,
2268                 uint64_t seqnum,
2269                 direction_t direction,
2270                 Object **ret, uint64_t *offset) {
2271
2272         Object *d;
2273         int r;
2274
2275         assert(f);
2276
2277         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2278         if (r < 0)
2279                 return r;
2280
2281         return generic_array_bisect_plus_one(f,
2282                                              le64toh(d->data.entry_offset),
2283                                              le64toh(d->data.entry_array_offset),
2284                                              le64toh(d->data.n_entries),
2285                                              seqnum,
2286                                              test_object_seqnum,
2287                                              direction,
2288                                              ret, offset, NULL);
2289 }
2290
2291 int journal_file_move_to_entry_by_realtime_for_data(
2292                 JournalFile *f,
2293                 uint64_t data_offset,
2294                 uint64_t realtime,
2295                 direction_t direction,
2296                 Object **ret, uint64_t *offset) {
2297
2298         Object *d;
2299         int r;
2300
2301         assert(f);
2302
2303         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2304         if (r < 0)
2305                 return r;
2306
2307         return generic_array_bisect_plus_one(f,
2308                                              le64toh(d->data.entry_offset),
2309                                              le64toh(d->data.entry_array_offset),
2310                                              le64toh(d->data.n_entries),
2311                                              realtime,
2312                                              test_object_realtime,
2313                                              direction,
2314                                              ret, offset, NULL);
2315 }
2316
2317 void journal_file_dump(JournalFile *f) {
2318         Object *o;
2319         int r;
2320         uint64_t p;
2321
2322         assert(f);
2323
2324         journal_file_print_header(f);
2325
2326         p = le64toh(f->header->header_size);
2327         while (p != 0) {
2328                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2329                 if (r < 0)
2330                         goto fail;
2331
2332                 switch (o->object.type) {
2333
2334                 case OBJECT_UNUSED:
2335                         printf("Type: OBJECT_UNUSED\n");
2336                         break;
2337
2338                 case OBJECT_DATA:
2339                         printf("Type: OBJECT_DATA\n");
2340                         break;
2341
2342                 case OBJECT_FIELD:
2343                         printf("Type: OBJECT_FIELD\n");
2344                         break;
2345
2346                 case OBJECT_ENTRY:
2347                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2348                                le64toh(o->entry.seqnum),
2349                                le64toh(o->entry.monotonic),
2350                                le64toh(o->entry.realtime));
2351                         break;
2352
2353                 case OBJECT_FIELD_HASH_TABLE:
2354                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2355                         break;
2356
2357                 case OBJECT_DATA_HASH_TABLE:
2358                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2359                         break;
2360
2361                 case OBJECT_ENTRY_ARRAY:
2362                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2363                         break;
2364
2365                 case OBJECT_TAG:
2366                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2367                                le64toh(o->tag.seqnum),
2368                                le64toh(o->tag.epoch));
2369                         break;
2370
2371                 default:
2372                         printf("Type: unknown (%u)\n", o->object.type);
2373                         break;
2374                 }
2375
2376                 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2377                         printf("Flags: %s\n",
2378                                object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2379
2380                 if (p == le64toh(f->header->tail_object_offset))
2381                         p = 0;
2382                 else
2383                         p = p + ALIGN64(le64toh(o->object.size));
2384         }
2385
2386         return;
2387 fail:
2388         log_error("File corrupt");
2389 }
2390
2391 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2392         const char *x;
2393
2394         x = format_timestamp(buf, l, t);
2395         if (x)
2396                 return x;
2397         return " --- ";
2398 }
2399
2400 void journal_file_print_header(JournalFile *f) {
2401         char a[33], b[33], c[33], d[33];
2402         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2403         struct stat st;
2404         char bytes[FORMAT_BYTES_MAX];
2405
2406         assert(f);
2407
2408         printf("File Path: %s\n"
2409                "File ID: %s\n"
2410                "Machine ID: %s\n"
2411                "Boot ID: %s\n"
2412                "Sequential Number ID: %s\n"
2413                "State: %s\n"
2414                "Compatible Flags:%s%s\n"
2415                "Incompatible Flags:%s%s%s\n"
2416                "Header size: %"PRIu64"\n"
2417                "Arena size: %"PRIu64"\n"
2418                "Data Hash Table Size: %"PRIu64"\n"
2419                "Field Hash Table Size: %"PRIu64"\n"
2420                "Rotate Suggested: %s\n"
2421                "Head Sequential Number: %"PRIu64"\n"
2422                "Tail Sequential Number: %"PRIu64"\n"
2423                "Head Realtime Timestamp: %s\n"
2424                "Tail Realtime Timestamp: %s\n"
2425                "Tail Monotonic Timestamp: %s\n"
2426                "Objects: %"PRIu64"\n"
2427                "Entry Objects: %"PRIu64"\n",
2428                f->path,
2429                sd_id128_to_string(f->header->file_id, a),
2430                sd_id128_to_string(f->header->machine_id, b),
2431                sd_id128_to_string(f->header->boot_id, c),
2432                sd_id128_to_string(f->header->seqnum_id, d),
2433                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2434                f->header->state == STATE_ONLINE ? "ONLINE" :
2435                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2436                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2437                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2438                JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2439                JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2440                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2441                le64toh(f->header->header_size),
2442                le64toh(f->header->arena_size),
2443                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2444                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2445                yes_no(journal_file_rotate_suggested(f, 0)),
2446                le64toh(f->header->head_entry_seqnum),
2447                le64toh(f->header->tail_entry_seqnum),
2448                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2449                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2450                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2451                le64toh(f->header->n_objects),
2452                le64toh(f->header->n_entries));
2453
2454         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2455                 printf("Data Objects: %"PRIu64"\n"
2456                        "Data Hash Table Fill: %.1f%%\n",
2457                        le64toh(f->header->n_data),
2458                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2459
2460         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2461                 printf("Field Objects: %"PRIu64"\n"
2462                        "Field Hash Table Fill: %.1f%%\n",
2463                        le64toh(f->header->n_fields),
2464                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2465
2466         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2467                 printf("Tag Objects: %"PRIu64"\n",
2468                        le64toh(f->header->n_tags));
2469         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2470                 printf("Entry Array Objects: %"PRIu64"\n",
2471                        le64toh(f->header->n_entry_arrays));
2472
2473         if (fstat(f->fd, &st) >= 0)
2474                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2475 }
2476
2477 int journal_file_open(
2478                 const char *fname,
2479                 int flags,
2480                 mode_t mode,
2481                 bool compress,
2482                 bool seal,
2483                 JournalMetrics *metrics,
2484                 MMapCache *mmap_cache,
2485                 JournalFile *template,
2486                 JournalFile **ret) {
2487
2488         bool newly_created = false;
2489         JournalFile *f;
2490         void *h;
2491         int r;
2492
2493         assert(fname);
2494         assert(ret);
2495
2496         if ((flags & O_ACCMODE) != O_RDONLY &&
2497             (flags & O_ACCMODE) != O_RDWR)
2498                 return -EINVAL;
2499
2500         if (!endswith(fname, ".journal") &&
2501             !endswith(fname, ".journal~"))
2502                 return -EINVAL;
2503
2504         f = new0(JournalFile, 1);
2505         if (!f)
2506                 return -ENOMEM;
2507
2508         f->fd = -1;
2509         f->mode = mode;
2510
2511         f->flags = flags;
2512         f->prot = prot_from_flags(flags);
2513         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2514 #if defined(HAVE_LZ4)
2515         f->compress_lz4 = compress;
2516 #elif defined(HAVE_XZ)
2517         f->compress_xz = compress;
2518 #endif
2519 #ifdef HAVE_GCRYPT
2520         f->seal = seal;
2521 #endif
2522
2523         if (mmap_cache)
2524                 f->mmap = mmap_cache_ref(mmap_cache);
2525         else {
2526                 f->mmap = mmap_cache_new();
2527                 if (!f->mmap) {
2528                         r = -ENOMEM;
2529                         goto fail;
2530                 }
2531         }
2532
2533         f->path = strdup(fname);
2534         if (!f->path) {
2535                 r = -ENOMEM;
2536                 goto fail;
2537         }
2538
2539         f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2540         if (!f->chain_cache) {
2541                 r = -ENOMEM;
2542                 goto fail;
2543         }
2544
2545         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2546         if (f->fd < 0) {
2547                 r = -errno;
2548                 goto fail;
2549         }
2550
2551         if (fstat(f->fd, &f->last_stat) < 0) {
2552                 r = -errno;
2553                 goto fail;
2554         }
2555
2556         if (f->last_stat.st_size == 0 && f->writable) {
2557                 /* Let's attach the creation time to the journal file,
2558                  * so that the vacuuming code knows the age of this
2559                  * file even if the file might end up corrupted one
2560                  * day... Ideally we'd just use the creation time many
2561                  * file systems maintain for each file, but there is
2562                  * currently no usable API to query this, hence let's
2563                  * emulate this via extended attributes. If extended
2564                  * attributes are not supported we'll just skip this,
2565                  * and rely solely on mtime/atime/ctime of the file. */
2566
2567                 fd_setcrtime(f->fd, now(CLOCK_REALTIME));
2568
2569 #ifdef HAVE_GCRYPT
2570                 /* Try to load the FSPRG state, and if we can't, then
2571                  * just don't do sealing */
2572                 if (f->seal) {
2573                         r = journal_file_fss_load(f);
2574                         if (r < 0)
2575                                 f->seal = false;
2576                 }
2577 #endif
2578
2579                 r = journal_file_init_header(f, template);
2580                 if (r < 0)
2581                         goto fail;
2582
2583                 if (fstat(f->fd, &f->last_stat) < 0) {
2584                         r = -errno;
2585                         goto fail;
2586                 }
2587
2588                 newly_created = true;
2589         }
2590
2591         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2592                 r = -EIO;
2593                 goto fail;
2594         }
2595
2596         r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2597         if (r < 0) {
2598                 r = -errno;
2599                 goto fail;
2600         }
2601
2602         f->header = h;
2603
2604         if (!newly_created) {
2605                 r = journal_file_verify_header(f);
2606                 if (r < 0)
2607                         goto fail;
2608         }
2609
2610 #ifdef HAVE_GCRYPT
2611         if (!newly_created && f->writable) {
2612                 r = journal_file_fss_load(f);
2613                 if (r < 0)
2614                         goto fail;
2615         }
2616 #endif
2617
2618         if (f->writable) {
2619                 if (metrics) {
2620                         journal_default_metrics(metrics, f->fd);
2621                         f->metrics = *metrics;
2622                 } else if (template)
2623                         f->metrics = template->metrics;
2624
2625                 r = journal_file_refresh_header(f);
2626                 if (r < 0)
2627                         goto fail;
2628         }
2629
2630 #ifdef HAVE_GCRYPT
2631         r = journal_file_hmac_setup(f);
2632         if (r < 0)
2633                 goto fail;
2634 #endif
2635
2636         if (newly_created) {
2637                 r = journal_file_setup_field_hash_table(f);
2638                 if (r < 0)
2639                         goto fail;
2640
2641                 r = journal_file_setup_data_hash_table(f);
2642                 if (r < 0)
2643                         goto fail;
2644
2645 #ifdef HAVE_GCRYPT
2646                 r = journal_file_append_first_tag(f);
2647                 if (r < 0)
2648                         goto fail;
2649 #endif
2650         }
2651
2652         r = journal_file_map_field_hash_table(f);
2653         if (r < 0)
2654                 goto fail;
2655
2656         r = journal_file_map_data_hash_table(f);
2657         if (r < 0)
2658                 goto fail;
2659
2660         if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2661                 r = -EIO;
2662                 goto fail;
2663         }
2664
2665         *ret = f;
2666         return 0;
2667
2668 fail:
2669         if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2670                 r = -EIO;
2671
2672         journal_file_close(f);
2673
2674         return r;
2675 }
2676
2677 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2678         _cleanup_free_ char *p = NULL;
2679         size_t l;
2680         JournalFile *old_file, *new_file = NULL;
2681         int r;
2682
2683         assert(f);
2684         assert(*f);
2685
2686         old_file = *f;
2687
2688         if (!old_file->writable)
2689                 return -EINVAL;
2690
2691         if (!endswith(old_file->path, ".journal"))
2692                 return -EINVAL;
2693
2694         l = strlen(old_file->path);
2695         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2696                      (int) l - 8, old_file->path,
2697                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2698                      le64toh((*f)->header->head_entry_seqnum),
2699                      le64toh((*f)->header->head_entry_realtime));
2700         if (r < 0)
2701                 return -ENOMEM;
2702
2703         r = rename(old_file->path, p);
2704         if (r < 0)
2705                 return -errno;
2706
2707         old_file->header->state = STATE_ARCHIVED;
2708
2709         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2710         journal_file_close(old_file);
2711
2712         *f = new_file;
2713         return r;
2714 }
2715
2716 int journal_file_open_reliably(
2717                 const char *fname,
2718                 int flags,
2719                 mode_t mode,
2720                 bool compress,
2721                 bool seal,
2722                 JournalMetrics *metrics,
2723                 MMapCache *mmap_cache,
2724                 JournalFile *template,
2725                 JournalFile **ret) {
2726
2727         int r;
2728         size_t l;
2729         _cleanup_free_ char *p = NULL;
2730
2731         r = journal_file_open(fname, flags, mode, compress, seal,
2732                               metrics, mmap_cache, template, ret);
2733         if (r != -EBADMSG && /* corrupted */
2734             r != -ENODATA && /* truncated */
2735             r != -EHOSTDOWN && /* other machine */
2736             r != -EPROTONOSUPPORT && /* incompatible feature */
2737             r != -EBUSY && /* unclean shutdown */
2738             r != -ESHUTDOWN && /* already archived */
2739             r != -EIO /* IO error, including SIGBUS on mmap */)
2740                 return r;
2741
2742         if ((flags & O_ACCMODE) == O_RDONLY)
2743                 return r;
2744
2745         if (!(flags & O_CREAT))
2746                 return r;
2747
2748         if (!endswith(fname, ".journal"))
2749                 return r;
2750
2751         /* The file is corrupted. Rotate it away and try it again (but only once) */
2752
2753         l = strlen(fname);
2754         if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2755                      (int) l - 8, fname,
2756                      (unsigned long long) now(CLOCK_REALTIME),
2757                      random_u64()) < 0)
2758                 return -ENOMEM;
2759
2760         r = rename(fname, p);
2761         if (r < 0)
2762                 return -errno;
2763
2764         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2765
2766         return journal_file_open(fname, flags, mode, compress, seal,
2767                                  metrics, mmap_cache, template, ret);
2768 }
2769
2770 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2771         uint64_t i, n;
2772         uint64_t q, xor_hash = 0;
2773         int r;
2774         EntryItem *items;
2775         dual_timestamp ts;
2776
2777         assert(from);
2778         assert(to);
2779         assert(o);
2780         assert(p);
2781
2782         if (!to->writable)
2783                 return -EPERM;
2784
2785         ts.monotonic = le64toh(o->entry.monotonic);
2786         ts.realtime = le64toh(o->entry.realtime);
2787
2788         n = journal_file_entry_n_items(o);
2789         /* alloca() can't take 0, hence let's allocate at least one */
2790         items = alloca(sizeof(EntryItem) * MAX(1u, n));
2791
2792         for (i = 0; i < n; i++) {
2793                 uint64_t l, h;
2794                 le64_t le_hash;
2795                 size_t t;
2796                 void *data;
2797                 Object *u;
2798
2799                 q = le64toh(o->entry.items[i].object_offset);
2800                 le_hash = o->entry.items[i].hash;
2801
2802                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2803                 if (r < 0)
2804                         return r;
2805
2806                 if (le_hash != o->data.hash)
2807                         return -EBADMSG;
2808
2809                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2810                 t = (size_t) l;
2811
2812                 /* We hit the limit on 32bit machines */
2813                 if ((uint64_t) t != l)
2814                         return -E2BIG;
2815
2816                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2817 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2818                         size_t rsize;
2819
2820                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2821                                             o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2822                         if (r < 0)
2823                                 return r;
2824
2825                         data = from->compress_buffer;
2826                         l = rsize;
2827 #else
2828                         return -EPROTONOSUPPORT;
2829 #endif
2830                 } else
2831                         data = o->data.payload;
2832
2833                 r = journal_file_append_data(to, data, l, &u, &h);
2834                 if (r < 0)
2835                         return r;
2836
2837                 xor_hash ^= le64toh(u->data.hash);
2838                 items[i].object_offset = htole64(h);
2839                 items[i].hash = u->data.hash;
2840
2841                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2842                 if (r < 0)
2843                         return r;
2844         }
2845
2846         r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2847
2848         if (mmap_cache_got_sigbus(to->mmap, to->fd))
2849                 return -EIO;
2850
2851         return r;
2852 }
2853
2854 void journal_default_metrics(JournalMetrics *m, int fd) {
2855         uint64_t fs_size = 0;
2856         struct statvfs ss;
2857         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2858
2859         assert(m);
2860         assert(fd >= 0);
2861
2862         if (fstatvfs(fd, &ss) >= 0)
2863                 fs_size = ss.f_frsize * ss.f_blocks;
2864
2865         if (m->max_use == (uint64_t) -1) {
2866
2867                 if (fs_size > 0) {
2868                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2869
2870                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2871                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2872
2873                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2874                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2875                 } else
2876                         m->max_use = DEFAULT_MAX_USE_LOWER;
2877         } else {
2878                 m->max_use = PAGE_ALIGN(m->max_use);
2879
2880                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2881                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2882         }
2883
2884         if (m->max_size == (uint64_t) -1) {
2885                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2886
2887                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2888                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2889         } else
2890                 m->max_size = PAGE_ALIGN(m->max_size);
2891
2892         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2893                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2894
2895         if (m->max_size*2 > m->max_use)
2896                 m->max_use = m->max_size*2;
2897
2898         if (m->min_size == (uint64_t) -1)
2899                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2900         else {
2901                 m->min_size = PAGE_ALIGN(m->min_size);
2902
2903                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2904                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2905
2906                 if (m->min_size > m->max_size)
2907                         m->max_size = m->min_size;
2908         }
2909
2910         if (m->keep_free == (uint64_t) -1) {
2911
2912                 if (fs_size > 0) {
2913                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2914
2915                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2916                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2917
2918                 } else
2919                         m->keep_free = DEFAULT_KEEP_FREE;
2920         }
2921
2922         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2923                   format_bytes(a, sizeof(a), m->max_use),
2924                   format_bytes(b, sizeof(b), m->max_size),
2925                   format_bytes(c, sizeof(c), m->min_size),
2926                   format_bytes(d, sizeof(d), m->keep_free));
2927 }
2928
2929 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2930         assert(f);
2931         assert(from || to);
2932
2933         if (from) {
2934                 if (f->header->head_entry_realtime == 0)
2935                         return -ENOENT;
2936
2937                 *from = le64toh(f->header->head_entry_realtime);
2938         }
2939
2940         if (to) {
2941                 if (f->header->tail_entry_realtime == 0)
2942                         return -ENOENT;
2943
2944                 *to = le64toh(f->header->tail_entry_realtime);
2945         }
2946
2947         return 1;
2948 }
2949
2950 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2951         Object *o;
2952         uint64_t p;
2953         int r;
2954
2955         assert(f);
2956         assert(from || to);
2957
2958         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2959         if (r <= 0)
2960                 return r;
2961
2962         if (le64toh(o->data.n_entries) <= 0)
2963                 return 0;
2964
2965         if (from) {
2966                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2967                 if (r < 0)
2968                         return r;
2969
2970                 *from = le64toh(o->entry.monotonic);
2971         }
2972
2973         if (to) {
2974                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2975                 if (r < 0)
2976                         return r;
2977
2978                 r = generic_array_get_plus_one(f,
2979                                                le64toh(o->data.entry_offset),
2980                                                le64toh(o->data.entry_array_offset),
2981                                                le64toh(o->data.n_entries)-1,
2982                                                &o, NULL);
2983                 if (r <= 0)
2984                         return r;
2985
2986                 *to = le64toh(o->entry.monotonic);
2987         }
2988
2989         return 1;
2990 }
2991
2992 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2993         assert(f);
2994
2995         /* If we gained new header fields we gained new features,
2996          * hence suggest a rotation */
2997         if (le64toh(f->header->header_size) < sizeof(Header)) {
2998                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2999                 return true;
3000         }
3001
3002         /* Let's check if the hash tables grew over a certain fill
3003          * level (75%, borrowing this value from Java's hash table
3004          * implementation), and if so suggest a rotation. To calculate
3005          * the fill level we need the n_data field, which only exists
3006          * in newer versions. */
3007
3008         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3009                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3010                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3011                                   f->path,
3012                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3013                                   le64toh(f->header->n_data),
3014                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3015                                   (unsigned long long) f->last_stat.st_size,
3016                                   f->last_stat.st_size / le64toh(f->header->n_data));
3017                         return true;
3018                 }
3019
3020         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3021                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3022                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3023                                   f->path,
3024                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3025                                   le64toh(f->header->n_fields),
3026                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3027                         return true;
3028                 }
3029
3030         /* Are the data objects properly indexed by field objects? */
3031         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3032             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3033             le64toh(f->header->n_data) > 0 &&
3034             le64toh(f->header->n_fields) == 0)
3035                 return true;
3036
3037         if (max_file_usec > 0) {
3038                 usec_t t, h;
3039
3040                 h = le64toh(f->header->head_entry_realtime);
3041                 t = now(CLOCK_REALTIME);
3042
3043                 if (h > 0 && t > h + max_file_usec)
3044                         return true;
3045         }
3046
3047         return false;
3048 }