chiark / gitweb /
journal: compare candidate entries using JournalFiles' locations
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29 #include <sys/xattr.h>
30
31 #include "journal-def.h"
32 #include "journal-file.h"
33 #include "journal-authenticate.h"
34 #include "lookup3.h"
35 #include "compress.h"
36 #include "fsprg.h"
37
/* Smallest size, in bytes, we ever pick for the data hash table */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
/* Fixed size, in bytes, of the field hash table */
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Data payloads of at least this many bytes are candidates for compression */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL)           /* 4 MiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the system
 * size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */

/* n_data was the first entry we added after the initial file format design */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20

/* How much to increase the journal file size at once each time we allocate something new. */
#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8 MiB */
70
71 static int journal_file_set_online(JournalFile *f) {
72         assert(f);
73
74         if (!f->writable)
75                 return -EPERM;
76
77         if (!(f->fd >= 0 && f->header))
78                 return -EINVAL;
79
80         switch(f->header->state) {
81                 case STATE_ONLINE:
82                         return 0;
83
84                 case STATE_OFFLINE:
85                         f->header->state = STATE_ONLINE;
86                         fsync(f->fd);
87                         return 0;
88
89                 default:
90                         return -EINVAL;
91         }
92 }
93
94 int journal_file_set_offline(JournalFile *f) {
95         assert(f);
96
97         if (!f->writable)
98                 return -EPERM;
99
100         if (!(f->fd >= 0 && f->header))
101                 return -EINVAL;
102
103         if (f->header->state != STATE_ONLINE)
104                 return 0;
105
106         fsync(f->fd);
107
108         f->header->state = STATE_OFFLINE;
109
110         fsync(f->fd);
111
112         return 0;
113 }
114
115 void journal_file_close(JournalFile *f) {
116         assert(f);
117
118 #ifdef HAVE_GCRYPT
119         /* Write the final tag */
120         if (f->seal && f->writable)
121                 journal_file_append_tag(f);
122 #endif
123
124         /* Sync everything to disk, before we mark the file offline */
125         if (f->mmap && f->fd >= 0)
126                 mmap_cache_close_fd(f->mmap, f->fd);
127
128         journal_file_set_offline(f);
129
130         if (f->header)
131                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
132
133         safe_close(f->fd);
134         free(f->path);
135
136         if (f->mmap)
137                 mmap_cache_unref(f->mmap);
138
139         ordered_hashmap_free_free(f->chain_cache);
140
141 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
142         free(f->compress_buffer);
143 #endif
144
145 #ifdef HAVE_GCRYPT
146         if (f->fss_file)
147                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
148         else if (f->fsprg_state)
149                 free(f->fsprg_state);
150
151         free(f->fsprg_seed);
152
153         if (f->hmac)
154                 gcry_md_close(f->hmac);
155 #endif
156
157         free(f);
158 }
159
160 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
161         Header h = {};
162         ssize_t k;
163         int r;
164
165         assert(f);
166
167         memcpy(h.signature, HEADER_SIGNATURE, 8);
168         h.header_size = htole64(ALIGN64(sizeof(h)));
169
170         h.incompatible_flags |= htole32(
171                 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
172                 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
173
174         h.compatible_flags = htole32(
175                 f->seal * HEADER_COMPATIBLE_SEALED);
176
177         r = sd_id128_randomize(&h.file_id);
178         if (r < 0)
179                 return r;
180
181         if (template) {
182                 h.seqnum_id = template->header->seqnum_id;
183                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
184         } else
185                 h.seqnum_id = h.file_id;
186
187         k = pwrite(f->fd, &h, sizeof(h), 0);
188         if (k < 0)
189                 return -errno;
190
191         if (k != sizeof(h))
192                 return -EIO;
193
194         return 0;
195 }
196
197 static int journal_file_refresh_header(JournalFile *f) {
198         int r;
199         sd_id128_t boot_id;
200
201         assert(f);
202
203         r = sd_id128_get_machine(&f->header->machine_id);
204         if (r < 0)
205                 return r;
206
207         r = sd_id128_get_boot(&boot_id);
208         if (r < 0)
209                 return r;
210
211         if (sd_id128_equal(boot_id, f->header->boot_id))
212                 f->tail_entry_monotonic_valid = true;
213
214         f->header->boot_id = boot_id;
215
216         journal_file_set_online(f);
217
218         /* Sync the online state to disk */
219         fsync(f->fd);
220
221         return 0;
222 }
223
224 static int journal_file_verify_header(JournalFile *f) {
225         uint32_t flags;
226
227         assert(f);
228
229         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
230                 return -EBADMSG;
231
232         /* In both read and write mode we refuse to open files with
233          * incompatible flags we don't know */
234         flags = le32toh(f->header->incompatible_flags);
235         if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
236                 if (flags & ~HEADER_INCOMPATIBLE_ANY)
237                         log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
238                                   f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
239                 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
240                 if (flags)
241                         log_debug("Journal file %s uses incompatible flags %"PRIx32
242                                   " disabled at compilation time.", f->path, flags);
243                 return -EPROTONOSUPPORT;
244         }
245
246         /* When open for writing we refuse to open files with
247          * compatible flags, too */
248         flags = le32toh(f->header->compatible_flags);
249         if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
250                 if (flags & ~HEADER_COMPATIBLE_ANY)
251                         log_debug("Journal file %s has unknown compatible flags %"PRIx32,
252                                   f->path, flags & ~HEADER_COMPATIBLE_ANY);
253                 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
254                 if (flags)
255                         log_debug("Journal file %s uses compatible flags %"PRIx32
256                                   " disabled at compilation time.", f->path, flags);
257                 return -EPROTONOSUPPORT;
258         }
259
260         if (f->header->state >= _STATE_MAX)
261                 return -EBADMSG;
262
263         /* The first addition was n_data, so check that we are at least this large */
264         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
265                 return -EBADMSG;
266
267         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
268                 return -EBADMSG;
269
270         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
271                 return -ENODATA;
272
273         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
274                 return -ENODATA;
275
276         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
277             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
278             !VALID64(le64toh(f->header->tail_object_offset)) ||
279             !VALID64(le64toh(f->header->entry_array_offset)))
280                 return -ENODATA;
281
282         if (f->writable) {
283                 uint8_t state;
284                 sd_id128_t machine_id;
285                 int r;
286
287                 r = sd_id128_get_machine(&machine_id);
288                 if (r < 0)
289                         return r;
290
291                 if (!sd_id128_equal(machine_id, f->header->machine_id))
292                         return -EHOSTDOWN;
293
294                 state = f->header->state;
295
296                 if (state == STATE_ONLINE) {
297                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
298                         return -EBUSY;
299                 } else if (state == STATE_ARCHIVED)
300                         return -ESHUTDOWN;
301                 else if (state != STATE_OFFLINE) {
302                         log_debug("Journal file %s has unknown state %u.", f->path, state);
303                         return -EBUSY;
304                 }
305         }
306
307         f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
308         f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
309
310         f->seal = JOURNAL_HEADER_SEALED(f->header);
311
312         return 0;
313 }
314
315 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
316         uint64_t old_size, new_size;
317         int r;
318
319         assert(f);
320
321         /* We assume that this file is not sparse, and we know that
322          * for sure, since we always call posix_fallocate()
323          * ourselves */
324
325         old_size =
326                 le64toh(f->header->header_size) +
327                 le64toh(f->header->arena_size);
328
329         new_size = PAGE_ALIGN(offset + size);
330         if (new_size < le64toh(f->header->header_size))
331                 new_size = le64toh(f->header->header_size);
332
333         if (new_size <= old_size)
334                 return 0;
335
336         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
337                 return -E2BIG;
338
339         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
340                 struct statvfs svfs;
341
342                 if (fstatvfs(f->fd, &svfs) >= 0) {
343                         uint64_t available;
344
345                         available = svfs.f_bfree * svfs.f_bsize;
346
347                         if (available >= f->metrics.keep_free)
348                                 available -= f->metrics.keep_free;
349                         else
350                                 available = 0;
351
352                         if (new_size - old_size > available)
353                                 return -E2BIG;
354                 }
355         }
356
357         /* Increase by larger blocks at once */
358         new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
359         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
360                 new_size = f->metrics.max_size;
361
362         /* Note that the glibc fallocate() fallback is very
363            inefficient, hence we try to minimize the allocation area
364            as we can. */
365         r = posix_fallocate(f->fd, old_size, new_size - old_size);
366         if (r != 0)
367                 return -r;
368
369         if (fstat(f->fd, &f->last_stat) < 0)
370                 return -errno;
371
372         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
373
374         return 0;
375 }
376
377 static unsigned type_to_context(ObjectType type) {
378         /* One context for each type, plus one catch-all for the rest */
379         assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
380         return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
381 }
382
383 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
384         assert(f);
385         assert(ret);
386
387         if (size <= 0)
388                 return -EINVAL;
389
390         /* Avoid SIGBUS on invalid accesses */
391         if (offset + size > (uint64_t) f->last_stat.st_size) {
392                 /* Hmm, out of range? Let's refresh the fstat() data
393                  * first, before we trust that check. */
394
395                 if (fstat(f->fd, &f->last_stat) < 0 ||
396                     offset + size > (uint64_t) f->last_stat.st_size)
397                         return -EADDRNOTAVAIL;
398         }
399
400         return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
401 }
402
403 static uint64_t minimum_header_size(Object *o) {
404
405         static const uint64_t table[] = {
406                 [OBJECT_DATA] = sizeof(DataObject),
407                 [OBJECT_FIELD] = sizeof(FieldObject),
408                 [OBJECT_ENTRY] = sizeof(EntryObject),
409                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
410                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
411                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
412                 [OBJECT_TAG] = sizeof(TagObject),
413         };
414
415         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
416                 return sizeof(ObjectHeader);
417
418         return table[o->object.type];
419 }
420
421 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
422         int r;
423         void *t;
424         Object *o;
425         uint64_t s;
426
427         assert(f);
428         assert(ret);
429
430         /* Objects may only be located at multiple of 64 bit */
431         if (!VALID64(offset))
432                 return -EFAULT;
433
434         r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
435         if (r < 0)
436                 return r;
437
438         o = (Object*) t;
439         s = le64toh(o->object.size);
440
441         if (s < sizeof(ObjectHeader))
442                 return -EBADMSG;
443
444         if (o->object.type <= OBJECT_UNUSED)
445                 return -EBADMSG;
446
447         if (s < minimum_header_size(o))
448                 return -EBADMSG;
449
450         if (type > OBJECT_UNUSED && o->object.type != type)
451                 return -EBADMSG;
452
453         if (s > sizeof(ObjectHeader)) {
454                 r = journal_file_move_to(f, type, false, offset, s, &t);
455                 if (r < 0)
456                         return r;
457
458                 o = (Object*) t;
459         }
460
461         *ret = o;
462         return 0;
463 }
464
465 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
466         uint64_t r;
467
468         assert(f);
469
470         r = le64toh(f->header->tail_entry_seqnum) + 1;
471
472         if (seqnum) {
473                 /* If an external seqnum counter was passed, we update
474                  * both the local and the external one, and set it to
475                  * the maximum of both */
476
477                 if (*seqnum + 1 > r)
478                         r = *seqnum + 1;
479
480                 *seqnum = r;
481         }
482
483         f->header->tail_entry_seqnum = htole64(r);
484
485         if (f->header->head_entry_seqnum == 0)
486                 f->header->head_entry_seqnum = htole64(r);
487
488         return r;
489 }
490
491 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
492         int r;
493         uint64_t p;
494         Object *tail, *o;
495         void *t;
496
497         assert(f);
498         assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
499         assert(size >= sizeof(ObjectHeader));
500         assert(offset);
501         assert(ret);
502
503         r = journal_file_set_online(f);
504         if (r < 0)
505                 return r;
506
507         p = le64toh(f->header->tail_object_offset);
508         if (p == 0)
509                 p = le64toh(f->header->header_size);
510         else {
511                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
512                 if (r < 0)
513                         return r;
514
515                 p += ALIGN64(le64toh(tail->object.size));
516         }
517
518         r = journal_file_allocate(f, p, size);
519         if (r < 0)
520                 return r;
521
522         r = journal_file_move_to(f, type, false, p, size, &t);
523         if (r < 0)
524                 return r;
525
526         o = (Object*) t;
527
528         zero(o->object);
529         o->object.type = type;
530         o->object.size = htole64(size);
531
532         f->header->tail_object_offset = htole64(p);
533         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
534
535         *ret = o;
536         *offset = p;
537
538         return 0;
539 }
540
541 static int journal_file_setup_data_hash_table(JournalFile *f) {
542         uint64_t s, p;
543         Object *o;
544         int r;
545
546         assert(f);
547
548         /* We estimate that we need 1 hash table entry per 768 of
549            journal file and we want to make sure we never get beyond
550            75% fill level. Calculate the hash table size for the
551            maximum file size based on these metrics. */
552
553         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
554         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
555                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
556
557         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
558
559         r = journal_file_append_object(f,
560                                        OBJECT_DATA_HASH_TABLE,
561                                        offsetof(Object, hash_table.items) + s,
562                                        &o, &p);
563         if (r < 0)
564                 return r;
565
566         memzero(o->hash_table.items, s);
567
568         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
569         f->header->data_hash_table_size = htole64(s);
570
571         return 0;
572 }
573
574 static int journal_file_setup_field_hash_table(JournalFile *f) {
575         uint64_t s, p;
576         Object *o;
577         int r;
578
579         assert(f);
580
581         /* We use a fixed size hash table for the fields as this
582          * number should grow very slowly only */
583
584         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
585         r = journal_file_append_object(f,
586                                        OBJECT_FIELD_HASH_TABLE,
587                                        offsetof(Object, hash_table.items) + s,
588                                        &o, &p);
589         if (r < 0)
590                 return r;
591
592         memzero(o->hash_table.items, s);
593
594         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
595         f->header->field_hash_table_size = htole64(s);
596
597         return 0;
598 }
599
600 static int journal_file_map_data_hash_table(JournalFile *f) {
601         uint64_t s, p;
602         void *t;
603         int r;
604
605         assert(f);
606
607         p = le64toh(f->header->data_hash_table_offset);
608         s = le64toh(f->header->data_hash_table_size);
609
610         r = journal_file_move_to(f,
611                                  OBJECT_DATA_HASH_TABLE,
612                                  true,
613                                  p, s,
614                                  &t);
615         if (r < 0)
616                 return r;
617
618         f->data_hash_table = t;
619         return 0;
620 }
621
622 static int journal_file_map_field_hash_table(JournalFile *f) {
623         uint64_t s, p;
624         void *t;
625         int r;
626
627         assert(f);
628
629         p = le64toh(f->header->field_hash_table_offset);
630         s = le64toh(f->header->field_hash_table_size);
631
632         r = journal_file_move_to(f,
633                                  OBJECT_FIELD_HASH_TABLE,
634                                  true,
635                                  p, s,
636                                  &t);
637         if (r < 0)
638                 return r;
639
640         f->field_hash_table = t;
641         return 0;
642 }
643
644 static int journal_file_link_field(
645                 JournalFile *f,
646                 Object *o,
647                 uint64_t offset,
648                 uint64_t hash) {
649
650         uint64_t p, h;
651         int r;
652
653         assert(f);
654         assert(o);
655         assert(offset > 0);
656
657         if (o->object.type != OBJECT_FIELD)
658                 return -EINVAL;
659
660         /* This might alter the window we are looking at */
661
662         o->field.next_hash_offset = o->field.head_data_offset = 0;
663
664         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
665         p = le64toh(f->field_hash_table[h].tail_hash_offset);
666         if (p == 0)
667                 f->field_hash_table[h].head_hash_offset = htole64(offset);
668         else {
669                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
670                 if (r < 0)
671                         return r;
672
673                 o->field.next_hash_offset = htole64(offset);
674         }
675
676         f->field_hash_table[h].tail_hash_offset = htole64(offset);
677
678         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
679                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
680
681         return 0;
682 }
683
684 static int journal_file_link_data(
685                 JournalFile *f,
686                 Object *o,
687                 uint64_t offset,
688                 uint64_t hash) {
689
690         uint64_t p, h;
691         int r;
692
693         assert(f);
694         assert(o);
695         assert(offset > 0);
696
697         if (o->object.type != OBJECT_DATA)
698                 return -EINVAL;
699
700         /* This might alter the window we are looking at */
701
702         o->data.next_hash_offset = o->data.next_field_offset = 0;
703         o->data.entry_offset = o->data.entry_array_offset = 0;
704         o->data.n_entries = 0;
705
706         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
707         p = le64toh(f->data_hash_table[h].tail_hash_offset);
708         if (p == 0)
709                 /* Only entry in the hash table is easy */
710                 f->data_hash_table[h].head_hash_offset = htole64(offset);
711         else {
712                 /* Move back to the previous data object, to patch in
713                  * pointer */
714
715                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
716                 if (r < 0)
717                         return r;
718
719                 o->data.next_hash_offset = htole64(offset);
720         }
721
722         f->data_hash_table[h].tail_hash_offset = htole64(offset);
723
724         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
725                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
726
727         return 0;
728 }
729
730 int journal_file_find_field_object_with_hash(
731                 JournalFile *f,
732                 const void *field, uint64_t size, uint64_t hash,
733                 Object **ret, uint64_t *offset) {
734
735         uint64_t p, osize, h;
736         int r;
737
738         assert(f);
739         assert(field && size > 0);
740
741         osize = offsetof(Object, field.payload) + size;
742
743         if (f->header->field_hash_table_size == 0)
744                 return -EBADMSG;
745
746         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
747         p = le64toh(f->field_hash_table[h].head_hash_offset);
748
749         while (p > 0) {
750                 Object *o;
751
752                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
753                 if (r < 0)
754                         return r;
755
756                 if (le64toh(o->field.hash) == hash &&
757                     le64toh(o->object.size) == osize &&
758                     memcmp(o->field.payload, field, size) == 0) {
759
760                         if (ret)
761                                 *ret = o;
762                         if (offset)
763                                 *offset = p;
764
765                         return 1;
766                 }
767
768                 p = le64toh(o->field.next_hash_offset);
769         }
770
771         return 0;
772 }
773
774 int journal_file_find_field_object(
775                 JournalFile *f,
776                 const void *field, uint64_t size,
777                 Object **ret, uint64_t *offset) {
778
779         uint64_t hash;
780
781         assert(f);
782         assert(field && size > 0);
783
784         hash = hash64(field, size);
785
786         return journal_file_find_field_object_with_hash(f,
787                                                         field, size, hash,
788                                                         ret, offset);
789 }
790
791 int journal_file_find_data_object_with_hash(
792                 JournalFile *f,
793                 const void *data, uint64_t size, uint64_t hash,
794                 Object **ret, uint64_t *offset) {
795
796         uint64_t p, osize, h;
797         int r;
798
799         assert(f);
800         assert(data || size == 0);
801
802         osize = offsetof(Object, data.payload) + size;
803
804         if (f->header->data_hash_table_size == 0)
805                 return -EBADMSG;
806
807         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
808         p = le64toh(f->data_hash_table[h].head_hash_offset);
809
810         while (p > 0) {
811                 Object *o;
812
813                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
814                 if (r < 0)
815                         return r;
816
817                 if (le64toh(o->data.hash) != hash)
818                         goto next;
819
820                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
821 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
822                         uint64_t l;
823                         size_t rsize;
824
825                         l = le64toh(o->object.size);
826                         if (l <= offsetof(Object, data.payload))
827                                 return -EBADMSG;
828
829                         l -= offsetof(Object, data.payload);
830
831                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
832                                             o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
833                         if (r < 0)
834                                 return r;
835
836                         if (rsize == size &&
837                             memcmp(f->compress_buffer, data, size) == 0) {
838
839                                 if (ret)
840                                         *ret = o;
841
842                                 if (offset)
843                                         *offset = p;
844
845                                 return 1;
846                         }
847 #else
848                         return -EPROTONOSUPPORT;
849 #endif
850                 } else if (le64toh(o->object.size) == osize &&
851                            memcmp(o->data.payload, data, size) == 0) {
852
853                         if (ret)
854                                 *ret = o;
855
856                         if (offset)
857                                 *offset = p;
858
859                         return 1;
860                 }
861
862         next:
863                 p = le64toh(o->data.next_hash_offset);
864         }
865
866         return 0;
867 }
868
869 int journal_file_find_data_object(
870                 JournalFile *f,
871                 const void *data, uint64_t size,
872                 Object **ret, uint64_t *offset) {
873
874         uint64_t hash;
875
876         assert(f);
877         assert(data || size == 0);
878
879         hash = hash64(data, size);
880
881         return journal_file_find_data_object_with_hash(f,
882                                                        data, size, hash,
883                                                        ret, offset);
884 }
885
886 static int journal_file_append_field(
887                 JournalFile *f,
888                 const void *field, uint64_t size,
889                 Object **ret, uint64_t *offset) {
890
891         uint64_t hash, p;
892         uint64_t osize;
893         Object *o;
894         int r;
895
896         assert(f);
897         assert(field && size > 0);
898
899         hash = hash64(field, size);
900
901         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
902         if (r < 0)
903                 return r;
904         else if (r > 0) {
905
906                 if (ret)
907                         *ret = o;
908
909                 if (offset)
910                         *offset = p;
911
912                 return 0;
913         }
914
915         osize = offsetof(Object, field.payload) + size;
916         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
917         if (r < 0)
918                 return r;
919
920         o->field.hash = htole64(hash);
921         memcpy(o->field.payload, field, size);
922
923         r = journal_file_link_field(f, o, p, hash);
924         if (r < 0)
925                 return r;
926
927         /* The linking might have altered the window, so let's
928          * refresh our pointer */
929         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
930         if (r < 0)
931                 return r;
932
933 #ifdef HAVE_GCRYPT
934         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
935         if (r < 0)
936                 return r;
937 #endif
938
939         if (ret)
940                 *ret = o;
941
942         if (offset)
943                 *offset = p;
944
945         return 0;
946 }
947
948 static int journal_file_append_data(
949                 JournalFile *f,
950                 const void *data, uint64_t size,
951                 Object **ret, uint64_t *offset) {
952
953         uint64_t hash, p;
954         uint64_t osize;
955         Object *o;
956         int r, compression = 0;
957         const void *eq;
958
959         assert(f);
960         assert(data || size == 0);
961
962         hash = hash64(data, size);
963
964         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
965         if (r < 0)
966                 return r;
967         else if (r > 0) {
968
969                 if (ret)
970                         *ret = o;
971
972                 if (offset)
973                         *offset = p;
974
975                 return 0;
976         }
977
978         osize = offsetof(Object, data.payload) + size;
979         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
980         if (r < 0)
981                 return r;
982
983         o->data.hash = htole64(hash);
984
985 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
986         if (f->compress_xz &&
987             size >= COMPRESSION_SIZE_THRESHOLD) {
988                 size_t rsize;
989
990                 compression = compress_blob(data, size, o->data.payload, &rsize);
991
992                 if (compression) {
993                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
994                         o->object.flags |= compression;
995
996                         log_debug("Compressed data object %"PRIu64" -> %zu using %s",
997                                   size, rsize, object_compressed_to_string(compression));
998                 }
999         }
1000 #endif
1001
1002         if (!compression && size > 0)
1003                 memcpy(o->data.payload, data, size);
1004
1005         r = journal_file_link_data(f, o, p, hash);
1006         if (r < 0)
1007                 return r;
1008
1009         /* The linking might have altered the window, so let's
1010          * refresh our pointer */
1011         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1012         if (r < 0)
1013                 return r;
1014
1015         if (!data)
1016                 eq = NULL;
1017         else
1018                 eq = memchr(data, '=', size);
1019         if (eq && eq > data) {
1020                 Object *fo = NULL;
1021                 uint64_t fp;
1022
1023                 /* Create field object ... */
1024                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1025                 if (r < 0)
1026                         return r;
1027
1028                 /* ... and link it in. */
1029                 o->data.next_field_offset = fo->field.head_data_offset;
1030                 fo->field.head_data_offset = le64toh(p);
1031         }
1032
1033 #ifdef HAVE_GCRYPT
1034         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1035         if (r < 0)
1036                 return r;
1037 #endif
1038
1039         if (ret)
1040                 *ret = o;
1041
1042         if (offset)
1043                 *offset = p;
1044
1045         return 0;
1046 }
1047
1048 uint64_t journal_file_entry_n_items(Object *o) {
1049         assert(o);
1050
1051         if (o->object.type != OBJECT_ENTRY)
1052                 return 0;
1053
1054         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1055 }
1056
1057 uint64_t journal_file_entry_array_n_items(Object *o) {
1058         assert(o);
1059
1060         if (o->object.type != OBJECT_ENTRY_ARRAY)
1061                 return 0;
1062
1063         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1064 }
1065
1066 uint64_t journal_file_hash_table_n_items(Object *o) {
1067         assert(o);
1068
1069         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1070             o->object.type != OBJECT_FIELD_HASH_TABLE)
1071                 return 0;
1072
1073         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1074 }
1075
1076 static int link_entry_into_array(JournalFile *f,
1077                                  le64_t *first,
1078                                  le64_t *idx,
1079                                  uint64_t p) {
1080         int r;
1081         uint64_t n = 0, ap = 0, q, i, a, hidx;
1082         Object *o;
1083
1084         assert(f);
1085         assert(first);
1086         assert(idx);
1087         assert(p > 0);
1088
1089         a = le64toh(*first);
1090         i = hidx = le64toh(*idx);
1091         while (a > 0) {
1092
1093                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1094                 if (r < 0)
1095                         return r;
1096
1097                 n = journal_file_entry_array_n_items(o);
1098                 if (i < n) {
1099                         o->entry_array.items[i] = htole64(p);
1100                         *idx = htole64(hidx + 1);
1101                         return 0;
1102                 }
1103
1104                 i -= n;
1105                 ap = a;
1106                 a = le64toh(o->entry_array.next_entry_array_offset);
1107         }
1108
1109         if (hidx > n)
1110                 n = (hidx+1) * 2;
1111         else
1112                 n = n * 2;
1113
1114         if (n < 4)
1115                 n = 4;
1116
1117         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1118                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1119                                        &o, &q);
1120         if (r < 0)
1121                 return r;
1122
1123 #ifdef HAVE_GCRYPT
1124         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1125         if (r < 0)
1126                 return r;
1127 #endif
1128
1129         o->entry_array.items[i] = htole64(p);
1130
1131         if (ap == 0)
1132                 *first = htole64(q);
1133         else {
1134                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1135                 if (r < 0)
1136                         return r;
1137
1138                 o->entry_array.next_entry_array_offset = htole64(q);
1139         }
1140
1141         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1142                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1143
1144         *idx = htole64(hidx + 1);
1145
1146         return 0;
1147 }
1148
1149 static int link_entry_into_array_plus_one(JournalFile *f,
1150                                           le64_t *extra,
1151                                           le64_t *first,
1152                                           le64_t *idx,
1153                                           uint64_t p) {
1154
1155         int r;
1156
1157         assert(f);
1158         assert(extra);
1159         assert(first);
1160         assert(idx);
1161         assert(p > 0);
1162
1163         if (*idx == 0)
1164                 *extra = htole64(p);
1165         else {
1166                 le64_t i;
1167
1168                 i = htole64(le64toh(*idx) - 1);
1169                 r = link_entry_into_array(f, first, &i, p);
1170                 if (r < 0)
1171                         return r;
1172         }
1173
1174         *idx = htole64(le64toh(*idx) + 1);
1175         return 0;
1176 }
1177
1178 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1179         uint64_t p;
1180         int r;
1181         assert(f);
1182         assert(o);
1183         assert(offset > 0);
1184
1185         p = le64toh(o->entry.items[i].object_offset);
1186         if (p == 0)
1187                 return -EINVAL;
1188
1189         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1190         if (r < 0)
1191                 return r;
1192
1193         return link_entry_into_array_plus_one(f,
1194                                               &o->data.entry_offset,
1195                                               &o->data.entry_array_offset,
1196                                               &o->data.n_entries,
1197                                               offset);
1198 }
1199
1200 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1201         uint64_t n, i;
1202         int r;
1203
1204         assert(f);
1205         assert(o);
1206         assert(offset > 0);
1207
1208         if (o->object.type != OBJECT_ENTRY)
1209                 return -EINVAL;
1210
1211         __sync_synchronize();
1212
1213         /* Link up the entry itself */
1214         r = link_entry_into_array(f,
1215                                   &f->header->entry_array_offset,
1216                                   &f->header->n_entries,
1217                                   offset);
1218         if (r < 0)
1219                 return r;
1220
1221         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1222
1223         if (f->header->head_entry_realtime == 0)
1224                 f->header->head_entry_realtime = o->entry.realtime;
1225
1226         f->header->tail_entry_realtime = o->entry.realtime;
1227         f->header->tail_entry_monotonic = o->entry.monotonic;
1228
1229         f->tail_entry_monotonic_valid = true;
1230
1231         /* Link up the items */
1232         n = journal_file_entry_n_items(o);
1233         for (i = 0; i < n; i++) {
1234                 r = journal_file_link_entry_item(f, o, offset, i);
1235                 if (r < 0)
1236                         return r;
1237         }
1238
1239         return 0;
1240 }
1241
1242 static int journal_file_append_entry_internal(
1243                 JournalFile *f,
1244                 const dual_timestamp *ts,
1245                 uint64_t xor_hash,
1246                 const EntryItem items[], unsigned n_items,
1247                 uint64_t *seqnum,
1248                 Object **ret, uint64_t *offset) {
1249         uint64_t np;
1250         uint64_t osize;
1251         Object *o;
1252         int r;
1253
1254         assert(f);
1255         assert(items || n_items == 0);
1256         assert(ts);
1257
1258         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1259
1260         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1261         if (r < 0)
1262                 return r;
1263
1264         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1265         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1266         o->entry.realtime = htole64(ts->realtime);
1267         o->entry.monotonic = htole64(ts->monotonic);
1268         o->entry.xor_hash = htole64(xor_hash);
1269         o->entry.boot_id = f->header->boot_id;
1270
1271 #ifdef HAVE_GCRYPT
1272         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1273         if (r < 0)
1274                 return r;
1275 #endif
1276
1277         r = journal_file_link_entry(f, o, np);
1278         if (r < 0)
1279                 return r;
1280
1281         if (ret)
1282                 *ret = o;
1283
1284         if (offset)
1285                 *offset = np;
1286
1287         return 0;
1288 }
1289
1290 void journal_file_post_change(JournalFile *f) {
1291         assert(f);
1292
1293         /* inotify() does not receive IN_MODIFY events from file
1294          * accesses done via mmap(). After each access we hence
1295          * trigger IN_MODIFY by truncating the journal file to its
1296          * current size which triggers IN_MODIFY. */
1297
1298         __sync_synchronize();
1299
1300         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1301                 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1302 }
1303
1304 static int entry_item_cmp(const void *_a, const void *_b) {
1305         const EntryItem *a = _a, *b = _b;
1306
1307         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1308                 return -1;
1309         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1310                 return 1;
1311         return 0;
1312 }
1313
1314 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1315         unsigned i;
1316         EntryItem *items;
1317         int r;
1318         uint64_t xor_hash = 0;
1319         struct dual_timestamp _ts;
1320
1321         assert(f);
1322         assert(iovec || n_iovec == 0);
1323
1324         if (!ts) {
1325                 dual_timestamp_get(&_ts);
1326                 ts = &_ts;
1327         }
1328
1329         if (f->tail_entry_monotonic_valid &&
1330             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1331                 return -EINVAL;
1332
1333 #ifdef HAVE_GCRYPT
1334         r = journal_file_maybe_append_tag(f, ts->realtime);
1335         if (r < 0)
1336                 return r;
1337 #endif
1338
1339         /* alloca() can't take 0, hence let's allocate at least one */
1340         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1341
1342         for (i = 0; i < n_iovec; i++) {
1343                 uint64_t p;
1344                 Object *o;
1345
1346                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1347                 if (r < 0)
1348                         return r;
1349
1350                 xor_hash ^= le64toh(o->data.hash);
1351                 items[i].object_offset = htole64(p);
1352                 items[i].hash = o->data.hash;
1353         }
1354
1355         /* Order by the position on disk, in order to improve seek
1356          * times for rotating media. */
1357         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1358
1359         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1360
1361         journal_file_post_change(f);
1362
1363         return r;
1364 }
1365
/* One cached position within a chain of entry arrays, remembered so that
 * later lookups in the same chain can skip ahead. */
typedef struct ChainCacheItem {
        /* the array at the beginning of the chain */
        uint64_t first;

        /* the cached array */
        uint64_t array;

        /* the first item in the cached array */
        uint64_t begin;

        /* the total number of items in all arrays before this one in the chain */
        uint64_t total;

        /* the last index we looked at, to optimize locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
1373
1374 static void chain_cache_put(
1375                 OrderedHashmap *h,
1376                 ChainCacheItem *ci,
1377                 uint64_t first,
1378                 uint64_t array,
1379                 uint64_t begin,
1380                 uint64_t total,
1381                 uint64_t last_index) {
1382
1383         if (!ci) {
1384                 /* If the chain item to cache for this chain is the
1385                  * first one it's not worth caching anything */
1386                 if (array == first)
1387                         return;
1388
1389                 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1390                         ci = ordered_hashmap_steal_first(h);
1391                         assert(ci);
1392                 } else {
1393                         ci = new(ChainCacheItem, 1);
1394                         if (!ci)
1395                                 return;
1396                 }
1397
1398                 ci->first = first;
1399
1400                 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1401                         free(ci);
1402                         return;
1403                 }
1404         } else
1405                 assert(ci->first == first);
1406
1407         ci->array = array;
1408         ci->begin = begin;
1409         ci->total = total;
1410         ci->last_index = last_index;
1411 }
1412
1413 static int generic_array_get(
1414                 JournalFile *f,
1415                 uint64_t first,
1416                 uint64_t i,
1417                 Object **ret, uint64_t *offset) {
1418
1419         Object *o;
1420         uint64_t p = 0, a, t = 0;
1421         int r;
1422         ChainCacheItem *ci;
1423
1424         assert(f);
1425
1426         a = first;
1427
1428         /* Try the chain cache first */
1429         ci = ordered_hashmap_get(f->chain_cache, &first);
1430         if (ci && i > ci->total) {
1431                 a = ci->array;
1432                 i -= ci->total;
1433                 t = ci->total;
1434         }
1435
1436         while (a > 0) {
1437                 uint64_t k;
1438
1439                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1440                 if (r < 0)
1441                         return r;
1442
1443                 k = journal_file_entry_array_n_items(o);
1444                 if (i < k) {
1445                         p = le64toh(o->entry_array.items[i]);
1446                         goto found;
1447                 }
1448
1449                 i -= k;
1450                 t += k;
1451                 a = le64toh(o->entry_array.next_entry_array_offset);
1452         }
1453
1454         return 0;
1455
1456 found:
1457         /* Let's cache this item for the next invocation */
1458         chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1459
1460         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1461         if (r < 0)
1462                 return r;
1463
1464         if (ret)
1465                 *ret = o;
1466
1467         if (offset)
1468                 *offset = p;
1469
1470         return 1;
1471 }
1472
1473 static int generic_array_get_plus_one(
1474                 JournalFile *f,
1475                 uint64_t extra,
1476                 uint64_t first,
1477                 uint64_t i,
1478                 Object **ret, uint64_t *offset) {
1479
1480         Object *o;
1481
1482         assert(f);
1483
1484         if (i == 0) {
1485                 int r;
1486
1487                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1488                 if (r < 0)
1489                         return r;
1490
1491                 if (ret)
1492                         *ret = o;
1493
1494                 if (offset)
1495                         *offset = extra;
1496
1497                 return 1;
1498         }
1499
1500         return generic_array_get(f, first, i-1, ret, offset);
1501 }
1502
/* Results of the test_object callbacks used when bisecting */
enum {
        /* the probed item matches the needle */
        TEST_FOUND = 0,

        /* the probed item is smaller than the needle */
        TEST_LEFT = 1,

        /* the probed item is larger than the needle */
        TEST_RIGHT = 2
};
1508
1509 static int generic_array_bisect(
1510                 JournalFile *f,
1511                 uint64_t first,
1512                 uint64_t n,
1513                 uint64_t needle,
1514                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1515                 direction_t direction,
1516                 Object **ret,
1517                 uint64_t *offset,
1518                 uint64_t *idx) {
1519
1520         uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1521         bool subtract_one = false;
1522         Object *o, *array = NULL;
1523         int r;
1524         ChainCacheItem *ci;
1525
1526         assert(f);
1527         assert(test_object);
1528
1529         /* Start with the first array in the chain */
1530         a = first;
1531
1532         ci = ordered_hashmap_get(f->chain_cache, &first);
1533         if (ci && n > ci->total) {
1534                 /* Ah, we have iterated this bisection array chain
1535                  * previously! Let's see if we can skip ahead in the
1536                  * chain, as far as the last time. But we can't jump
1537                  * backwards in the chain, so let's check that
1538                  * first. */
1539
1540                 r = test_object(f, ci->begin, needle);
1541                 if (r < 0)
1542                         return r;
1543
1544                 if (r == TEST_LEFT) {
1545                         /* OK, what we are looking for is right of the
1546                          * begin of this EntryArray, so let's jump
1547                          * straight to previously cached array in the
1548                          * chain */
1549
1550                         a = ci->array;
1551                         n -= ci->total;
1552                         t = ci->total;
1553                         last_index = ci->last_index;
1554                 }
1555         }
1556
1557         while (a > 0) {
1558                 uint64_t left, right, k, lp;
1559
1560                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1561                 if (r < 0)
1562                         return r;
1563
1564                 k = journal_file_entry_array_n_items(array);
1565                 right = MIN(k, n);
1566                 if (right <= 0)
1567                         return 0;
1568
1569                 i = right - 1;
1570                 lp = p = le64toh(array->entry_array.items[i]);
1571                 if (p <= 0)
1572                         return -EBADMSG;
1573
1574                 r = test_object(f, p, needle);
1575                 if (r < 0)
1576                         return r;
1577
1578                 if (r == TEST_FOUND)
1579                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1580
1581                 if (r == TEST_RIGHT) {
1582                         left = 0;
1583                         right -= 1;
1584
1585                         if (last_index != (uint64_t) -1) {
1586                                 assert(last_index <= right);
1587
1588                                 /* If we cached the last index we
1589                                  * looked at, let's try to not to jump
1590                                  * too wildly around and see if we can
1591                                  * limit the range to look at early to
1592                                  * the immediate neighbors of the last
1593                                  * index we looked at. */
1594
1595                                 if (last_index > 0) {
1596                                         uint64_t x = last_index - 1;
1597
1598                                         p = le64toh(array->entry_array.items[x]);
1599                                         if (p <= 0)
1600                                                 return -EBADMSG;
1601
1602                                         r = test_object(f, p, needle);
1603                                         if (r < 0)
1604                                                 return r;
1605
1606                                         if (r == TEST_FOUND)
1607                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1608
1609                                         if (r == TEST_RIGHT)
1610                                                 right = x;
1611                                         else
1612                                                 left = x + 1;
1613                                 }
1614
1615                                 if (last_index < right) {
1616                                         uint64_t y = last_index + 1;
1617
1618                                         p = le64toh(array->entry_array.items[y]);
1619                                         if (p <= 0)
1620                                                 return -EBADMSG;
1621
1622                                         r = test_object(f, p, needle);
1623                                         if (r < 0)
1624                                                 return r;
1625
1626                                         if (r == TEST_FOUND)
1627                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1628
1629                                         if (r == TEST_RIGHT)
1630                                                 right = y;
1631                                         else
1632                                                 left = y + 1;
1633                                 }
1634                         }
1635
1636                         for (;;) {
1637                                 if (left == right) {
1638                                         if (direction == DIRECTION_UP)
1639                                                 subtract_one = true;
1640
1641                                         i = left;
1642                                         goto found;
1643                                 }
1644
1645                                 assert(left < right);
1646                                 i = (left + right) / 2;
1647
1648                                 p = le64toh(array->entry_array.items[i]);
1649                                 if (p <= 0)
1650                                         return -EBADMSG;
1651
1652                                 r = test_object(f, p, needle);
1653                                 if (r < 0)
1654                                         return r;
1655
1656                                 if (r == TEST_FOUND)
1657                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1658
1659                                 if (r == TEST_RIGHT)
1660                                         right = i;
1661                                 else
1662                                         left = i + 1;
1663                         }
1664                 }
1665
1666                 if (k >= n) {
1667                         if (direction == DIRECTION_UP) {
1668                                 i = n;
1669                                 subtract_one = true;
1670                                 goto found;
1671                         }
1672
1673                         return 0;
1674                 }
1675
1676                 last_p = lp;
1677
1678                 n -= k;
1679                 t += k;
1680                 last_index = (uint64_t) -1;
1681                 a = le64toh(array->entry_array.next_entry_array_offset);
1682         }
1683
1684         return 0;
1685
1686 found:
1687         if (subtract_one && t == 0 && i == 0)
1688                 return 0;
1689
1690         /* Let's cache this item for the next invocation */
1691         chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1692
1693         if (subtract_one && i == 0)
1694                 p = last_p;
1695         else if (subtract_one)
1696                 p = le64toh(array->entry_array.items[i-1]);
1697         else
1698                 p = le64toh(array->entry_array.items[i]);
1699
1700         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1701         if (r < 0)
1702                 return r;
1703
1704         if (ret)
1705                 *ret = o;
1706
1707         if (offset)
1708                 *offset = p;
1709
1710         if (idx)
1711                 *idx = t + i + (subtract_one ? -1 : 0);
1712
1713         return 1;
1714 }
1715
1716
1717 static int generic_array_bisect_plus_one(
1718                 JournalFile *f,
1719                 uint64_t extra,
1720                 uint64_t first,
1721                 uint64_t n,
1722                 uint64_t needle,
1723                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1724                 direction_t direction,
1725                 Object **ret,
1726                 uint64_t *offset,
1727                 uint64_t *idx) {
1728
1729         int r;
1730         bool step_back = false;
1731         Object *o;
1732
1733         assert(f);
1734         assert(test_object);
1735
1736         if (n <= 0)
1737                 return 0;
1738
1739         /* This bisects the array in object 'first', but first checks
1740          * an extra  */
1741         r = test_object(f, extra, needle);
1742         if (r < 0)
1743                 return r;
1744
1745         if (r == TEST_FOUND)
1746                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1747
1748         /* if we are looking with DIRECTION_UP then we need to first
1749            see if in the actual array there is a matching entry, and
1750            return the last one of that. But if there isn't any we need
1751            to return this one. Hence remember this, and return it
1752            below. */
1753         if (r == TEST_LEFT)
1754                 step_back = direction == DIRECTION_UP;
1755
1756         if (r == TEST_RIGHT) {
1757                 if (direction == DIRECTION_DOWN)
1758                         goto found;
1759                 else
1760                         return 0;
1761         }
1762
1763         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1764
1765         if (r == 0 && step_back)
1766                 goto found;
1767
1768         if (r > 0 && idx)
1769                 (*idx) ++;
1770
1771         return r;
1772
1773 found:
1774         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1775         if (r < 0)
1776                 return r;
1777
1778         if (ret)
1779                 *ret = o;
1780
1781         if (offset)
1782                 *offset = extra;
1783
1784         if (idx)
1785                 *idx = 0;
1786
1787         return 1;
1788 }
1789
1790 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1791         assert(f);
1792         assert(p > 0);
1793
1794         if (p == needle)
1795                 return TEST_FOUND;
1796         else if (p < needle)
1797                 return TEST_LEFT;
1798         else
1799                 return TEST_RIGHT;
1800 }
1801
1802 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1803         Object *o;
1804         int r;
1805
1806         assert(f);
1807         assert(p > 0);
1808
1809         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1810         if (r < 0)
1811                 return r;
1812
1813         if (le64toh(o->entry.seqnum) == needle)
1814                 return TEST_FOUND;
1815         else if (le64toh(o->entry.seqnum) < needle)
1816                 return TEST_LEFT;
1817         else
1818                 return TEST_RIGHT;
1819 }
1820
1821 int journal_file_move_to_entry_by_seqnum(
1822                 JournalFile *f,
1823                 uint64_t seqnum,
1824                 direction_t direction,
1825                 Object **ret,
1826                 uint64_t *offset) {
1827
1828         return generic_array_bisect(f,
1829                                     le64toh(f->header->entry_array_offset),
1830                                     le64toh(f->header->n_entries),
1831                                     seqnum,
1832                                     test_object_seqnum,
1833                                     direction,
1834                                     ret, offset, NULL);
1835 }
1836
1837 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1838         Object *o;
1839         int r;
1840
1841         assert(f);
1842         assert(p > 0);
1843
1844         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1845         if (r < 0)
1846                 return r;
1847
1848         if (le64toh(o->entry.realtime) == needle)
1849                 return TEST_FOUND;
1850         else if (le64toh(o->entry.realtime) < needle)
1851                 return TEST_LEFT;
1852         else
1853                 return TEST_RIGHT;
1854 }
1855
1856 int journal_file_move_to_entry_by_realtime(
1857                 JournalFile *f,
1858                 uint64_t realtime,
1859                 direction_t direction,
1860                 Object **ret,
1861                 uint64_t *offset) {
1862
1863         return generic_array_bisect(f,
1864                                     le64toh(f->header->entry_array_offset),
1865                                     le64toh(f->header->n_entries),
1866                                     realtime,
1867                                     test_object_realtime,
1868                                     direction,
1869                                     ret, offset, NULL);
1870 }
1871
1872 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1873         Object *o;
1874         int r;
1875
1876         assert(f);
1877         assert(p > 0);
1878
1879         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1880         if (r < 0)
1881                 return r;
1882
1883         if (le64toh(o->entry.monotonic) == needle)
1884                 return TEST_FOUND;
1885         else if (le64toh(o->entry.monotonic) < needle)
1886                 return TEST_LEFT;
1887         else
1888                 return TEST_RIGHT;
1889 }
1890
1891 static inline int find_data_object_by_boot_id(
1892                 JournalFile *f,
1893                 sd_id128_t boot_id,
1894                 Object **o,
1895                 uint64_t *b) {
1896         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1897
1898         sd_id128_to_string(boot_id, t + 9);
1899         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1900 }
1901
1902 int journal_file_move_to_entry_by_monotonic(
1903                 JournalFile *f,
1904                 sd_id128_t boot_id,
1905                 uint64_t monotonic,
1906                 direction_t direction,
1907                 Object **ret,
1908                 uint64_t *offset) {
1909
1910         Object *o;
1911         int r;
1912
1913         assert(f);
1914
1915         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1916         if (r < 0)
1917                 return r;
1918         if (r == 0)
1919                 return -ENOENT;
1920
1921         return generic_array_bisect_plus_one(f,
1922                                              le64toh(o->data.entry_offset),
1923                                              le64toh(o->data.entry_array_offset),
1924                                              le64toh(o->data.n_entries),
1925                                              monotonic,
1926                                              test_object_monotonic,
1927                                              direction,
1928                                              ret, offset, NULL);
1929 }
1930
1931 void journal_file_reset_location(JournalFile *f) {
1932         f->location_type = LOCATION_HEAD;
1933         f->current_offset = 0;
1934         f->current_seqnum = 0;
1935         f->current_realtime = 0;
1936         f->current_monotonic = 0;
1937         zero(f->current_boot_id);
1938         f->current_xor_hash = 0;
1939 }
1940
1941 void journal_file_save_location(JournalFile *f, direction_t direction, Object *o, uint64_t offset) {
1942         f->last_direction = direction;
1943         f->location_type = LOCATION_SEEK;
1944         f->current_offset = offset;
1945         f->current_seqnum = le64toh(o->entry.seqnum);
1946         f->current_realtime = le64toh(o->entry.realtime);
1947         f->current_monotonic = le64toh(o->entry.monotonic);
1948         f->current_boot_id = o->entry.boot_id;
1949         f->current_xor_hash = le64toh(o->entry.xor_hash);
1950 }
1951
1952 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
1953         assert(af);
1954         assert(bf);
1955         assert(af->location_type == LOCATION_SEEK);
1956         assert(bf->location_type == LOCATION_SEEK);
1957
1958         /* If contents and timestamps match, these entries are
1959          * identical, even if the seqnum does not match */
1960         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
1961             af->current_monotonic == bf->current_monotonic &&
1962             af->current_realtime == bf->current_realtime &&
1963             af->current_xor_hash == bf->current_xor_hash)
1964                 return 0;
1965
1966         if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
1967
1968                 /* If this is from the same seqnum source, compare
1969                  * seqnums */
1970                 if (af->current_seqnum < bf->current_seqnum)
1971                         return -1;
1972                 if (af->current_seqnum > bf->current_seqnum)
1973                         return 1;
1974
1975                 /* Wow! This is weird, different data but the same
1976                  * seqnums? Something is borked, but let's make the
1977                  * best of it and compare by time. */
1978         }
1979
1980         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
1981
1982                 /* If the boot id matches, compare monotonic time */
1983                 if (af->current_monotonic < bf->current_monotonic)
1984                         return -1;
1985                 if (af->current_monotonic > bf->current_monotonic)
1986                         return 1;
1987         }
1988
1989         /* Otherwise, compare UTC time */
1990         if (af->current_realtime < bf->current_realtime)
1991                 return -1;
1992         if (af->current_realtime > bf->current_realtime)
1993                 return 1;
1994
1995         /* Finally, compare by contents */
1996         if (af->current_xor_hash < bf->current_xor_hash)
1997                 return -1;
1998         if (af->current_xor_hash > bf->current_xor_hash)
1999                 return 1;
2000
2001         return 0;
2002 }
2003
2004 int journal_file_next_entry(
2005                 JournalFile *f,
2006                 Object *o, uint64_t p,
2007                 direction_t direction,
2008                 Object **ret, uint64_t *offset) {
2009
2010         uint64_t i, n, ofs;
2011         int r;
2012
2013         assert(f);
2014         assert(p > 0 || !o);
2015
2016         n = le64toh(f->header->n_entries);
2017         if (n <= 0)
2018                 return 0;
2019
2020         if (!o)
2021                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2022         else {
2023                 if (o->object.type != OBJECT_ENTRY)
2024                         return -EINVAL;
2025
2026                 r = generic_array_bisect(f,
2027                                          le64toh(f->header->entry_array_offset),
2028                                          le64toh(f->header->n_entries),
2029                                          p,
2030                                          test_object_offset,
2031                                          DIRECTION_DOWN,
2032                                          NULL, NULL,
2033                                          &i);
2034                 if (r <= 0)
2035                         return r;
2036
2037                 if (direction == DIRECTION_DOWN) {
2038                         if (i >= n - 1)
2039                                 return 0;
2040
2041                         i++;
2042                 } else {
2043                         if (i <= 0)
2044                                 return 0;
2045
2046                         i--;
2047                 }
2048         }
2049
2050         /* And jump to it */
2051         r = generic_array_get(f,
2052                               le64toh(f->header->entry_array_offset),
2053                               i,
2054                               ret, &ofs);
2055         if (r <= 0)
2056                 return r;
2057
2058         if (p > 0 &&
2059             (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2060                 log_debug("%s: entry array corrupted at entry %"PRIu64,
2061                           f->path, i);
2062                 return -EBADMSG;
2063         }
2064
2065         if (offset)
2066                 *offset = ofs;
2067
2068         return 1;
2069 }
2070
2071 int journal_file_next_entry_for_data(
2072                 JournalFile *f,
2073                 Object *o, uint64_t p,
2074                 uint64_t data_offset,
2075                 direction_t direction,
2076                 Object **ret, uint64_t *offset) {
2077
2078         uint64_t n, i;
2079         int r;
2080         Object *d;
2081
2082         assert(f);
2083         assert(p > 0 || !o);
2084
2085         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2086         if (r < 0)
2087                 return r;
2088
2089         n = le64toh(d->data.n_entries);
2090         if (n <= 0)
2091                 return n;
2092
2093         if (!o)
2094                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2095         else {
2096                 if (o->object.type != OBJECT_ENTRY)
2097                         return -EINVAL;
2098
2099                 r = generic_array_bisect_plus_one(f,
2100                                                   le64toh(d->data.entry_offset),
2101                                                   le64toh(d->data.entry_array_offset),
2102                                                   le64toh(d->data.n_entries),
2103                                                   p,
2104                                                   test_object_offset,
2105                                                   DIRECTION_DOWN,
2106                                                   NULL, NULL,
2107                                                   &i);
2108
2109                 if (r <= 0)
2110                         return r;
2111
2112                 if (direction == DIRECTION_DOWN) {
2113                         if (i >= n - 1)
2114                                 return 0;
2115
2116                         i++;
2117                 } else {
2118                         if (i <= 0)
2119                                 return 0;
2120
2121                         i--;
2122                 }
2123
2124         }
2125
2126         return generic_array_get_plus_one(f,
2127                                           le64toh(d->data.entry_offset),
2128                                           le64toh(d->data.entry_array_offset),
2129                                           i,
2130                                           ret, offset);
2131 }
2132
2133 int journal_file_move_to_entry_by_offset_for_data(
2134                 JournalFile *f,
2135                 uint64_t data_offset,
2136                 uint64_t p,
2137                 direction_t direction,
2138                 Object **ret, uint64_t *offset) {
2139
2140         int r;
2141         Object *d;
2142
2143         assert(f);
2144
2145         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2146         if (r < 0)
2147                 return r;
2148
2149         return generic_array_bisect_plus_one(f,
2150                                              le64toh(d->data.entry_offset),
2151                                              le64toh(d->data.entry_array_offset),
2152                                              le64toh(d->data.n_entries),
2153                                              p,
2154                                              test_object_offset,
2155                                              direction,
2156                                              ret, offset, NULL);
2157 }
2158
2159 int journal_file_move_to_entry_by_monotonic_for_data(
2160                 JournalFile *f,
2161                 uint64_t data_offset,
2162                 sd_id128_t boot_id,
2163                 uint64_t monotonic,
2164                 direction_t direction,
2165                 Object **ret, uint64_t *offset) {
2166
2167         Object *o, *d;
2168         int r;
2169         uint64_t b, z;
2170
2171         assert(f);
2172
2173         /* First, seek by time */
2174         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2175         if (r < 0)
2176                 return r;
2177         if (r == 0)
2178                 return -ENOENT;
2179
2180         r = generic_array_bisect_plus_one(f,
2181                                           le64toh(o->data.entry_offset),
2182                                           le64toh(o->data.entry_array_offset),
2183                                           le64toh(o->data.n_entries),
2184                                           monotonic,
2185                                           test_object_monotonic,
2186                                           direction,
2187                                           NULL, &z, NULL);
2188         if (r <= 0)
2189                 return r;
2190
2191         /* And now, continue seeking until we find an entry that
2192          * exists in both bisection arrays */
2193
2194         for (;;) {
2195                 Object *qo;
2196                 uint64_t p, q;
2197
2198                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2199                 if (r < 0)
2200                         return r;
2201
2202                 r = generic_array_bisect_plus_one(f,
2203                                                   le64toh(d->data.entry_offset),
2204                                                   le64toh(d->data.entry_array_offset),
2205                                                   le64toh(d->data.n_entries),
2206                                                   z,
2207                                                   test_object_offset,
2208                                                   direction,
2209                                                   NULL, &p, NULL);
2210                 if (r <= 0)
2211                         return r;
2212
2213                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2214                 if (r < 0)
2215                         return r;
2216
2217                 r = generic_array_bisect_plus_one(f,
2218                                                   le64toh(o->data.entry_offset),
2219                                                   le64toh(o->data.entry_array_offset),
2220                                                   le64toh(o->data.n_entries),
2221                                                   p,
2222                                                   test_object_offset,
2223                                                   direction,
2224                                                   &qo, &q, NULL);
2225
2226                 if (r <= 0)
2227                         return r;
2228
2229                 if (p == q) {
2230                         if (ret)
2231                                 *ret = qo;
2232                         if (offset)
2233                                 *offset = q;
2234
2235                         return 1;
2236                 }
2237
2238                 z = q;
2239         }
2240 }
2241
2242 int journal_file_move_to_entry_by_seqnum_for_data(
2243                 JournalFile *f,
2244                 uint64_t data_offset,
2245                 uint64_t seqnum,
2246                 direction_t direction,
2247                 Object **ret, uint64_t *offset) {
2248
2249         Object *d;
2250         int r;
2251
2252         assert(f);
2253
2254         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2255         if (r < 0)
2256                 return r;
2257
2258         return generic_array_bisect_plus_one(f,
2259                                              le64toh(d->data.entry_offset),
2260                                              le64toh(d->data.entry_array_offset),
2261                                              le64toh(d->data.n_entries),
2262                                              seqnum,
2263                                              test_object_seqnum,
2264                                              direction,
2265                                              ret, offset, NULL);
2266 }
2267
2268 int journal_file_move_to_entry_by_realtime_for_data(
2269                 JournalFile *f,
2270                 uint64_t data_offset,
2271                 uint64_t realtime,
2272                 direction_t direction,
2273                 Object **ret, uint64_t *offset) {
2274
2275         Object *d;
2276         int r;
2277
2278         assert(f);
2279
2280         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2281         if (r < 0)
2282                 return r;
2283
2284         return generic_array_bisect_plus_one(f,
2285                                              le64toh(d->data.entry_offset),
2286                                              le64toh(d->data.entry_array_offset),
2287                                              le64toh(d->data.n_entries),
2288                                              realtime,
2289                                              test_object_realtime,
2290                                              direction,
2291                                              ret, offset, NULL);
2292 }
2293
2294 void journal_file_dump(JournalFile *f) {
2295         Object *o;
2296         int r;
2297         uint64_t p;
2298
2299         assert(f);
2300
2301         journal_file_print_header(f);
2302
2303         p = le64toh(f->header->header_size);
2304         while (p != 0) {
2305                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2306                 if (r < 0)
2307                         goto fail;
2308
2309                 switch (o->object.type) {
2310
2311                 case OBJECT_UNUSED:
2312                         printf("Type: OBJECT_UNUSED\n");
2313                         break;
2314
2315                 case OBJECT_DATA:
2316                         printf("Type: OBJECT_DATA\n");
2317                         break;
2318
2319                 case OBJECT_FIELD:
2320                         printf("Type: OBJECT_FIELD\n");
2321                         break;
2322
2323                 case OBJECT_ENTRY:
2324                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2325                                le64toh(o->entry.seqnum),
2326                                le64toh(o->entry.monotonic),
2327                                le64toh(o->entry.realtime));
2328                         break;
2329
2330                 case OBJECT_FIELD_HASH_TABLE:
2331                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2332                         break;
2333
2334                 case OBJECT_DATA_HASH_TABLE:
2335                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2336                         break;
2337
2338                 case OBJECT_ENTRY_ARRAY:
2339                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2340                         break;
2341
2342                 case OBJECT_TAG:
2343                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2344                                le64toh(o->tag.seqnum),
2345                                le64toh(o->tag.epoch));
2346                         break;
2347
2348                 default:
2349                         printf("Type: unknown (%u)\n", o->object.type);
2350                         break;
2351                 }
2352
2353                 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2354                         printf("Flags: %s\n",
2355                                object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2356
2357                 if (p == le64toh(f->header->tail_object_offset))
2358                         p = 0;
2359                 else
2360                         p = p + ALIGN64(le64toh(o->object.size));
2361         }
2362
2363         return;
2364 fail:
2365         log_error("File corrupt");
2366 }
2367
2368 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2369         const char *x;
2370
2371         x = format_timestamp(buf, l, t);
2372         if (x)
2373                 return x;
2374         return " --- ";
2375 }
2376
2377 void journal_file_print_header(JournalFile *f) {
2378         char a[33], b[33], c[33], d[33];
2379         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2380         struct stat st;
2381         char bytes[FORMAT_BYTES_MAX];
2382
2383         assert(f);
2384
2385         printf("File Path: %s\n"
2386                "File ID: %s\n"
2387                "Machine ID: %s\n"
2388                "Boot ID: %s\n"
2389                "Sequential Number ID: %s\n"
2390                "State: %s\n"
2391                "Compatible Flags:%s%s\n"
2392                "Incompatible Flags:%s%s%s\n"
2393                "Header size: %"PRIu64"\n"
2394                "Arena size: %"PRIu64"\n"
2395                "Data Hash Table Size: %"PRIu64"\n"
2396                "Field Hash Table Size: %"PRIu64"\n"
2397                "Rotate Suggested: %s\n"
2398                "Head Sequential Number: %"PRIu64"\n"
2399                "Tail Sequential Number: %"PRIu64"\n"
2400                "Head Realtime Timestamp: %s\n"
2401                "Tail Realtime Timestamp: %s\n"
2402                "Tail Monotonic Timestamp: %s\n"
2403                "Objects: %"PRIu64"\n"
2404                "Entry Objects: %"PRIu64"\n",
2405                f->path,
2406                sd_id128_to_string(f->header->file_id, a),
2407                sd_id128_to_string(f->header->machine_id, b),
2408                sd_id128_to_string(f->header->boot_id, c),
2409                sd_id128_to_string(f->header->seqnum_id, d),
2410                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2411                f->header->state == STATE_ONLINE ? "ONLINE" :
2412                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2413                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2414                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2415                JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2416                JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2417                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2418                le64toh(f->header->header_size),
2419                le64toh(f->header->arena_size),
2420                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2421                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2422                yes_no(journal_file_rotate_suggested(f, 0)),
2423                le64toh(f->header->head_entry_seqnum),
2424                le64toh(f->header->tail_entry_seqnum),
2425                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2426                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2427                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2428                le64toh(f->header->n_objects),
2429                le64toh(f->header->n_entries));
2430
2431         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2432                 printf("Data Objects: %"PRIu64"\n"
2433                        "Data Hash Table Fill: %.1f%%\n",
2434                        le64toh(f->header->n_data),
2435                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2436
2437         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2438                 printf("Field Objects: %"PRIu64"\n"
2439                        "Field Hash Table Fill: %.1f%%\n",
2440                        le64toh(f->header->n_fields),
2441                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2442
2443         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2444                 printf("Tag Objects: %"PRIu64"\n",
2445                        le64toh(f->header->n_tags));
2446         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2447                 printf("Entry Array Objects: %"PRIu64"\n",
2448                        le64toh(f->header->n_entry_arrays));
2449
2450         if (fstat(f->fd, &st) >= 0)
2451                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2452 }
2453
2454 int journal_file_open(
2455                 const char *fname,
2456                 int flags,
2457                 mode_t mode,
2458                 bool compress,
2459                 bool seal,
2460                 JournalMetrics *metrics,
2461                 MMapCache *mmap_cache,
2462                 JournalFile *template,
2463                 JournalFile **ret) {
2464
2465         JournalFile *f;
2466         int r;
2467         bool newly_created = false;
2468
2469         assert(fname);
2470         assert(ret);
2471
2472         if ((flags & O_ACCMODE) != O_RDONLY &&
2473             (flags & O_ACCMODE) != O_RDWR)
2474                 return -EINVAL;
2475
2476         if (!endswith(fname, ".journal") &&
2477             !endswith(fname, ".journal~"))
2478                 return -EINVAL;
2479
2480         f = new0(JournalFile, 1);
2481         if (!f)
2482                 return -ENOMEM;
2483
2484         f->fd = -1;
2485         f->mode = mode;
2486
2487         f->flags = flags;
2488         f->prot = prot_from_flags(flags);
2489         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2490 #if defined(HAVE_LZ4)
2491         f->compress_lz4 = compress;
2492 #elif defined(HAVE_XZ)
2493         f->compress_xz = compress;
2494 #endif
2495 #ifdef HAVE_GCRYPT
2496         f->seal = seal;
2497 #endif
2498
2499         if (mmap_cache)
2500                 f->mmap = mmap_cache_ref(mmap_cache);
2501         else {
2502                 f->mmap = mmap_cache_new();
2503                 if (!f->mmap) {
2504                         r = -ENOMEM;
2505                         goto fail;
2506                 }
2507         }
2508
2509         f->path = strdup(fname);
2510         if (!f->path) {
2511                 r = -ENOMEM;
2512                 goto fail;
2513         }
2514
2515         f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2516         if (!f->chain_cache) {
2517                 r = -ENOMEM;
2518                 goto fail;
2519         }
2520
2521         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2522         if (f->fd < 0) {
2523                 r = -errno;
2524                 goto fail;
2525         }
2526
2527         if (fstat(f->fd, &f->last_stat) < 0) {
2528                 r = -errno;
2529                 goto fail;
2530         }
2531
2532         if (f->last_stat.st_size == 0 && f->writable) {
2533                 uint64_t crtime;
2534
2535                 /* Let's attach the creation time to the journal file,
2536                  * so that the vacuuming code knows the age of this
2537                  * file even if the file might end up corrupted one
2538                  * day... Ideally we'd just use the creation time many
2539                  * file systems maintain for each file, but there is
2540                  * currently no usable API to query this, hence let's
2541                  * emulate this via extended attributes. If extended
2542                  * attributes are not supported we'll just skip this,
2543                  * and rely solely on mtime/atime/ctime of the file. */
2544
2545                 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2546                 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2547
2548 #ifdef HAVE_GCRYPT
2549                 /* Try to load the FSPRG state, and if we can't, then
2550                  * just don't do sealing */
2551                 if (f->seal) {
2552                         r = journal_file_fss_load(f);
2553                         if (r < 0)
2554                                 f->seal = false;
2555                 }
2556 #endif
2557
2558                 r = journal_file_init_header(f, template);
2559                 if (r < 0)
2560                         goto fail;
2561
2562                 if (fstat(f->fd, &f->last_stat) < 0) {
2563                         r = -errno;
2564                         goto fail;
2565                 }
2566
2567                 newly_created = true;
2568         }
2569
2570         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2571                 r = -EIO;
2572                 goto fail;
2573         }
2574
2575         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2576         if (f->header == MAP_FAILED) {
2577                 f->header = NULL;
2578                 r = -errno;
2579                 goto fail;
2580         }
2581
2582         if (!newly_created) {
2583                 r = journal_file_verify_header(f);
2584                 if (r < 0)
2585                         goto fail;
2586         }
2587
2588 #ifdef HAVE_GCRYPT
2589         if (!newly_created && f->writable) {
2590                 r = journal_file_fss_load(f);
2591                 if (r < 0)
2592                         goto fail;
2593         }
2594 #endif
2595
2596         if (f->writable) {
2597                 if (metrics) {
2598                         journal_default_metrics(metrics, f->fd);
2599                         f->metrics = *metrics;
2600                 } else if (template)
2601                         f->metrics = template->metrics;
2602
2603                 r = journal_file_refresh_header(f);
2604                 if (r < 0)
2605                         goto fail;
2606         }
2607
2608 #ifdef HAVE_GCRYPT
2609         r = journal_file_hmac_setup(f);
2610         if (r < 0)
2611                 goto fail;
2612 #endif
2613
2614         if (newly_created) {
2615                 r = journal_file_setup_field_hash_table(f);
2616                 if (r < 0)
2617                         goto fail;
2618
2619                 r = journal_file_setup_data_hash_table(f);
2620                 if (r < 0)
2621                         goto fail;
2622
2623 #ifdef HAVE_GCRYPT
2624                 r = journal_file_append_first_tag(f);
2625                 if (r < 0)
2626                         goto fail;
2627 #endif
2628         }
2629
2630         r = journal_file_map_field_hash_table(f);
2631         if (r < 0)
2632                 goto fail;
2633
2634         r = journal_file_map_data_hash_table(f);
2635         if (r < 0)
2636                 goto fail;
2637
2638         *ret = f;
2639         return 0;
2640
2641 fail:
2642         journal_file_close(f);
2643
2644         return r;
2645 }
2646
2647 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2648         _cleanup_free_ char *p = NULL;
2649         size_t l;
2650         JournalFile *old_file, *new_file = NULL;
2651         int r;
2652
2653         assert(f);
2654         assert(*f);
2655
2656         old_file = *f;
2657
2658         if (!old_file->writable)
2659                 return -EINVAL;
2660
2661         if (!endswith(old_file->path, ".journal"))
2662                 return -EINVAL;
2663
2664         l = strlen(old_file->path);
2665         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2666                      (int) l - 8, old_file->path,
2667                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2668                      le64toh((*f)->header->head_entry_seqnum),
2669                      le64toh((*f)->header->head_entry_realtime));
2670         if (r < 0)
2671                 return -ENOMEM;
2672
2673         r = rename(old_file->path, p);
2674         if (r < 0)
2675                 return -errno;
2676
2677         old_file->header->state = STATE_ARCHIVED;
2678
2679         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2680         journal_file_close(old_file);
2681
2682         *f = new_file;
2683         return r;
2684 }
2685
2686 int journal_file_open_reliably(
2687                 const char *fname,
2688                 int flags,
2689                 mode_t mode,
2690                 bool compress,
2691                 bool seal,
2692                 JournalMetrics *metrics,
2693                 MMapCache *mmap_cache,
2694                 JournalFile *template,
2695                 JournalFile **ret) {
2696
2697         int r;
2698         size_t l;
2699         _cleanup_free_ char *p = NULL;
2700
2701         r = journal_file_open(fname, flags, mode, compress, seal,
2702                               metrics, mmap_cache, template, ret);
2703         if (r != -EBADMSG && /* corrupted */
2704             r != -ENODATA && /* truncated */
2705             r != -EHOSTDOWN && /* other machine */
2706             r != -EPROTONOSUPPORT && /* incompatible feature */
2707             r != -EBUSY && /* unclean shutdown */
2708             r != -ESHUTDOWN /* already archived */)
2709                 return r;
2710
2711         if ((flags & O_ACCMODE) == O_RDONLY)
2712                 return r;
2713
2714         if (!(flags & O_CREAT))
2715                 return r;
2716
2717         if (!endswith(fname, ".journal"))
2718                 return r;
2719
2720         /* The file is corrupted. Rotate it away and try it again (but only once) */
2721
2722         l = strlen(fname);
2723         if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2724                      (int) l - 8, fname,
2725                      (unsigned long long) now(CLOCK_REALTIME),
2726                      random_u64()) < 0)
2727                 return -ENOMEM;
2728
2729         r = rename(fname, p);
2730         if (r < 0)
2731                 return -errno;
2732
2733         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2734
2735         return journal_file_open(fname, flags, mode, compress, seal,
2736                                  metrics, mmap_cache, template, ret);
2737 }
2738
2739 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2740         uint64_t i, n;
2741         uint64_t q, xor_hash = 0;
2742         int r;
2743         EntryItem *items;
2744         dual_timestamp ts;
2745
2746         assert(from);
2747         assert(to);
2748         assert(o);
2749         assert(p);
2750
2751         if (!to->writable)
2752                 return -EPERM;
2753
2754         ts.monotonic = le64toh(o->entry.monotonic);
2755         ts.realtime = le64toh(o->entry.realtime);
2756
2757         n = journal_file_entry_n_items(o);
2758         /* alloca() can't take 0, hence let's allocate at least one */
2759         items = alloca(sizeof(EntryItem) * MAX(1u, n));
2760
2761         for (i = 0; i < n; i++) {
2762                 uint64_t l, h;
2763                 le64_t le_hash;
2764                 size_t t;
2765                 void *data;
2766                 Object *u;
2767
2768                 q = le64toh(o->entry.items[i].object_offset);
2769                 le_hash = o->entry.items[i].hash;
2770
2771                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2772                 if (r < 0)
2773                         return r;
2774
2775                 if (le_hash != o->data.hash)
2776                         return -EBADMSG;
2777
2778                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2779                 t = (size_t) l;
2780
2781                 /* We hit the limit on 32bit machines */
2782                 if ((uint64_t) t != l)
2783                         return -E2BIG;
2784
2785                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2786 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2787                         size_t rsize;
2788
2789                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2790                                             o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2791                         if (r < 0)
2792                                 return r;
2793
2794                         data = from->compress_buffer;
2795                         l = rsize;
2796 #else
2797                         return -EPROTONOSUPPORT;
2798 #endif
2799                 } else
2800                         data = o->data.payload;
2801
2802                 r = journal_file_append_data(to, data, l, &u, &h);
2803                 if (r < 0)
2804                         return r;
2805
2806                 xor_hash ^= le64toh(u->data.hash);
2807                 items[i].object_offset = htole64(h);
2808                 items[i].hash = u->data.hash;
2809
2810                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2811                 if (r < 0)
2812                         return r;
2813         }
2814
2815         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2816 }
2817
2818 void journal_default_metrics(JournalMetrics *m, int fd) {
2819         uint64_t fs_size = 0;
2820         struct statvfs ss;
2821         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2822
2823         assert(m);
2824         assert(fd >= 0);
2825
2826         if (fstatvfs(fd, &ss) >= 0)
2827                 fs_size = ss.f_frsize * ss.f_blocks;
2828
2829         if (m->max_use == (uint64_t) -1) {
2830
2831                 if (fs_size > 0) {
2832                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2833
2834                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2835                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2836
2837                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2838                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2839                 } else
2840                         m->max_use = DEFAULT_MAX_USE_LOWER;
2841         } else {
2842                 m->max_use = PAGE_ALIGN(m->max_use);
2843
2844                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2845                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2846         }
2847
2848         if (m->max_size == (uint64_t) -1) {
2849                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2850
2851                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2852                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2853         } else
2854                 m->max_size = PAGE_ALIGN(m->max_size);
2855
2856         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2857                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2858
2859         if (m->max_size*2 > m->max_use)
2860                 m->max_use = m->max_size*2;
2861
2862         if (m->min_size == (uint64_t) -1)
2863                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2864         else {
2865                 m->min_size = PAGE_ALIGN(m->min_size);
2866
2867                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2868                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2869
2870                 if (m->min_size > m->max_size)
2871                         m->max_size = m->min_size;
2872         }
2873
2874         if (m->keep_free == (uint64_t) -1) {
2875
2876                 if (fs_size > 0) {
2877                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2878
2879                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2880                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2881
2882                 } else
2883                         m->keep_free = DEFAULT_KEEP_FREE;
2884         }
2885
2886         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2887                   format_bytes(a, sizeof(a), m->max_use),
2888                   format_bytes(b, sizeof(b), m->max_size),
2889                   format_bytes(c, sizeof(c), m->min_size),
2890                   format_bytes(d, sizeof(d), m->keep_free));
2891 }
2892
2893 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2894         assert(f);
2895         assert(from || to);
2896
2897         if (from) {
2898                 if (f->header->head_entry_realtime == 0)
2899                         return -ENOENT;
2900
2901                 *from = le64toh(f->header->head_entry_realtime);
2902         }
2903
2904         if (to) {
2905                 if (f->header->tail_entry_realtime == 0)
2906                         return -ENOENT;
2907
2908                 *to = le64toh(f->header->tail_entry_realtime);
2909         }
2910
2911         return 1;
2912 }
2913
2914 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2915         Object *o;
2916         uint64_t p;
2917         int r;
2918
2919         assert(f);
2920         assert(from || to);
2921
2922         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2923         if (r <= 0)
2924                 return r;
2925
2926         if (le64toh(o->data.n_entries) <= 0)
2927                 return 0;
2928
2929         if (from) {
2930                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2931                 if (r < 0)
2932                         return r;
2933
2934                 *from = le64toh(o->entry.monotonic);
2935         }
2936
2937         if (to) {
2938                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2939                 if (r < 0)
2940                         return r;
2941
2942                 r = generic_array_get_plus_one(f,
2943                                                le64toh(o->data.entry_offset),
2944                                                le64toh(o->data.entry_array_offset),
2945                                                le64toh(o->data.n_entries)-1,
2946                                                &o, NULL);
2947                 if (r <= 0)
2948                         return r;
2949
2950                 *to = le64toh(o->entry.monotonic);
2951         }
2952
2953         return 1;
2954 }
2955
2956 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2957         assert(f);
2958
2959         /* If we gained new header fields we gained new features,
2960          * hence suggest a rotation */
2961         if (le64toh(f->header->header_size) < sizeof(Header)) {
2962                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2963                 return true;
2964         }
2965
2966         /* Let's check if the hash tables grew over a certain fill
2967          * level (75%, borrowing this value from Java's hash table
2968          * implementation), and if so suggest a rotation. To calculate
2969          * the fill level we need the n_data field, which only exists
2970          * in newer versions. */
2971
2972         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2973                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2974                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2975                                   f->path,
2976                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2977                                   le64toh(f->header->n_data),
2978                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2979                                   (unsigned long long) f->last_stat.st_size,
2980                                   f->last_stat.st_size / le64toh(f->header->n_data));
2981                         return true;
2982                 }
2983
2984         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2985                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2986                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2987                                   f->path,
2988                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2989                                   le64toh(f->header->n_fields),
2990                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2991                         return true;
2992                 }
2993
2994         /* Are the data objects properly indexed by field objects? */
2995         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2996             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2997             le64toh(f->header->n_data) > 0 &&
2998             le64toh(f->header->n_fields) == 0)
2999                 return true;
3000
3001         if (max_file_usec > 0) {
3002                 usec_t t, h;
3003
3004                 h = le64toh(f->header->head_entry_realtime);
3005                 t = now(CLOCK_REALTIME);
3006
3007                 if (h > 0 && t > h + max_file_usec)
3008                         return true;
3009         }
3010
3011         return false;
3012 }