chiark / gitweb /
util: make creation time xattr logic more generic
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "journal-authenticate.h"
33 #include "lookup3.h"
34 #include "compress.h"
35 #include "fsprg.h"
36
/* Sizes, in bytes, of the data and field hash tables, expressed as a
 * number of HashItem slots. */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL * sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL * sizeof(HashItem))

/* Data payloads smaller than this many bytes are never compressed */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (4ULL * 1024ULL * 1024ULL)              /* 4 MiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL * 1024ULL * 1024ULL)              /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL * 1024ULL * 1024ULL * 1024ULL)    /* 4 GiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL * 1024ULL * 1024ULL)           /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL * 1024ULL * 1024ULL * 1024ULL)  /* 4 GiB */

/* This is the keep_free value when we can't determine the system
 * size */
#define DEFAULT_KEEP_FREE (1024ULL * 1024ULL)                         /* 1 MiB */

/* n_data was the first entry we added after the initial file format
 * design, hence a valid header is at least large enough to reach it */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20

/* Granularity by which the journal file is grown each time we need to
 * allocate more space. */
#define FILE_SIZE_INCREASE (8ULL * 1024ULL * 1024ULL)                 /* 8 MiB */
69
70 static int journal_file_set_online(JournalFile *f) {
71         assert(f);
72
73         if (!f->writable)
74                 return -EPERM;
75
76         if (!(f->fd >= 0 && f->header))
77                 return -EINVAL;
78
79         switch(f->header->state) {
80                 case STATE_ONLINE:
81                         return 0;
82
83                 case STATE_OFFLINE:
84                         f->header->state = STATE_ONLINE;
85                         fsync(f->fd);
86                         return 0;
87
88                 default:
89                         return -EINVAL;
90         }
91 }
92
93 int journal_file_set_offline(JournalFile *f) {
94         assert(f);
95
96         if (!f->writable)
97                 return -EPERM;
98
99         if (!(f->fd >= 0 && f->header))
100                 return -EINVAL;
101
102         if (f->header->state != STATE_ONLINE)
103                 return 0;
104
105         fsync(f->fd);
106
107         f->header->state = STATE_OFFLINE;
108
109         fsync(f->fd);
110
111         return 0;
112 }
113
114 void journal_file_close(JournalFile *f) {
115         assert(f);
116
117 #ifdef HAVE_GCRYPT
118         /* Write the final tag */
119         if (f->seal && f->writable)
120                 journal_file_append_tag(f);
121 #endif
122
123         /* Sync everything to disk, before we mark the file offline */
124         if (f->mmap && f->fd >= 0)
125                 mmap_cache_close_fd(f->mmap, f->fd);
126
127         journal_file_set_offline(f);
128
129         if (f->header)
130                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
131
132         safe_close(f->fd);
133         free(f->path);
134
135         if (f->mmap)
136                 mmap_cache_unref(f->mmap);
137
138         ordered_hashmap_free_free(f->chain_cache);
139
140 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
141         free(f->compress_buffer);
142 #endif
143
144 #ifdef HAVE_GCRYPT
145         if (f->fss_file)
146                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
147         else if (f->fsprg_state)
148                 free(f->fsprg_state);
149
150         free(f->fsprg_seed);
151
152         if (f->hmac)
153                 gcry_md_close(f->hmac);
154 #endif
155
156         free(f);
157 }
158
159 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
160         Header h = {};
161         ssize_t k;
162         int r;
163
164         assert(f);
165
166         memcpy(h.signature, HEADER_SIGNATURE, 8);
167         h.header_size = htole64(ALIGN64(sizeof(h)));
168
169         h.incompatible_flags |= htole32(
170                 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
171                 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
172
173         h.compatible_flags = htole32(
174                 f->seal * HEADER_COMPATIBLE_SEALED);
175
176         r = sd_id128_randomize(&h.file_id);
177         if (r < 0)
178                 return r;
179
180         if (template) {
181                 h.seqnum_id = template->header->seqnum_id;
182                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
183         } else
184                 h.seqnum_id = h.file_id;
185
186         k = pwrite(f->fd, &h, sizeof(h), 0);
187         if (k < 0)
188                 return -errno;
189
190         if (k != sizeof(h))
191                 return -EIO;
192
193         return 0;
194 }
195
196 static int journal_file_refresh_header(JournalFile *f) {
197         int r;
198         sd_id128_t boot_id;
199
200         assert(f);
201
202         r = sd_id128_get_machine(&f->header->machine_id);
203         if (r < 0)
204                 return r;
205
206         r = sd_id128_get_boot(&boot_id);
207         if (r < 0)
208                 return r;
209
210         if (sd_id128_equal(boot_id, f->header->boot_id))
211                 f->tail_entry_monotonic_valid = true;
212
213         f->header->boot_id = boot_id;
214
215         journal_file_set_online(f);
216
217         /* Sync the online state to disk */
218         fsync(f->fd);
219
220         return 0;
221 }
222
223 static int journal_file_verify_header(JournalFile *f) {
224         uint32_t flags;
225
226         assert(f);
227
228         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
229                 return -EBADMSG;
230
231         /* In both read and write mode we refuse to open files with
232          * incompatible flags we don't know */
233         flags = le32toh(f->header->incompatible_flags);
234         if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
235                 if (flags & ~HEADER_INCOMPATIBLE_ANY)
236                         log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
237                                   f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
238                 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
239                 if (flags)
240                         log_debug("Journal file %s uses incompatible flags %"PRIx32
241                                   " disabled at compilation time.", f->path, flags);
242                 return -EPROTONOSUPPORT;
243         }
244
245         /* When open for writing we refuse to open files with
246          * compatible flags, too */
247         flags = le32toh(f->header->compatible_flags);
248         if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
249                 if (flags & ~HEADER_COMPATIBLE_ANY)
250                         log_debug("Journal file %s has unknown compatible flags %"PRIx32,
251                                   f->path, flags & ~HEADER_COMPATIBLE_ANY);
252                 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
253                 if (flags)
254                         log_debug("Journal file %s uses compatible flags %"PRIx32
255                                   " disabled at compilation time.", f->path, flags);
256                 return -EPROTONOSUPPORT;
257         }
258
259         if (f->header->state >= _STATE_MAX)
260                 return -EBADMSG;
261
262         /* The first addition was n_data, so check that we are at least this large */
263         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
264                 return -EBADMSG;
265
266         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
267                 return -EBADMSG;
268
269         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
270                 return -ENODATA;
271
272         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
273                 return -ENODATA;
274
275         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
276             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
277             !VALID64(le64toh(f->header->tail_object_offset)) ||
278             !VALID64(le64toh(f->header->entry_array_offset)))
279                 return -ENODATA;
280
281         if (f->writable) {
282                 uint8_t state;
283                 sd_id128_t machine_id;
284                 int r;
285
286                 r = sd_id128_get_machine(&machine_id);
287                 if (r < 0)
288                         return r;
289
290                 if (!sd_id128_equal(machine_id, f->header->machine_id))
291                         return -EHOSTDOWN;
292
293                 state = f->header->state;
294
295                 if (state == STATE_ONLINE) {
296                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
297                         return -EBUSY;
298                 } else if (state == STATE_ARCHIVED)
299                         return -ESHUTDOWN;
300                 else if (state != STATE_OFFLINE) {
301                         log_debug("Journal file %s has unknown state %u.", f->path, state);
302                         return -EBUSY;
303                 }
304         }
305
306         f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
307         f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
308
309         f->seal = JOURNAL_HEADER_SEALED(f->header);
310
311         return 0;
312 }
313
314 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
315         uint64_t old_size, new_size;
316         int r;
317
318         assert(f);
319
320         /* We assume that this file is not sparse, and we know that
321          * for sure, since we always call posix_fallocate()
322          * ourselves */
323
324         old_size =
325                 le64toh(f->header->header_size) +
326                 le64toh(f->header->arena_size);
327
328         new_size = PAGE_ALIGN(offset + size);
329         if (new_size < le64toh(f->header->header_size))
330                 new_size = le64toh(f->header->header_size);
331
332         if (new_size <= old_size)
333                 return 0;
334
335         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
336                 return -E2BIG;
337
338         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
339                 struct statvfs svfs;
340
341                 if (fstatvfs(f->fd, &svfs) >= 0) {
342                         uint64_t available;
343
344                         available = svfs.f_bfree * svfs.f_bsize;
345
346                         if (available >= f->metrics.keep_free)
347                                 available -= f->metrics.keep_free;
348                         else
349                                 available = 0;
350
351                         if (new_size - old_size > available)
352                                 return -E2BIG;
353                 }
354         }
355
356         /* Increase by larger blocks at once */
357         new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
358         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
359                 new_size = f->metrics.max_size;
360
361         /* Note that the glibc fallocate() fallback is very
362            inefficient, hence we try to minimize the allocation area
363            as we can. */
364         r = posix_fallocate(f->fd, old_size, new_size - old_size);
365         if (r != 0)
366                 return -r;
367
368         if (fstat(f->fd, &f->last_stat) < 0)
369                 return -errno;
370
371         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
372
373         return 0;
374 }
375
376 static unsigned type_to_context(ObjectType type) {
377         /* One context for each type, plus one catch-all for the rest */
378         assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
379         return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
380 }
381
382 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
383         assert(f);
384         assert(ret);
385
386         if (size <= 0)
387                 return -EINVAL;
388
389         /* Avoid SIGBUS on invalid accesses */
390         if (offset + size > (uint64_t) f->last_stat.st_size) {
391                 /* Hmm, out of range? Let's refresh the fstat() data
392                  * first, before we trust that check. */
393
394                 if (fstat(f->fd, &f->last_stat) < 0 ||
395                     offset + size > (uint64_t) f->last_stat.st_size)
396                         return -EADDRNOTAVAIL;
397         }
398
399         return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
400 }
401
402 static uint64_t minimum_header_size(Object *o) {
403
404         static const uint64_t table[] = {
405                 [OBJECT_DATA] = sizeof(DataObject),
406                 [OBJECT_FIELD] = sizeof(FieldObject),
407                 [OBJECT_ENTRY] = sizeof(EntryObject),
408                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
409                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
410                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
411                 [OBJECT_TAG] = sizeof(TagObject),
412         };
413
414         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
415                 return sizeof(ObjectHeader);
416
417         return table[o->object.type];
418 }
419
420 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
421         int r;
422         void *t;
423         Object *o;
424         uint64_t s;
425
426         assert(f);
427         assert(ret);
428
429         /* Objects may only be located at multiple of 64 bit */
430         if (!VALID64(offset))
431                 return -EFAULT;
432
433         r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
434         if (r < 0)
435                 return r;
436
437         o = (Object*) t;
438         s = le64toh(o->object.size);
439
440         if (s < sizeof(ObjectHeader))
441                 return -EBADMSG;
442
443         if (o->object.type <= OBJECT_UNUSED)
444                 return -EBADMSG;
445
446         if (s < minimum_header_size(o))
447                 return -EBADMSG;
448
449         if (type > OBJECT_UNUSED && o->object.type != type)
450                 return -EBADMSG;
451
452         if (s > sizeof(ObjectHeader)) {
453                 r = journal_file_move_to(f, type, false, offset, s, &t);
454                 if (r < 0)
455                         return r;
456
457                 o = (Object*) t;
458         }
459
460         *ret = o;
461         return 0;
462 }
463
464 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
465         uint64_t r;
466
467         assert(f);
468
469         r = le64toh(f->header->tail_entry_seqnum) + 1;
470
471         if (seqnum) {
472                 /* If an external seqnum counter was passed, we update
473                  * both the local and the external one, and set it to
474                  * the maximum of both */
475
476                 if (*seqnum + 1 > r)
477                         r = *seqnum + 1;
478
479                 *seqnum = r;
480         }
481
482         f->header->tail_entry_seqnum = htole64(r);
483
484         if (f->header->head_entry_seqnum == 0)
485                 f->header->head_entry_seqnum = htole64(r);
486
487         return r;
488 }
489
490 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
491         int r;
492         uint64_t p;
493         Object *tail, *o;
494         void *t;
495
496         assert(f);
497         assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
498         assert(size >= sizeof(ObjectHeader));
499         assert(offset);
500         assert(ret);
501
502         r = journal_file_set_online(f);
503         if (r < 0)
504                 return r;
505
506         p = le64toh(f->header->tail_object_offset);
507         if (p == 0)
508                 p = le64toh(f->header->header_size);
509         else {
510                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
511                 if (r < 0)
512                         return r;
513
514                 p += ALIGN64(le64toh(tail->object.size));
515         }
516
517         r = journal_file_allocate(f, p, size);
518         if (r < 0)
519                 return r;
520
521         r = journal_file_move_to(f, type, false, p, size, &t);
522         if (r < 0)
523                 return r;
524
525         o = (Object*) t;
526
527         zero(o->object);
528         o->object.type = type;
529         o->object.size = htole64(size);
530
531         f->header->tail_object_offset = htole64(p);
532         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
533
534         *ret = o;
535         *offset = p;
536
537         return 0;
538 }
539
540 static int journal_file_setup_data_hash_table(JournalFile *f) {
541         uint64_t s, p;
542         Object *o;
543         int r;
544
545         assert(f);
546
547         /* We estimate that we need 1 hash table entry per 768 of
548            journal file and we want to make sure we never get beyond
549            75% fill level. Calculate the hash table size for the
550            maximum file size based on these metrics. */
551
552         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
553         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
554                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
555
556         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
557
558         r = journal_file_append_object(f,
559                                        OBJECT_DATA_HASH_TABLE,
560                                        offsetof(Object, hash_table.items) + s,
561                                        &o, &p);
562         if (r < 0)
563                 return r;
564
565         memzero(o->hash_table.items, s);
566
567         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
568         f->header->data_hash_table_size = htole64(s);
569
570         return 0;
571 }
572
573 static int journal_file_setup_field_hash_table(JournalFile *f) {
574         uint64_t s, p;
575         Object *o;
576         int r;
577
578         assert(f);
579
580         /* We use a fixed size hash table for the fields as this
581          * number should grow very slowly only */
582
583         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
584         r = journal_file_append_object(f,
585                                        OBJECT_FIELD_HASH_TABLE,
586                                        offsetof(Object, hash_table.items) + s,
587                                        &o, &p);
588         if (r < 0)
589                 return r;
590
591         memzero(o->hash_table.items, s);
592
593         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
594         f->header->field_hash_table_size = htole64(s);
595
596         return 0;
597 }
598
599 static int journal_file_map_data_hash_table(JournalFile *f) {
600         uint64_t s, p;
601         void *t;
602         int r;
603
604         assert(f);
605
606         p = le64toh(f->header->data_hash_table_offset);
607         s = le64toh(f->header->data_hash_table_size);
608
609         r = journal_file_move_to(f,
610                                  OBJECT_DATA_HASH_TABLE,
611                                  true,
612                                  p, s,
613                                  &t);
614         if (r < 0)
615                 return r;
616
617         f->data_hash_table = t;
618         return 0;
619 }
620
621 static int journal_file_map_field_hash_table(JournalFile *f) {
622         uint64_t s, p;
623         void *t;
624         int r;
625
626         assert(f);
627
628         p = le64toh(f->header->field_hash_table_offset);
629         s = le64toh(f->header->field_hash_table_size);
630
631         r = journal_file_move_to(f,
632                                  OBJECT_FIELD_HASH_TABLE,
633                                  true,
634                                  p, s,
635                                  &t);
636         if (r < 0)
637                 return r;
638
639         f->field_hash_table = t;
640         return 0;
641 }
642
643 static int journal_file_link_field(
644                 JournalFile *f,
645                 Object *o,
646                 uint64_t offset,
647                 uint64_t hash) {
648
649         uint64_t p, h;
650         int r;
651
652         assert(f);
653         assert(o);
654         assert(offset > 0);
655
656         if (o->object.type != OBJECT_FIELD)
657                 return -EINVAL;
658
659         /* This might alter the window we are looking at */
660
661         o->field.next_hash_offset = o->field.head_data_offset = 0;
662
663         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
664         p = le64toh(f->field_hash_table[h].tail_hash_offset);
665         if (p == 0)
666                 f->field_hash_table[h].head_hash_offset = htole64(offset);
667         else {
668                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
669                 if (r < 0)
670                         return r;
671
672                 o->field.next_hash_offset = htole64(offset);
673         }
674
675         f->field_hash_table[h].tail_hash_offset = htole64(offset);
676
677         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
678                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
679
680         return 0;
681 }
682
683 static int journal_file_link_data(
684                 JournalFile *f,
685                 Object *o,
686                 uint64_t offset,
687                 uint64_t hash) {
688
689         uint64_t p, h;
690         int r;
691
692         assert(f);
693         assert(o);
694         assert(offset > 0);
695
696         if (o->object.type != OBJECT_DATA)
697                 return -EINVAL;
698
699         /* This might alter the window we are looking at */
700
701         o->data.next_hash_offset = o->data.next_field_offset = 0;
702         o->data.entry_offset = o->data.entry_array_offset = 0;
703         o->data.n_entries = 0;
704
705         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
706         p = le64toh(f->data_hash_table[h].tail_hash_offset);
707         if (p == 0)
708                 /* Only entry in the hash table is easy */
709                 f->data_hash_table[h].head_hash_offset = htole64(offset);
710         else {
711                 /* Move back to the previous data object, to patch in
712                  * pointer */
713
714                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
715                 if (r < 0)
716                         return r;
717
718                 o->data.next_hash_offset = htole64(offset);
719         }
720
721         f->data_hash_table[h].tail_hash_offset = htole64(offset);
722
723         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
724                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
725
726         return 0;
727 }
728
729 int journal_file_find_field_object_with_hash(
730                 JournalFile *f,
731                 const void *field, uint64_t size, uint64_t hash,
732                 Object **ret, uint64_t *offset) {
733
734         uint64_t p, osize, h;
735         int r;
736
737         assert(f);
738         assert(field && size > 0);
739
740         osize = offsetof(Object, field.payload) + size;
741
742         if (f->header->field_hash_table_size == 0)
743                 return -EBADMSG;
744
745         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
746         p = le64toh(f->field_hash_table[h].head_hash_offset);
747
748         while (p > 0) {
749                 Object *o;
750
751                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
752                 if (r < 0)
753                         return r;
754
755                 if (le64toh(o->field.hash) == hash &&
756                     le64toh(o->object.size) == osize &&
757                     memcmp(o->field.payload, field, size) == 0) {
758
759                         if (ret)
760                                 *ret = o;
761                         if (offset)
762                                 *offset = p;
763
764                         return 1;
765                 }
766
767                 p = le64toh(o->field.next_hash_offset);
768         }
769
770         return 0;
771 }
772
773 int journal_file_find_field_object(
774                 JournalFile *f,
775                 const void *field, uint64_t size,
776                 Object **ret, uint64_t *offset) {
777
778         uint64_t hash;
779
780         assert(f);
781         assert(field && size > 0);
782
783         hash = hash64(field, size);
784
785         return journal_file_find_field_object_with_hash(f,
786                                                         field, size, hash,
787                                                         ret, offset);
788 }
789
790 int journal_file_find_data_object_with_hash(
791                 JournalFile *f,
792                 const void *data, uint64_t size, uint64_t hash,
793                 Object **ret, uint64_t *offset) {
794
795         uint64_t p, osize, h;
796         int r;
797
798         assert(f);
799         assert(data || size == 0);
800
801         osize = offsetof(Object, data.payload) + size;
802
803         if (f->header->data_hash_table_size == 0)
804                 return -EBADMSG;
805
806         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
807         p = le64toh(f->data_hash_table[h].head_hash_offset);
808
809         while (p > 0) {
810                 Object *o;
811
812                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
813                 if (r < 0)
814                         return r;
815
816                 if (le64toh(o->data.hash) != hash)
817                         goto next;
818
819                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
820 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
821                         uint64_t l;
822                         size_t rsize;
823
824                         l = le64toh(o->object.size);
825                         if (l <= offsetof(Object, data.payload))
826                                 return -EBADMSG;
827
828                         l -= offsetof(Object, data.payload);
829
830                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
831                                             o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
832                         if (r < 0)
833                                 return r;
834
835                         if (rsize == size &&
836                             memcmp(f->compress_buffer, data, size) == 0) {
837
838                                 if (ret)
839                                         *ret = o;
840
841                                 if (offset)
842                                         *offset = p;
843
844                                 return 1;
845                         }
846 #else
847                         return -EPROTONOSUPPORT;
848 #endif
849                 } else if (le64toh(o->object.size) == osize &&
850                            memcmp(o->data.payload, data, size) == 0) {
851
852                         if (ret)
853                                 *ret = o;
854
855                         if (offset)
856                                 *offset = p;
857
858                         return 1;
859                 }
860
861         next:
862                 p = le64toh(o->data.next_hash_offset);
863         }
864
865         return 0;
866 }
867
868 int journal_file_find_data_object(
869                 JournalFile *f,
870                 const void *data, uint64_t size,
871                 Object **ret, uint64_t *offset) {
872
873         uint64_t hash;
874
875         assert(f);
876         assert(data || size == 0);
877
878         hash = hash64(data, size);
879
880         return journal_file_find_data_object_with_hash(f,
881                                                        data, size, hash,
882                                                        ret, offset);
883 }
884
885 static int journal_file_append_field(
886                 JournalFile *f,
887                 const void *field, uint64_t size,
888                 Object **ret, uint64_t *offset) {
889
890         uint64_t hash, p;
891         uint64_t osize;
892         Object *o;
893         int r;
894
895         assert(f);
896         assert(field && size > 0);
897
898         hash = hash64(field, size);
899
900         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
901         if (r < 0)
902                 return r;
903         else if (r > 0) {
904
905                 if (ret)
906                         *ret = o;
907
908                 if (offset)
909                         *offset = p;
910
911                 return 0;
912         }
913
914         osize = offsetof(Object, field.payload) + size;
915         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
916         if (r < 0)
917                 return r;
918
919         o->field.hash = htole64(hash);
920         memcpy(o->field.payload, field, size);
921
922         r = journal_file_link_field(f, o, p, hash);
923         if (r < 0)
924                 return r;
925
926         /* The linking might have altered the window, so let's
927          * refresh our pointer */
928         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
929         if (r < 0)
930                 return r;
931
932 #ifdef HAVE_GCRYPT
933         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
934         if (r < 0)
935                 return r;
936 #endif
937
938         if (ret)
939                 *ret = o;
940
941         if (offset)
942                 *offset = p;
943
944         return 0;
945 }
946
947 static int journal_file_append_data(
948                 JournalFile *f,
949                 const void *data, uint64_t size,
950                 Object **ret, uint64_t *offset) {
951
952         uint64_t hash, p;
953         uint64_t osize;
954         Object *o;
955         int r, compression = 0;
956         const void *eq;
957
958         assert(f);
959         assert(data || size == 0);
960
961         hash = hash64(data, size);
962
963         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
964         if (r < 0)
965                 return r;
966         else if (r > 0) {
967
968                 if (ret)
969                         *ret = o;
970
971                 if (offset)
972                         *offset = p;
973
974                 return 0;
975         }
976
977         osize = offsetof(Object, data.payload) + size;
978         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
979         if (r < 0)
980                 return r;
981
982         o->data.hash = htole64(hash);
983
984 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
985         if (f->compress_xz &&
986             size >= COMPRESSION_SIZE_THRESHOLD) {
987                 size_t rsize;
988
989                 compression = compress_blob(data, size, o->data.payload, &rsize);
990
991                 if (compression) {
992                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
993                         o->object.flags |= compression;
994
995                         log_debug("Compressed data object %"PRIu64" -> %zu using %s",
996                                   size, rsize, object_compressed_to_string(compression));
997                 }
998         }
999 #endif
1000
1001         if (!compression && size > 0)
1002                 memcpy(o->data.payload, data, size);
1003
1004         r = journal_file_link_data(f, o, p, hash);
1005         if (r < 0)
1006                 return r;
1007
1008         /* The linking might have altered the window, so let's
1009          * refresh our pointer */
1010         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1011         if (r < 0)
1012                 return r;
1013
1014         if (!data)
1015                 eq = NULL;
1016         else
1017                 eq = memchr(data, '=', size);
1018         if (eq && eq > data) {
1019                 Object *fo = NULL;
1020                 uint64_t fp;
1021
1022                 /* Create field object ... */
1023                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1024                 if (r < 0)
1025                         return r;
1026
1027                 /* ... and link it in. */
1028                 o->data.next_field_offset = fo->field.head_data_offset;
1029                 fo->field.head_data_offset = le64toh(p);
1030         }
1031
1032 #ifdef HAVE_GCRYPT
1033         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1034         if (r < 0)
1035                 return r;
1036 #endif
1037
1038         if (ret)
1039                 *ret = o;
1040
1041         if (offset)
1042                 *offset = p;
1043
1044         return 0;
1045 }
1046
1047 uint64_t journal_file_entry_n_items(Object *o) {
1048         assert(o);
1049
1050         if (o->object.type != OBJECT_ENTRY)
1051                 return 0;
1052
1053         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1054 }
1055
1056 uint64_t journal_file_entry_array_n_items(Object *o) {
1057         assert(o);
1058
1059         if (o->object.type != OBJECT_ENTRY_ARRAY)
1060                 return 0;
1061
1062         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1063 }
1064
1065 uint64_t journal_file_hash_table_n_items(Object *o) {
1066         assert(o);
1067
1068         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1069             o->object.type != OBJECT_FIELD_HASH_TABLE)
1070                 return 0;
1071
1072         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1073 }
1074
1075 static int link_entry_into_array(JournalFile *f,
1076                                  le64_t *first,
1077                                  le64_t *idx,
1078                                  uint64_t p) {
1079         int r;
1080         uint64_t n = 0, ap = 0, q, i, a, hidx;
1081         Object *o;
1082
1083         assert(f);
1084         assert(first);
1085         assert(idx);
1086         assert(p > 0);
1087
1088         a = le64toh(*first);
1089         i = hidx = le64toh(*idx);
1090         while (a > 0) {
1091
1092                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1093                 if (r < 0)
1094                         return r;
1095
1096                 n = journal_file_entry_array_n_items(o);
1097                 if (i < n) {
1098                         o->entry_array.items[i] = htole64(p);
1099                         *idx = htole64(hidx + 1);
1100                         return 0;
1101                 }
1102
1103                 i -= n;
1104                 ap = a;
1105                 a = le64toh(o->entry_array.next_entry_array_offset);
1106         }
1107
1108         if (hidx > n)
1109                 n = (hidx+1) * 2;
1110         else
1111                 n = n * 2;
1112
1113         if (n < 4)
1114                 n = 4;
1115
1116         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1117                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1118                                        &o, &q);
1119         if (r < 0)
1120                 return r;
1121
1122 #ifdef HAVE_GCRYPT
1123         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1124         if (r < 0)
1125                 return r;
1126 #endif
1127
1128         o->entry_array.items[i] = htole64(p);
1129
1130         if (ap == 0)
1131                 *first = htole64(q);
1132         else {
1133                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1134                 if (r < 0)
1135                         return r;
1136
1137                 o->entry_array.next_entry_array_offset = htole64(q);
1138         }
1139
1140         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1141                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1142
1143         *idx = htole64(hidx + 1);
1144
1145         return 0;
1146 }
1147
1148 static int link_entry_into_array_plus_one(JournalFile *f,
1149                                           le64_t *extra,
1150                                           le64_t *first,
1151                                           le64_t *idx,
1152                                           uint64_t p) {
1153
1154         int r;
1155
1156         assert(f);
1157         assert(extra);
1158         assert(first);
1159         assert(idx);
1160         assert(p > 0);
1161
1162         if (*idx == 0)
1163                 *extra = htole64(p);
1164         else {
1165                 le64_t i;
1166
1167                 i = htole64(le64toh(*idx) - 1);
1168                 r = link_entry_into_array(f, first, &i, p);
1169                 if (r < 0)
1170                         return r;
1171         }
1172
1173         *idx = htole64(le64toh(*idx) + 1);
1174         return 0;
1175 }
1176
1177 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1178         uint64_t p;
1179         int r;
1180         assert(f);
1181         assert(o);
1182         assert(offset > 0);
1183
1184         p = le64toh(o->entry.items[i].object_offset);
1185         if (p == 0)
1186                 return -EINVAL;
1187
1188         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1189         if (r < 0)
1190                 return r;
1191
1192         return link_entry_into_array_plus_one(f,
1193                                               &o->data.entry_offset,
1194                                               &o->data.entry_array_offset,
1195                                               &o->data.n_entries,
1196                                               offset);
1197 }
1198
1199 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1200         uint64_t n, i;
1201         int r;
1202
1203         assert(f);
1204         assert(o);
1205         assert(offset > 0);
1206
1207         if (o->object.type != OBJECT_ENTRY)
1208                 return -EINVAL;
1209
1210         __sync_synchronize();
1211
1212         /* Link up the entry itself */
1213         r = link_entry_into_array(f,
1214                                   &f->header->entry_array_offset,
1215                                   &f->header->n_entries,
1216                                   offset);
1217         if (r < 0)
1218                 return r;
1219
1220         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1221
1222         if (f->header->head_entry_realtime == 0)
1223                 f->header->head_entry_realtime = o->entry.realtime;
1224
1225         f->header->tail_entry_realtime = o->entry.realtime;
1226         f->header->tail_entry_monotonic = o->entry.monotonic;
1227
1228         f->tail_entry_monotonic_valid = true;
1229
1230         /* Link up the items */
1231         n = journal_file_entry_n_items(o);
1232         for (i = 0; i < n; i++) {
1233                 r = journal_file_link_entry_item(f, o, offset, i);
1234                 if (r < 0)
1235                         return r;
1236         }
1237
1238         return 0;
1239 }
1240
1241 static int journal_file_append_entry_internal(
1242                 JournalFile *f,
1243                 const dual_timestamp *ts,
1244                 uint64_t xor_hash,
1245                 const EntryItem items[], unsigned n_items,
1246                 uint64_t *seqnum,
1247                 Object **ret, uint64_t *offset) {
1248         uint64_t np;
1249         uint64_t osize;
1250         Object *o;
1251         int r;
1252
1253         assert(f);
1254         assert(items || n_items == 0);
1255         assert(ts);
1256
1257         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1258
1259         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1260         if (r < 0)
1261                 return r;
1262
1263         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1264         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1265         o->entry.realtime = htole64(ts->realtime);
1266         o->entry.monotonic = htole64(ts->monotonic);
1267         o->entry.xor_hash = htole64(xor_hash);
1268         o->entry.boot_id = f->header->boot_id;
1269
1270 #ifdef HAVE_GCRYPT
1271         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1272         if (r < 0)
1273                 return r;
1274 #endif
1275
1276         r = journal_file_link_entry(f, o, np);
1277         if (r < 0)
1278                 return r;
1279
1280         if (ret)
1281                 *ret = o;
1282
1283         if (offset)
1284                 *offset = np;
1285
1286         return 0;
1287 }
1288
1289 void journal_file_post_change(JournalFile *f) {
1290         assert(f);
1291
1292         /* inotify() does not receive IN_MODIFY events from file
1293          * accesses done via mmap(). After each access we hence
1294          * trigger IN_MODIFY by truncating the journal file to its
1295          * current size which triggers IN_MODIFY. */
1296
1297         __sync_synchronize();
1298
1299         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1300                 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1301 }
1302
1303 static int entry_item_cmp(const void *_a, const void *_b) {
1304         const EntryItem *a = _a, *b = _b;
1305
1306         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1307                 return -1;
1308         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1309                 return 1;
1310         return 0;
1311 }
1312
1313 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1314         unsigned i;
1315         EntryItem *items;
1316         int r;
1317         uint64_t xor_hash = 0;
1318         struct dual_timestamp _ts;
1319
1320         assert(f);
1321         assert(iovec || n_iovec == 0);
1322
1323         if (!ts) {
1324                 dual_timestamp_get(&_ts);
1325                 ts = &_ts;
1326         }
1327
1328         if (f->tail_entry_monotonic_valid &&
1329             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1330                 return -EINVAL;
1331
1332 #ifdef HAVE_GCRYPT
1333         r = journal_file_maybe_append_tag(f, ts->realtime);
1334         if (r < 0)
1335                 return r;
1336 #endif
1337
1338         /* alloca() can't take 0, hence let's allocate at least one */
1339         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1340
1341         for (i = 0; i < n_iovec; i++) {
1342                 uint64_t p;
1343                 Object *o;
1344
1345                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1346                 if (r < 0)
1347                         return r;
1348
1349                 xor_hash ^= le64toh(o->data.hash);
1350                 items[i].object_offset = htole64(p);
1351                 items[i].hash = o->data.hash;
1352         }
1353
1354         /* Order by the position on disk, in order to improve seek
1355          * times for rotating media. */
1356         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1357
1358         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1359
1360         journal_file_post_change(f);
1361
1362         return r;
1363 }
1364
/* One cached position inside a chain of entry arrays. The 'first'
 * member doubles as the key under which the item is stored in the
 * chain cache hashmap.
 *
 *   first      - offset of the array at the beginning of the chain
 *   array      - offset of the cached array
 *   begin      - the first item stored in the cached array
 *   total      - number of items in all arrays of the chain that
 *                precede the cached one
 *   last_index - the last index looked at, to optimize locality when
 *                bisecting
 */
struct ChainCacheItem {
        uint64_t first;
        uint64_t array;
        uint64_t begin;
        uint64_t total;
        uint64_t last_index;
};

typedef struct ChainCacheItem ChainCacheItem;
1372
1373 static void chain_cache_put(
1374                 OrderedHashmap *h,
1375                 ChainCacheItem *ci,
1376                 uint64_t first,
1377                 uint64_t array,
1378                 uint64_t begin,
1379                 uint64_t total,
1380                 uint64_t last_index) {
1381
1382         if (!ci) {
1383                 /* If the chain item to cache for this chain is the
1384                  * first one it's not worth caching anything */
1385                 if (array == first)
1386                         return;
1387
1388                 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1389                         ci = ordered_hashmap_steal_first(h);
1390                         assert(ci);
1391                 } else {
1392                         ci = new(ChainCacheItem, 1);
1393                         if (!ci)
1394                                 return;
1395                 }
1396
1397                 ci->first = first;
1398
1399                 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1400                         free(ci);
1401                         return;
1402                 }
1403         } else
1404                 assert(ci->first == first);
1405
1406         ci->array = array;
1407         ci->begin = begin;
1408         ci->total = total;
1409         ci->last_index = last_index;
1410 }
1411
1412 static int generic_array_get(
1413                 JournalFile *f,
1414                 uint64_t first,
1415                 uint64_t i,
1416                 Object **ret, uint64_t *offset) {
1417
1418         Object *o;
1419         uint64_t p = 0, a, t = 0;
1420         int r;
1421         ChainCacheItem *ci;
1422
1423         assert(f);
1424
1425         a = first;
1426
1427         /* Try the chain cache first */
1428         ci = ordered_hashmap_get(f->chain_cache, &first);
1429         if (ci && i > ci->total) {
1430                 a = ci->array;
1431                 i -= ci->total;
1432                 t = ci->total;
1433         }
1434
1435         while (a > 0) {
1436                 uint64_t k;
1437
1438                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1439                 if (r < 0)
1440                         return r;
1441
1442                 k = journal_file_entry_array_n_items(o);
1443                 if (i < k) {
1444                         p = le64toh(o->entry_array.items[i]);
1445                         goto found;
1446                 }
1447
1448                 i -= k;
1449                 t += k;
1450                 a = le64toh(o->entry_array.next_entry_array_offset);
1451         }
1452
1453         return 0;
1454
1455 found:
1456         /* Let's cache this item for the next invocation */
1457         chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1458
1459         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1460         if (r < 0)
1461                 return r;
1462
1463         if (ret)
1464                 *ret = o;
1465
1466         if (offset)
1467                 *offset = p;
1468
1469         return 1;
1470 }
1471
1472 static int generic_array_get_plus_one(
1473                 JournalFile *f,
1474                 uint64_t extra,
1475                 uint64_t first,
1476                 uint64_t i,
1477                 Object **ret, uint64_t *offset) {
1478
1479         Object *o;
1480
1481         assert(f);
1482
1483         if (i == 0) {
1484                 int r;
1485
1486                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1487                 if (r < 0)
1488                         return r;
1489
1490                 if (ret)
1491                         *ret = o;
1492
1493                 if (offset)
1494                         *offset = extra;
1495
1496                 return 1;
1497         }
1498
1499         return generic_array_get(f, first, i-1, ret, offset);
1500 }
1501
/* Results of the test_object callbacks used when bisecting entry
 * arrays. Negative return values of these callbacks are errors. */
enum {
        TEST_FOUND = 0, /* the tested entry matches the needle */
        TEST_LEFT  = 1, /* the tested entry sorts before the needle */
        TEST_RIGHT = 2  /* the tested entry sorts after the needle */
};
1507
1508 static int generic_array_bisect(
1509                 JournalFile *f,
1510                 uint64_t first,
1511                 uint64_t n,
1512                 uint64_t needle,
1513                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1514                 direction_t direction,
1515                 Object **ret,
1516                 uint64_t *offset,
1517                 uint64_t *idx) {
1518
1519         uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1520         bool subtract_one = false;
1521         Object *o, *array = NULL;
1522         int r;
1523         ChainCacheItem *ci;
1524
1525         assert(f);
1526         assert(test_object);
1527
1528         /* Start with the first array in the chain */
1529         a = first;
1530
1531         ci = ordered_hashmap_get(f->chain_cache, &first);
1532         if (ci && n > ci->total) {
1533                 /* Ah, we have iterated this bisection array chain
1534                  * previously! Let's see if we can skip ahead in the
1535                  * chain, as far as the last time. But we can't jump
1536                  * backwards in the chain, so let's check that
1537                  * first. */
1538
1539                 r = test_object(f, ci->begin, needle);
1540                 if (r < 0)
1541                         return r;
1542
1543                 if (r == TEST_LEFT) {
1544                         /* OK, what we are looking for is right of the
1545                          * begin of this EntryArray, so let's jump
1546                          * straight to previously cached array in the
1547                          * chain */
1548
1549                         a = ci->array;
1550                         n -= ci->total;
1551                         t = ci->total;
1552                         last_index = ci->last_index;
1553                 }
1554         }
1555
1556         while (a > 0) {
1557                 uint64_t left, right, k, lp;
1558
1559                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1560                 if (r < 0)
1561                         return r;
1562
1563                 k = journal_file_entry_array_n_items(array);
1564                 right = MIN(k, n);
1565                 if (right <= 0)
1566                         return 0;
1567
1568                 i = right - 1;
1569                 lp = p = le64toh(array->entry_array.items[i]);
1570                 if (p <= 0)
1571                         return -EBADMSG;
1572
1573                 r = test_object(f, p, needle);
1574                 if (r < 0)
1575                         return r;
1576
1577                 if (r == TEST_FOUND)
1578                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1579
1580                 if (r == TEST_RIGHT) {
1581                         left = 0;
1582                         right -= 1;
1583
1584                         if (last_index != (uint64_t) -1) {
1585                                 assert(last_index <= right);
1586
1587                                 /* If we cached the last index we
1588                                  * looked at, let's try to not to jump
1589                                  * too wildly around and see if we can
1590                                  * limit the range to look at early to
1591                                  * the immediate neighbors of the last
1592                                  * index we looked at. */
1593
1594                                 if (last_index > 0) {
1595                                         uint64_t x = last_index - 1;
1596
1597                                         p = le64toh(array->entry_array.items[x]);
1598                                         if (p <= 0)
1599                                                 return -EBADMSG;
1600
1601                                         r = test_object(f, p, needle);
1602                                         if (r < 0)
1603                                                 return r;
1604
1605                                         if (r == TEST_FOUND)
1606                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1607
1608                                         if (r == TEST_RIGHT)
1609                                                 right = x;
1610                                         else
1611                                                 left = x + 1;
1612                                 }
1613
1614                                 if (last_index < right) {
1615                                         uint64_t y = last_index + 1;
1616
1617                                         p = le64toh(array->entry_array.items[y]);
1618                                         if (p <= 0)
1619                                                 return -EBADMSG;
1620
1621                                         r = test_object(f, p, needle);
1622                                         if (r < 0)
1623                                                 return r;
1624
1625                                         if (r == TEST_FOUND)
1626                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1627
1628                                         if (r == TEST_RIGHT)
1629                                                 right = y;
1630                                         else
1631                                                 left = y + 1;
1632                                 }
1633                         }
1634
1635                         for (;;) {
1636                                 if (left == right) {
1637                                         if (direction == DIRECTION_UP)
1638                                                 subtract_one = true;
1639
1640                                         i = left;
1641                                         goto found;
1642                                 }
1643
1644                                 assert(left < right);
1645                                 i = (left + right) / 2;
1646
1647                                 p = le64toh(array->entry_array.items[i]);
1648                                 if (p <= 0)
1649                                         return -EBADMSG;
1650
1651                                 r = test_object(f, p, needle);
1652                                 if (r < 0)
1653                                         return r;
1654
1655                                 if (r == TEST_FOUND)
1656                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1657
1658                                 if (r == TEST_RIGHT)
1659                                         right = i;
1660                                 else
1661                                         left = i + 1;
1662                         }
1663                 }
1664
1665                 if (k >= n) {
1666                         if (direction == DIRECTION_UP) {
1667                                 i = n;
1668                                 subtract_one = true;
1669                                 goto found;
1670                         }
1671
1672                         return 0;
1673                 }
1674
1675                 last_p = lp;
1676
1677                 n -= k;
1678                 t += k;
1679                 last_index = (uint64_t) -1;
1680                 a = le64toh(array->entry_array.next_entry_array_offset);
1681         }
1682
1683         return 0;
1684
1685 found:
1686         if (subtract_one && t == 0 && i == 0)
1687                 return 0;
1688
1689         /* Let's cache this item for the next invocation */
1690         chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1691
1692         if (subtract_one && i == 0)
1693                 p = last_p;
1694         else if (subtract_one)
1695                 p = le64toh(array->entry_array.items[i-1]);
1696         else
1697                 p = le64toh(array->entry_array.items[i]);
1698
1699         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1700         if (r < 0)
1701                 return r;
1702
1703         if (ret)
1704                 *ret = o;
1705
1706         if (offset)
1707                 *offset = p;
1708
1709         if (idx)
1710                 *idx = t + i + (subtract_one ? -1 : 0);
1711
1712         return 1;
1713 }
1714
1715
1716 static int generic_array_bisect_plus_one(
1717                 JournalFile *f,
1718                 uint64_t extra,
1719                 uint64_t first,
1720                 uint64_t n,
1721                 uint64_t needle,
1722                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1723                 direction_t direction,
1724                 Object **ret,
1725                 uint64_t *offset,
1726                 uint64_t *idx) {
1727
1728         int r;
1729         bool step_back = false;
1730         Object *o;
1731
1732         assert(f);
1733         assert(test_object);
1734
1735         if (n <= 0)
1736                 return 0;
1737
1738         /* This bisects the array in object 'first', but first checks
1739          * an extra  */
1740         r = test_object(f, extra, needle);
1741         if (r < 0)
1742                 return r;
1743
1744         if (r == TEST_FOUND)
1745                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1746
1747         /* if we are looking with DIRECTION_UP then we need to first
1748            see if in the actual array there is a matching entry, and
1749            return the last one of that. But if there isn't any we need
1750            to return this one. Hence remember this, and return it
1751            below. */
1752         if (r == TEST_LEFT)
1753                 step_back = direction == DIRECTION_UP;
1754
1755         if (r == TEST_RIGHT) {
1756                 if (direction == DIRECTION_DOWN)
1757                         goto found;
1758                 else
1759                         return 0;
1760         }
1761
1762         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1763
1764         if (r == 0 && step_back)
1765                 goto found;
1766
1767         if (r > 0 && idx)
1768                 (*idx) ++;
1769
1770         return r;
1771
1772 found:
1773         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1774         if (r < 0)
1775                 return r;
1776
1777         if (ret)
1778                 *ret = o;
1779
1780         if (offset)
1781                 *offset = extra;
1782
1783         if (idx)
1784                 *idx = 0;
1785
1786         return 1;
1787 }
1788
1789 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1790         assert(f);
1791         assert(p > 0);
1792
1793         if (p == needle)
1794                 return TEST_FOUND;
1795         else if (p < needle)
1796                 return TEST_LEFT;
1797         else
1798                 return TEST_RIGHT;
1799 }
1800
1801 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1802         Object *o;
1803         int r;
1804
1805         assert(f);
1806         assert(p > 0);
1807
1808         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1809         if (r < 0)
1810                 return r;
1811
1812         if (le64toh(o->entry.seqnum) == needle)
1813                 return TEST_FOUND;
1814         else if (le64toh(o->entry.seqnum) < needle)
1815                 return TEST_LEFT;
1816         else
1817                 return TEST_RIGHT;
1818 }
1819
1820 int journal_file_move_to_entry_by_seqnum(
1821                 JournalFile *f,
1822                 uint64_t seqnum,
1823                 direction_t direction,
1824                 Object **ret,
1825                 uint64_t *offset) {
1826
1827         return generic_array_bisect(f,
1828                                     le64toh(f->header->entry_array_offset),
1829                                     le64toh(f->header->n_entries),
1830                                     seqnum,
1831                                     test_object_seqnum,
1832                                     direction,
1833                                     ret, offset, NULL);
1834 }
1835
1836 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1837         Object *o;
1838         int r;
1839
1840         assert(f);
1841         assert(p > 0);
1842
1843         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1844         if (r < 0)
1845                 return r;
1846
1847         if (le64toh(o->entry.realtime) == needle)
1848                 return TEST_FOUND;
1849         else if (le64toh(o->entry.realtime) < needle)
1850                 return TEST_LEFT;
1851         else
1852                 return TEST_RIGHT;
1853 }
1854
1855 int journal_file_move_to_entry_by_realtime(
1856                 JournalFile *f,
1857                 uint64_t realtime,
1858                 direction_t direction,
1859                 Object **ret,
1860                 uint64_t *offset) {
1861
1862         return generic_array_bisect(f,
1863                                     le64toh(f->header->entry_array_offset),
1864                                     le64toh(f->header->n_entries),
1865                                     realtime,
1866                                     test_object_realtime,
1867                                     direction,
1868                                     ret, offset, NULL);
1869 }
1870
1871 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1872         Object *o;
1873         int r;
1874
1875         assert(f);
1876         assert(p > 0);
1877
1878         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1879         if (r < 0)
1880                 return r;
1881
1882         if (le64toh(o->entry.monotonic) == needle)
1883                 return TEST_FOUND;
1884         else if (le64toh(o->entry.monotonic) < needle)
1885                 return TEST_LEFT;
1886         else
1887                 return TEST_RIGHT;
1888 }
1889
1890 static inline int find_data_object_by_boot_id(
1891                 JournalFile *f,
1892                 sd_id128_t boot_id,
1893                 Object **o,
1894                 uint64_t *b) {
1895         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1896
1897         sd_id128_to_string(boot_id, t + 9);
1898         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1899 }
1900
1901 int journal_file_move_to_entry_by_monotonic(
1902                 JournalFile *f,
1903                 sd_id128_t boot_id,
1904                 uint64_t monotonic,
1905                 direction_t direction,
1906                 Object **ret,
1907                 uint64_t *offset) {
1908
1909         Object *o;
1910         int r;
1911
1912         assert(f);
1913
1914         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1915         if (r < 0)
1916                 return r;
1917         if (r == 0)
1918                 return -ENOENT;
1919
1920         return generic_array_bisect_plus_one(f,
1921                                              le64toh(o->data.entry_offset),
1922                                              le64toh(o->data.entry_array_offset),
1923                                              le64toh(o->data.n_entries),
1924                                              monotonic,
1925                                              test_object_monotonic,
1926                                              direction,
1927                                              ret, offset, NULL);
1928 }
1929
1930 void journal_file_reset_location(JournalFile *f) {
1931         f->location_type = LOCATION_HEAD;
1932         f->current_offset = 0;
1933         f->current_seqnum = 0;
1934         f->current_realtime = 0;
1935         f->current_monotonic = 0;
1936         zero(f->current_boot_id);
1937         f->current_xor_hash = 0;
1938 }
1939
1940 void journal_file_save_location(JournalFile *f, direction_t direction, Object *o, uint64_t offset) {
1941         f->last_direction = direction;
1942         f->location_type = LOCATION_SEEK;
1943         f->current_offset = offset;
1944         f->current_seqnum = le64toh(o->entry.seqnum);
1945         f->current_realtime = le64toh(o->entry.realtime);
1946         f->current_monotonic = le64toh(o->entry.monotonic);
1947         f->current_boot_id = o->entry.boot_id;
1948         f->current_xor_hash = le64toh(o->entry.xor_hash);
1949 }
1950
1951 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
1952         assert(af);
1953         assert(bf);
1954         assert(af->location_type == LOCATION_SEEK);
1955         assert(bf->location_type == LOCATION_SEEK);
1956
1957         /* If contents and timestamps match, these entries are
1958          * identical, even if the seqnum does not match */
1959         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
1960             af->current_monotonic == bf->current_monotonic &&
1961             af->current_realtime == bf->current_realtime &&
1962             af->current_xor_hash == bf->current_xor_hash)
1963                 return 0;
1964
1965         if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
1966
1967                 /* If this is from the same seqnum source, compare
1968                  * seqnums */
1969                 if (af->current_seqnum < bf->current_seqnum)
1970                         return -1;
1971                 if (af->current_seqnum > bf->current_seqnum)
1972                         return 1;
1973
1974                 /* Wow! This is weird, different data but the same
1975                  * seqnums? Something is borked, but let's make the
1976                  * best of it and compare by time. */
1977         }
1978
1979         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
1980
1981                 /* If the boot id matches, compare monotonic time */
1982                 if (af->current_monotonic < bf->current_monotonic)
1983                         return -1;
1984                 if (af->current_monotonic > bf->current_monotonic)
1985                         return 1;
1986         }
1987
1988         /* Otherwise, compare UTC time */
1989         if (af->current_realtime < bf->current_realtime)
1990                 return -1;
1991         if (af->current_realtime > bf->current_realtime)
1992                 return 1;
1993
1994         /* Finally, compare by contents */
1995         if (af->current_xor_hash < bf->current_xor_hash)
1996                 return -1;
1997         if (af->current_xor_hash > bf->current_xor_hash)
1998                 return 1;
1999
2000         return 0;
2001 }
2002
2003 int journal_file_next_entry(
2004                 JournalFile *f,
2005                 uint64_t p,
2006                 direction_t direction,
2007                 Object **ret, uint64_t *offset) {
2008
2009         uint64_t i, n, ofs;
2010         int r;
2011
2012         assert(f);
2013
2014         n = le64toh(f->header->n_entries);
2015         if (n <= 0)
2016                 return 0;
2017
2018         if (p == 0)
2019                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2020         else {
2021                 r = generic_array_bisect(f,
2022                                          le64toh(f->header->entry_array_offset),
2023                                          le64toh(f->header->n_entries),
2024                                          p,
2025                                          test_object_offset,
2026                                          DIRECTION_DOWN,
2027                                          NULL, NULL,
2028                                          &i);
2029                 if (r <= 0)
2030                         return r;
2031
2032                 if (direction == DIRECTION_DOWN) {
2033                         if (i >= n - 1)
2034                                 return 0;
2035
2036                         i++;
2037                 } else {
2038                         if (i <= 0)
2039                                 return 0;
2040
2041                         i--;
2042                 }
2043         }
2044
2045         /* And jump to it */
2046         r = generic_array_get(f,
2047                               le64toh(f->header->entry_array_offset),
2048                               i,
2049                               ret, &ofs);
2050         if (r <= 0)
2051                 return r;
2052
2053         if (p > 0 &&
2054             (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2055                 log_debug("%s: entry array corrupted at entry %"PRIu64,
2056                           f->path, i);
2057                 return -EBADMSG;
2058         }
2059
2060         if (offset)
2061                 *offset = ofs;
2062
2063         return 1;
2064 }
2065
2066 int journal_file_next_entry_for_data(
2067                 JournalFile *f,
2068                 Object *o, uint64_t p,
2069                 uint64_t data_offset,
2070                 direction_t direction,
2071                 Object **ret, uint64_t *offset) {
2072
2073         uint64_t n, i;
2074         int r;
2075         Object *d;
2076
2077         assert(f);
2078         assert(p > 0 || !o);
2079
2080         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2081         if (r < 0)
2082                 return r;
2083
2084         n = le64toh(d->data.n_entries);
2085         if (n <= 0)
2086                 return n;
2087
2088         if (!o)
2089                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2090         else {
2091                 if (o->object.type != OBJECT_ENTRY)
2092                         return -EINVAL;
2093
2094                 r = generic_array_bisect_plus_one(f,
2095                                                   le64toh(d->data.entry_offset),
2096                                                   le64toh(d->data.entry_array_offset),
2097                                                   le64toh(d->data.n_entries),
2098                                                   p,
2099                                                   test_object_offset,
2100                                                   DIRECTION_DOWN,
2101                                                   NULL, NULL,
2102                                                   &i);
2103
2104                 if (r <= 0)
2105                         return r;
2106
2107                 if (direction == DIRECTION_DOWN) {
2108                         if (i >= n - 1)
2109                                 return 0;
2110
2111                         i++;
2112                 } else {
2113                         if (i <= 0)
2114                                 return 0;
2115
2116                         i--;
2117                 }
2118
2119         }
2120
2121         return generic_array_get_plus_one(f,
2122                                           le64toh(d->data.entry_offset),
2123                                           le64toh(d->data.entry_array_offset),
2124                                           i,
2125                                           ret, offset);
2126 }
2127
2128 int journal_file_move_to_entry_by_offset_for_data(
2129                 JournalFile *f,
2130                 uint64_t data_offset,
2131                 uint64_t p,
2132                 direction_t direction,
2133                 Object **ret, uint64_t *offset) {
2134
2135         int r;
2136         Object *d;
2137
2138         assert(f);
2139
2140         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2141         if (r < 0)
2142                 return r;
2143
2144         return generic_array_bisect_plus_one(f,
2145                                              le64toh(d->data.entry_offset),
2146                                              le64toh(d->data.entry_array_offset),
2147                                              le64toh(d->data.n_entries),
2148                                              p,
2149                                              test_object_offset,
2150                                              direction,
2151                                              ret, offset, NULL);
2152 }
2153
2154 int journal_file_move_to_entry_by_monotonic_for_data(
2155                 JournalFile *f,
2156                 uint64_t data_offset,
2157                 sd_id128_t boot_id,
2158                 uint64_t monotonic,
2159                 direction_t direction,
2160                 Object **ret, uint64_t *offset) {
2161
2162         Object *o, *d;
2163         int r;
2164         uint64_t b, z;
2165
2166         assert(f);
2167
2168         /* First, seek by time */
2169         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2170         if (r < 0)
2171                 return r;
2172         if (r == 0)
2173                 return -ENOENT;
2174
2175         r = generic_array_bisect_plus_one(f,
2176                                           le64toh(o->data.entry_offset),
2177                                           le64toh(o->data.entry_array_offset),
2178                                           le64toh(o->data.n_entries),
2179                                           monotonic,
2180                                           test_object_monotonic,
2181                                           direction,
2182                                           NULL, &z, NULL);
2183         if (r <= 0)
2184                 return r;
2185
2186         /* And now, continue seeking until we find an entry that
2187          * exists in both bisection arrays */
2188
2189         for (;;) {
2190                 Object *qo;
2191                 uint64_t p, q;
2192
2193                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2194                 if (r < 0)
2195                         return r;
2196
2197                 r = generic_array_bisect_plus_one(f,
2198                                                   le64toh(d->data.entry_offset),
2199                                                   le64toh(d->data.entry_array_offset),
2200                                                   le64toh(d->data.n_entries),
2201                                                   z,
2202                                                   test_object_offset,
2203                                                   direction,
2204                                                   NULL, &p, NULL);
2205                 if (r <= 0)
2206                         return r;
2207
2208                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2209                 if (r < 0)
2210                         return r;
2211
2212                 r = generic_array_bisect_plus_one(f,
2213                                                   le64toh(o->data.entry_offset),
2214                                                   le64toh(o->data.entry_array_offset),
2215                                                   le64toh(o->data.n_entries),
2216                                                   p,
2217                                                   test_object_offset,
2218                                                   direction,
2219                                                   &qo, &q, NULL);
2220
2221                 if (r <= 0)
2222                         return r;
2223
2224                 if (p == q) {
2225                         if (ret)
2226                                 *ret = qo;
2227                         if (offset)
2228                                 *offset = q;
2229
2230                         return 1;
2231                 }
2232
2233                 z = q;
2234         }
2235 }
2236
2237 int journal_file_move_to_entry_by_seqnum_for_data(
2238                 JournalFile *f,
2239                 uint64_t data_offset,
2240                 uint64_t seqnum,
2241                 direction_t direction,
2242                 Object **ret, uint64_t *offset) {
2243
2244         Object *d;
2245         int r;
2246
2247         assert(f);
2248
2249         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2250         if (r < 0)
2251                 return r;
2252
2253         return generic_array_bisect_plus_one(f,
2254                                              le64toh(d->data.entry_offset),
2255                                              le64toh(d->data.entry_array_offset),
2256                                              le64toh(d->data.n_entries),
2257                                              seqnum,
2258                                              test_object_seqnum,
2259                                              direction,
2260                                              ret, offset, NULL);
2261 }
2262
2263 int journal_file_move_to_entry_by_realtime_for_data(
2264                 JournalFile *f,
2265                 uint64_t data_offset,
2266                 uint64_t realtime,
2267                 direction_t direction,
2268                 Object **ret, uint64_t *offset) {
2269
2270         Object *d;
2271         int r;
2272
2273         assert(f);
2274
2275         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2276         if (r < 0)
2277                 return r;
2278
2279         return generic_array_bisect_plus_one(f,
2280                                              le64toh(d->data.entry_offset),
2281                                              le64toh(d->data.entry_array_offset),
2282                                              le64toh(d->data.n_entries),
2283                                              realtime,
2284                                              test_object_realtime,
2285                                              direction,
2286                                              ret, offset, NULL);
2287 }
2288
2289 void journal_file_dump(JournalFile *f) {
2290         Object *o;
2291         int r;
2292         uint64_t p;
2293
2294         assert(f);
2295
2296         journal_file_print_header(f);
2297
2298         p = le64toh(f->header->header_size);
2299         while (p != 0) {
2300                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2301                 if (r < 0)
2302                         goto fail;
2303
2304                 switch (o->object.type) {
2305
2306                 case OBJECT_UNUSED:
2307                         printf("Type: OBJECT_UNUSED\n");
2308                         break;
2309
2310                 case OBJECT_DATA:
2311                         printf("Type: OBJECT_DATA\n");
2312                         break;
2313
2314                 case OBJECT_FIELD:
2315                         printf("Type: OBJECT_FIELD\n");
2316                         break;
2317
2318                 case OBJECT_ENTRY:
2319                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2320                                le64toh(o->entry.seqnum),
2321                                le64toh(o->entry.monotonic),
2322                                le64toh(o->entry.realtime));
2323                         break;
2324
2325                 case OBJECT_FIELD_HASH_TABLE:
2326                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2327                         break;
2328
2329                 case OBJECT_DATA_HASH_TABLE:
2330                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2331                         break;
2332
2333                 case OBJECT_ENTRY_ARRAY:
2334                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2335                         break;
2336
2337                 case OBJECT_TAG:
2338                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2339                                le64toh(o->tag.seqnum),
2340                                le64toh(o->tag.epoch));
2341                         break;
2342
2343                 default:
2344                         printf("Type: unknown (%u)\n", o->object.type);
2345                         break;
2346                 }
2347
2348                 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2349                         printf("Flags: %s\n",
2350                                object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2351
2352                 if (p == le64toh(f->header->tail_object_offset))
2353                         p = 0;
2354                 else
2355                         p = p + ALIGN64(le64toh(o->object.size));
2356         }
2357
2358         return;
2359 fail:
2360         log_error("File corrupt");
2361 }
2362
2363 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2364         const char *x;
2365
2366         x = format_timestamp(buf, l, t);
2367         if (x)
2368                 return x;
2369         return " --- ";
2370 }
2371
2372 void journal_file_print_header(JournalFile *f) {
2373         char a[33], b[33], c[33], d[33];
2374         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2375         struct stat st;
2376         char bytes[FORMAT_BYTES_MAX];
2377
2378         assert(f);
2379
2380         printf("File Path: %s\n"
2381                "File ID: %s\n"
2382                "Machine ID: %s\n"
2383                "Boot ID: %s\n"
2384                "Sequential Number ID: %s\n"
2385                "State: %s\n"
2386                "Compatible Flags:%s%s\n"
2387                "Incompatible Flags:%s%s%s\n"
2388                "Header size: %"PRIu64"\n"
2389                "Arena size: %"PRIu64"\n"
2390                "Data Hash Table Size: %"PRIu64"\n"
2391                "Field Hash Table Size: %"PRIu64"\n"
2392                "Rotate Suggested: %s\n"
2393                "Head Sequential Number: %"PRIu64"\n"
2394                "Tail Sequential Number: %"PRIu64"\n"
2395                "Head Realtime Timestamp: %s\n"
2396                "Tail Realtime Timestamp: %s\n"
2397                "Tail Monotonic Timestamp: %s\n"
2398                "Objects: %"PRIu64"\n"
2399                "Entry Objects: %"PRIu64"\n",
2400                f->path,
2401                sd_id128_to_string(f->header->file_id, a),
2402                sd_id128_to_string(f->header->machine_id, b),
2403                sd_id128_to_string(f->header->boot_id, c),
2404                sd_id128_to_string(f->header->seqnum_id, d),
2405                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2406                f->header->state == STATE_ONLINE ? "ONLINE" :
2407                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2408                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2409                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2410                JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2411                JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2412                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2413                le64toh(f->header->header_size),
2414                le64toh(f->header->arena_size),
2415                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2416                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2417                yes_no(journal_file_rotate_suggested(f, 0)),
2418                le64toh(f->header->head_entry_seqnum),
2419                le64toh(f->header->tail_entry_seqnum),
2420                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2421                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2422                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2423                le64toh(f->header->n_objects),
2424                le64toh(f->header->n_entries));
2425
2426         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2427                 printf("Data Objects: %"PRIu64"\n"
2428                        "Data Hash Table Fill: %.1f%%\n",
2429                        le64toh(f->header->n_data),
2430                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2431
2432         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2433                 printf("Field Objects: %"PRIu64"\n"
2434                        "Field Hash Table Fill: %.1f%%\n",
2435                        le64toh(f->header->n_fields),
2436                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2437
2438         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2439                 printf("Tag Objects: %"PRIu64"\n",
2440                        le64toh(f->header->n_tags));
2441         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2442                 printf("Entry Array Objects: %"PRIu64"\n",
2443                        le64toh(f->header->n_entry_arrays));
2444
2445         if (fstat(f->fd, &st) >= 0)
2446                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2447 }
2448
2449 int journal_file_open(
2450                 const char *fname,
2451                 int flags,
2452                 mode_t mode,
2453                 bool compress,
2454                 bool seal,
2455                 JournalMetrics *metrics,
2456                 MMapCache *mmap_cache,
2457                 JournalFile *template,
2458                 JournalFile **ret) {
2459
2460         JournalFile *f;
2461         int r;
2462         bool newly_created = false;
2463
2464         assert(fname);
2465         assert(ret);
2466
2467         if ((flags & O_ACCMODE) != O_RDONLY &&
2468             (flags & O_ACCMODE) != O_RDWR)
2469                 return -EINVAL;
2470
2471         if (!endswith(fname, ".journal") &&
2472             !endswith(fname, ".journal~"))
2473                 return -EINVAL;
2474
2475         f = new0(JournalFile, 1);
2476         if (!f)
2477                 return -ENOMEM;
2478
2479         f->fd = -1;
2480         f->mode = mode;
2481
2482         f->flags = flags;
2483         f->prot = prot_from_flags(flags);
2484         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2485 #if defined(HAVE_LZ4)
2486         f->compress_lz4 = compress;
2487 #elif defined(HAVE_XZ)
2488         f->compress_xz = compress;
2489 #endif
2490 #ifdef HAVE_GCRYPT
2491         f->seal = seal;
2492 #endif
2493
2494         if (mmap_cache)
2495                 f->mmap = mmap_cache_ref(mmap_cache);
2496         else {
2497                 f->mmap = mmap_cache_new();
2498                 if (!f->mmap) {
2499                         r = -ENOMEM;
2500                         goto fail;
2501                 }
2502         }
2503
2504         f->path = strdup(fname);
2505         if (!f->path) {
2506                 r = -ENOMEM;
2507                 goto fail;
2508         }
2509
2510         f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2511         if (!f->chain_cache) {
2512                 r = -ENOMEM;
2513                 goto fail;
2514         }
2515
2516         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2517         if (f->fd < 0) {
2518                 r = -errno;
2519                 goto fail;
2520         }
2521
2522         if (fstat(f->fd, &f->last_stat) < 0) {
2523                 r = -errno;
2524                 goto fail;
2525         }
2526
2527         if (f->last_stat.st_size == 0 && f->writable) {
2528                 /* Let's attach the creation time to the journal file,
2529                  * so that the vacuuming code knows the age of this
2530                  * file even if the file might end up corrupted one
2531                  * day... Ideally we'd just use the creation time many
2532                  * file systems maintain for each file, but there is
2533                  * currently no usable API to query this, hence let's
2534                  * emulate this via extended attributes. If extended
2535                  * attributes are not supported we'll just skip this,
2536                  * and rely solely on mtime/atime/ctime of the file. */
2537
2538                 fd_setcrtime(f->fd, now(CLOCK_REALTIME));
2539
2540 #ifdef HAVE_GCRYPT
2541                 /* Try to load the FSPRG state, and if we can't, then
2542                  * just don't do sealing */
2543                 if (f->seal) {
2544                         r = journal_file_fss_load(f);
2545                         if (r < 0)
2546                                 f->seal = false;
2547                 }
2548 #endif
2549
2550                 r = journal_file_init_header(f, template);
2551                 if (r < 0)
2552                         goto fail;
2553
2554                 if (fstat(f->fd, &f->last_stat) < 0) {
2555                         r = -errno;
2556                         goto fail;
2557                 }
2558
2559                 newly_created = true;
2560         }
2561
2562         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2563                 r = -EIO;
2564                 goto fail;
2565         }
2566
2567         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2568         if (f->header == MAP_FAILED) {
2569                 f->header = NULL;
2570                 r = -errno;
2571                 goto fail;
2572         }
2573
2574         if (!newly_created) {
2575                 r = journal_file_verify_header(f);
2576                 if (r < 0)
2577                         goto fail;
2578         }
2579
2580 #ifdef HAVE_GCRYPT
2581         if (!newly_created && f->writable) {
2582                 r = journal_file_fss_load(f);
2583                 if (r < 0)
2584                         goto fail;
2585         }
2586 #endif
2587
2588         if (f->writable) {
2589                 if (metrics) {
2590                         journal_default_metrics(metrics, f->fd);
2591                         f->metrics = *metrics;
2592                 } else if (template)
2593                         f->metrics = template->metrics;
2594
2595                 r = journal_file_refresh_header(f);
2596                 if (r < 0)
2597                         goto fail;
2598         }
2599
2600 #ifdef HAVE_GCRYPT
2601         r = journal_file_hmac_setup(f);
2602         if (r < 0)
2603                 goto fail;
2604 #endif
2605
2606         if (newly_created) {
2607                 r = journal_file_setup_field_hash_table(f);
2608                 if (r < 0)
2609                         goto fail;
2610
2611                 r = journal_file_setup_data_hash_table(f);
2612                 if (r < 0)
2613                         goto fail;
2614
2615 #ifdef HAVE_GCRYPT
2616                 r = journal_file_append_first_tag(f);
2617                 if (r < 0)
2618                         goto fail;
2619 #endif
2620         }
2621
2622         r = journal_file_map_field_hash_table(f);
2623         if (r < 0)
2624                 goto fail;
2625
2626         r = journal_file_map_data_hash_table(f);
2627         if (r < 0)
2628                 goto fail;
2629
2630         *ret = f;
2631         return 0;
2632
2633 fail:
2634         journal_file_close(f);
2635
2636         return r;
2637 }
2638
2639 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2640         _cleanup_free_ char *p = NULL;
2641         size_t l;
2642         JournalFile *old_file, *new_file = NULL;
2643         int r;
2644
2645         assert(f);
2646         assert(*f);
2647
2648         old_file = *f;
2649
2650         if (!old_file->writable)
2651                 return -EINVAL;
2652
2653         if (!endswith(old_file->path, ".journal"))
2654                 return -EINVAL;
2655
2656         l = strlen(old_file->path);
2657         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2658                      (int) l - 8, old_file->path,
2659                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2660                      le64toh((*f)->header->head_entry_seqnum),
2661                      le64toh((*f)->header->head_entry_realtime));
2662         if (r < 0)
2663                 return -ENOMEM;
2664
2665         r = rename(old_file->path, p);
2666         if (r < 0)
2667                 return -errno;
2668
2669         old_file->header->state = STATE_ARCHIVED;
2670
2671         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2672         journal_file_close(old_file);
2673
2674         *f = new_file;
2675         return r;
2676 }
2677
2678 int journal_file_open_reliably(
2679                 const char *fname,
2680                 int flags,
2681                 mode_t mode,
2682                 bool compress,
2683                 bool seal,
2684                 JournalMetrics *metrics,
2685                 MMapCache *mmap_cache,
2686                 JournalFile *template,
2687                 JournalFile **ret) {
2688
2689         int r;
2690         size_t l;
2691         _cleanup_free_ char *p = NULL;
2692
2693         r = journal_file_open(fname, flags, mode, compress, seal,
2694                               metrics, mmap_cache, template, ret);
2695         if (r != -EBADMSG && /* corrupted */
2696             r != -ENODATA && /* truncated */
2697             r != -EHOSTDOWN && /* other machine */
2698             r != -EPROTONOSUPPORT && /* incompatible feature */
2699             r != -EBUSY && /* unclean shutdown */
2700             r != -ESHUTDOWN /* already archived */)
2701                 return r;
2702
2703         if ((flags & O_ACCMODE) == O_RDONLY)
2704                 return r;
2705
2706         if (!(flags & O_CREAT))
2707                 return r;
2708
2709         if (!endswith(fname, ".journal"))
2710                 return r;
2711
2712         /* The file is corrupted. Rotate it away and try it again (but only once) */
2713
2714         l = strlen(fname);
2715         if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2716                      (int) l - 8, fname,
2717                      (unsigned long long) now(CLOCK_REALTIME),
2718                      random_u64()) < 0)
2719                 return -ENOMEM;
2720
2721         r = rename(fname, p);
2722         if (r < 0)
2723                 return -errno;
2724
2725         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2726
2727         return journal_file_open(fname, flags, mode, compress, seal,
2728                                  metrics, mmap_cache, template, ret);
2729 }
2730
2731 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2732         uint64_t i, n;
2733         uint64_t q, xor_hash = 0;
2734         int r;
2735         EntryItem *items;
2736         dual_timestamp ts;
2737
2738         assert(from);
2739         assert(to);
2740         assert(o);
2741         assert(p);
2742
2743         if (!to->writable)
2744                 return -EPERM;
2745
2746         ts.monotonic = le64toh(o->entry.monotonic);
2747         ts.realtime = le64toh(o->entry.realtime);
2748
2749         n = journal_file_entry_n_items(o);
2750         /* alloca() can't take 0, hence let's allocate at least one */
2751         items = alloca(sizeof(EntryItem) * MAX(1u, n));
2752
2753         for (i = 0; i < n; i++) {
2754                 uint64_t l, h;
2755                 le64_t le_hash;
2756                 size_t t;
2757                 void *data;
2758                 Object *u;
2759
2760                 q = le64toh(o->entry.items[i].object_offset);
2761                 le_hash = o->entry.items[i].hash;
2762
2763                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2764                 if (r < 0)
2765                         return r;
2766
2767                 if (le_hash != o->data.hash)
2768                         return -EBADMSG;
2769
2770                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2771                 t = (size_t) l;
2772
2773                 /* We hit the limit on 32bit machines */
2774                 if ((uint64_t) t != l)
2775                         return -E2BIG;
2776
2777                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2778 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2779                         size_t rsize;
2780
2781                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2782                                             o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2783                         if (r < 0)
2784                                 return r;
2785
2786                         data = from->compress_buffer;
2787                         l = rsize;
2788 #else
2789                         return -EPROTONOSUPPORT;
2790 #endif
2791                 } else
2792                         data = o->data.payload;
2793
2794                 r = journal_file_append_data(to, data, l, &u, &h);
2795                 if (r < 0)
2796                         return r;
2797
2798                 xor_hash ^= le64toh(u->data.hash);
2799                 items[i].object_offset = htole64(h);
2800                 items[i].hash = u->data.hash;
2801
2802                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2803                 if (r < 0)
2804                         return r;
2805         }
2806
2807         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2808 }
2809
2810 void journal_default_metrics(JournalMetrics *m, int fd) {
2811         uint64_t fs_size = 0;
2812         struct statvfs ss;
2813         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2814
2815         assert(m);
2816         assert(fd >= 0);
2817
2818         if (fstatvfs(fd, &ss) >= 0)
2819                 fs_size = ss.f_frsize * ss.f_blocks;
2820
2821         if (m->max_use == (uint64_t) -1) {
2822
2823                 if (fs_size > 0) {
2824                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2825
2826                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2827                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2828
2829                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2830                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2831                 } else
2832                         m->max_use = DEFAULT_MAX_USE_LOWER;
2833         } else {
2834                 m->max_use = PAGE_ALIGN(m->max_use);
2835
2836                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2837                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2838         }
2839
2840         if (m->max_size == (uint64_t) -1) {
2841                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2842
2843                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2844                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2845         } else
2846                 m->max_size = PAGE_ALIGN(m->max_size);
2847
2848         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2849                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2850
2851         if (m->max_size*2 > m->max_use)
2852                 m->max_use = m->max_size*2;
2853
2854         if (m->min_size == (uint64_t) -1)
2855                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2856         else {
2857                 m->min_size = PAGE_ALIGN(m->min_size);
2858
2859                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2860                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2861
2862                 if (m->min_size > m->max_size)
2863                         m->max_size = m->min_size;
2864         }
2865
2866         if (m->keep_free == (uint64_t) -1) {
2867
2868                 if (fs_size > 0) {
2869                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2870
2871                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2872                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2873
2874                 } else
2875                         m->keep_free = DEFAULT_KEEP_FREE;
2876         }
2877
2878         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2879                   format_bytes(a, sizeof(a), m->max_use),
2880                   format_bytes(b, sizeof(b), m->max_size),
2881                   format_bytes(c, sizeof(c), m->min_size),
2882                   format_bytes(d, sizeof(d), m->keep_free));
2883 }
2884
2885 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2886         assert(f);
2887         assert(from || to);
2888
2889         if (from) {
2890                 if (f->header->head_entry_realtime == 0)
2891                         return -ENOENT;
2892
2893                 *from = le64toh(f->header->head_entry_realtime);
2894         }
2895
2896         if (to) {
2897                 if (f->header->tail_entry_realtime == 0)
2898                         return -ENOENT;
2899
2900                 *to = le64toh(f->header->tail_entry_realtime);
2901         }
2902
2903         return 1;
2904 }
2905
2906 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2907         Object *o;
2908         uint64_t p;
2909         int r;
2910
2911         assert(f);
2912         assert(from || to);
2913
2914         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2915         if (r <= 0)
2916                 return r;
2917
2918         if (le64toh(o->data.n_entries) <= 0)
2919                 return 0;
2920
2921         if (from) {
2922                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2923                 if (r < 0)
2924                         return r;
2925
2926                 *from = le64toh(o->entry.monotonic);
2927         }
2928
2929         if (to) {
2930                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2931                 if (r < 0)
2932                         return r;
2933
2934                 r = generic_array_get_plus_one(f,
2935                                                le64toh(o->data.entry_offset),
2936                                                le64toh(o->data.entry_array_offset),
2937                                                le64toh(o->data.n_entries)-1,
2938                                                &o, NULL);
2939                 if (r <= 0)
2940                         return r;
2941
2942                 *to = le64toh(o->entry.monotonic);
2943         }
2944
2945         return 1;
2946 }
2947
2948 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2949         assert(f);
2950
2951         /* If we gained new header fields we gained new features,
2952          * hence suggest a rotation */
2953         if (le64toh(f->header->header_size) < sizeof(Header)) {
2954                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2955                 return true;
2956         }
2957
2958         /* Let's check if the hash tables grew over a certain fill
2959          * level (75%, borrowing this value from Java's hash table
2960          * implementation), and if so suggest a rotation. To calculate
2961          * the fill level we need the n_data field, which only exists
2962          * in newer versions. */
2963
2964         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2965                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2966                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2967                                   f->path,
2968                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2969                                   le64toh(f->header->n_data),
2970                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2971                                   (unsigned long long) f->last_stat.st_size,
2972                                   f->last_stat.st_size / le64toh(f->header->n_data));
2973                         return true;
2974                 }
2975
2976         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2977                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2978                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2979                                   f->path,
2980                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2981                                   le64toh(f->header->n_fields),
2982                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2983                         return true;
2984                 }
2985
2986         /* Are the data objects properly indexed by field objects? */
2987         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2988             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2989             le64toh(f->header->n_data) > 0 &&
2990             le64toh(f->header->n_fields) == 0)
2991                 return true;
2992
2993         if (max_file_usec > 0) {
2994                 usec_t t, h;
2995
2996                 h = le64toh(f->header->head_entry_realtime);
2997                 t = now(CLOCK_REALTIME);
2998
2999                 if (h > 0 && t > h + max_file_usec)
3000                         return true;
3001         }
3002
3003         return false;
3004 }