chiark / gitweb /
44a96928e027dade0b915054fecaf8c0c1d63b1b
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "journal-authenticate.h"
33 #include "lookup3.h"
34 #include "compress.h"
35 #include "fsprg.h"
36
37 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
38 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
39
40 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
41
42 /* This is the minimum journal file size */
43 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL)           /* 4 MiB */
44
45 /* These are the lower and upper bounds if we deduce the max_use value
46  * from the file system size */
47 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
48 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
49
50 /* This is the upper bound if we deduce max_size from max_use */
51 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
52
53 /* This is the upper bound if we deduce the keep_free value from the
54  * file system size */
55 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
56
57 /* This is the keep_free value when we can't determine the system
58  * size */
59 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MB */
60
61 /* n_data was the first entry we added after the initial file format design */
62 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
63
64 /* How many entries to keep in the entry array chain cache at max */
65 #define CHAIN_CACHE_MAX 20
66
67 /* How much to increase the journal file size at once each time we allocate something new. */
68 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8MB */
69
70 /* The mmap context to use for the header; we pick the value one above the last defined object type */
71 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
72
73 static int journal_file_set_online(JournalFile *f) {
74         assert(f);
75
76         if (!f->writable)
77                 return -EPERM;
78
79         if (!(f->fd >= 0 && f->header))
80                 return -EINVAL;
81
82         if (mmap_cache_got_sigbus(f->mmap, f->fd))
83                 return -EIO;
84
85         switch(f->header->state) {
86                 case STATE_ONLINE:
87                         return 0;
88
89                 case STATE_OFFLINE:
90                         f->header->state = STATE_ONLINE;
91                         fsync(f->fd);
92                         return 0;
93
94                 default:
95                         return -EINVAL;
96         }
97 }
98
99 int journal_file_set_offline(JournalFile *f) {
100         assert(f);
101
102         if (!f->writable)
103                 return -EPERM;
104
105         if (!(f->fd >= 0 && f->header))
106                 return -EINVAL;
107
108         if (f->header->state != STATE_ONLINE)
109                 return 0;
110
111         fsync(f->fd);
112
113         if (mmap_cache_got_sigbus(f->mmap, f->fd))
114                 return -EIO;
115
116         f->header->state = STATE_OFFLINE;
117
118         if (mmap_cache_got_sigbus(f->mmap, f->fd))
119                 return -EIO;
120
121         fsync(f->fd);
122
123         return 0;
124 }
125
126 void journal_file_close(JournalFile *f) {
127         assert(f);
128
129 #ifdef HAVE_GCRYPT
130         /* Write the final tag */
131         if (f->seal && f->writable)
132                 journal_file_append_tag(f);
133 #endif
134
135         journal_file_set_offline(f);
136
137         if (f->mmap && f->fd >= 0)
138                 mmap_cache_close_fd(f->mmap, f->fd);
139
140         safe_close(f->fd);
141         free(f->path);
142
143         if (f->mmap)
144                 mmap_cache_unref(f->mmap);
145
146         ordered_hashmap_free_free(f->chain_cache);
147
148 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
149         free(f->compress_buffer);
150 #endif
151
152 #ifdef HAVE_GCRYPT
153         if (f->fss_file)
154                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
155         else if (f->fsprg_state)
156                 free(f->fsprg_state);
157
158         free(f->fsprg_seed);
159
160         if (f->hmac)
161                 gcry_md_close(f->hmac);
162 #endif
163
164         free(f);
165 }
166
167 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
168         Header h = {};
169         ssize_t k;
170         int r;
171
172         assert(f);
173
174         memcpy(h.signature, HEADER_SIGNATURE, 8);
175         h.header_size = htole64(ALIGN64(sizeof(h)));
176
177         h.incompatible_flags |= htole32(
178                 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
179                 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
180
181         h.compatible_flags = htole32(
182                 f->seal * HEADER_COMPATIBLE_SEALED);
183
184         r = sd_id128_randomize(&h.file_id);
185         if (r < 0)
186                 return r;
187
188         if (template) {
189                 h.seqnum_id = template->header->seqnum_id;
190                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
191         } else
192                 h.seqnum_id = h.file_id;
193
194         k = pwrite(f->fd, &h, sizeof(h), 0);
195         if (k < 0)
196                 return -errno;
197
198         if (k != sizeof(h))
199                 return -EIO;
200
201         return 0;
202 }
203
204 static int journal_file_refresh_header(JournalFile *f) {
205         sd_id128_t boot_id;
206         int r;
207
208         assert(f);
209
210         r = sd_id128_get_machine(&f->header->machine_id);
211         if (r < 0)
212                 return r;
213
214         r = sd_id128_get_boot(&boot_id);
215         if (r < 0)
216                 return r;
217
218         if (sd_id128_equal(boot_id, f->header->boot_id))
219                 f->tail_entry_monotonic_valid = true;
220
221         f->header->boot_id = boot_id;
222
223         r = journal_file_set_online(f);
224
225         /* Sync the online state to disk */
226         fsync(f->fd);
227
228         return r;
229 }
230
231 static int journal_file_verify_header(JournalFile *f) {
232         uint32_t flags;
233
234         assert(f);
235
236         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
237                 return -EBADMSG;
238
239         /* In both read and write mode we refuse to open files with
240          * incompatible flags we don't know */
241         flags = le32toh(f->header->incompatible_flags);
242         if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
243                 if (flags & ~HEADER_INCOMPATIBLE_ANY)
244                         log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
245                                   f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
246                 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
247                 if (flags)
248                         log_debug("Journal file %s uses incompatible flags %"PRIx32
249                                   " disabled at compilation time.", f->path, flags);
250                 return -EPROTONOSUPPORT;
251         }
252
253         /* When open for writing we refuse to open files with
254          * compatible flags, too */
255         flags = le32toh(f->header->compatible_flags);
256         if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
257                 if (flags & ~HEADER_COMPATIBLE_ANY)
258                         log_debug("Journal file %s has unknown compatible flags %"PRIx32,
259                                   f->path, flags & ~HEADER_COMPATIBLE_ANY);
260                 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
261                 if (flags)
262                         log_debug("Journal file %s uses compatible flags %"PRIx32
263                                   " disabled at compilation time.", f->path, flags);
264                 return -EPROTONOSUPPORT;
265         }
266
267         if (f->header->state >= _STATE_MAX)
268                 return -EBADMSG;
269
270         /* The first addition was n_data, so check that we are at least this large */
271         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
272                 return -EBADMSG;
273
274         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
275                 return -EBADMSG;
276
277         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
278                 return -ENODATA;
279
280         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
281                 return -ENODATA;
282
283         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
284             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
285             !VALID64(le64toh(f->header->tail_object_offset)) ||
286             !VALID64(le64toh(f->header->entry_array_offset)))
287                 return -ENODATA;
288
289         if (f->writable) {
290                 uint8_t state;
291                 sd_id128_t machine_id;
292                 int r;
293
294                 r = sd_id128_get_machine(&machine_id);
295                 if (r < 0)
296                         return r;
297
298                 if (!sd_id128_equal(machine_id, f->header->machine_id))
299                         return -EHOSTDOWN;
300
301                 state = f->header->state;
302
303                 if (state == STATE_ONLINE) {
304                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
305                         return -EBUSY;
306                 } else if (state == STATE_ARCHIVED)
307                         return -ESHUTDOWN;
308                 else if (state != STATE_OFFLINE) {
309                         log_debug("Journal file %s has unknown state %u.", f->path, state);
310                         return -EBUSY;
311                 }
312         }
313
314         f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
315         f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
316
317         f->seal = JOURNAL_HEADER_SEALED(f->header);
318
319         return 0;
320 }
321
322 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
323         uint64_t old_size, new_size;
324         int r;
325
326         assert(f);
327
328         /* We assume that this file is not sparse, and we know that
329          * for sure, since we always call posix_fallocate()
330          * ourselves */
331
332         if (mmap_cache_got_sigbus(f->mmap, f->fd))
333                 return -EIO;
334
335         old_size =
336                 le64toh(f->header->header_size) +
337                 le64toh(f->header->arena_size);
338
339         new_size = PAGE_ALIGN(offset + size);
340         if (new_size < le64toh(f->header->header_size))
341                 new_size = le64toh(f->header->header_size);
342
343         if (new_size <= old_size)
344                 return 0;
345
346         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
347                 return -E2BIG;
348
349         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
350                 struct statvfs svfs;
351
352                 if (fstatvfs(f->fd, &svfs) >= 0) {
353                         uint64_t available;
354
355                         available = svfs.f_bfree * svfs.f_bsize;
356
357                         if (available >= f->metrics.keep_free)
358                                 available -= f->metrics.keep_free;
359                         else
360                                 available = 0;
361
362                         if (new_size - old_size > available)
363                                 return -E2BIG;
364                 }
365         }
366
367         /* Increase by larger blocks at once */
368         new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
369         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
370                 new_size = f->metrics.max_size;
371
372         /* Note that the glibc fallocate() fallback is very
373            inefficient, hence we try to minimize the allocation area
374            as we can. */
375         r = posix_fallocate(f->fd, old_size, new_size - old_size);
376         if (r != 0)
377                 return -r;
378
379         if (fstat(f->fd, &f->last_stat) < 0)
380                 return -errno;
381
382         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
383
384         return 0;
385 }
386
387 static unsigned type_to_context(ObjectType type) {
388         /* One context for each type, plus one catch-all for the rest */
389         assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
390         assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
391         return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
392 }
393
394 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
395         assert(f);
396         assert(ret);
397
398         if (size <= 0)
399                 return -EINVAL;
400
401         /* Avoid SIGBUS on invalid accesses */
402         if (offset + size > (uint64_t) f->last_stat.st_size) {
403                 /* Hmm, out of range? Let's refresh the fstat() data
404                  * first, before we trust that check. */
405
406                 if (fstat(f->fd, &f->last_stat) < 0 ||
407                     offset + size > (uint64_t) f->last_stat.st_size)
408                         return -EADDRNOTAVAIL;
409         }
410
411         return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
412 }
413
414 static uint64_t minimum_header_size(Object *o) {
415
416         static const uint64_t table[] = {
417                 [OBJECT_DATA] = sizeof(DataObject),
418                 [OBJECT_FIELD] = sizeof(FieldObject),
419                 [OBJECT_ENTRY] = sizeof(EntryObject),
420                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
421                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
422                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
423                 [OBJECT_TAG] = sizeof(TagObject),
424         };
425
426         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
427                 return sizeof(ObjectHeader);
428
429         return table[o->object.type];
430 }
431
432 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
433         int r;
434         void *t;
435         Object *o;
436         uint64_t s;
437
438         assert(f);
439         assert(ret);
440
441         /* Objects may only be located at multiple of 64 bit */
442         if (!VALID64(offset))
443                 return -EFAULT;
444
445         r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
446         if (r < 0)
447                 return r;
448
449         o = (Object*) t;
450         s = le64toh(o->object.size);
451
452         if (s < sizeof(ObjectHeader))
453                 return -EBADMSG;
454
455         if (o->object.type <= OBJECT_UNUSED)
456                 return -EBADMSG;
457
458         if (s < minimum_header_size(o))
459                 return -EBADMSG;
460
461         if (type > OBJECT_UNUSED && o->object.type != type)
462                 return -EBADMSG;
463
464         if (s > sizeof(ObjectHeader)) {
465                 r = journal_file_move_to(f, type, false, offset, s, &t);
466                 if (r < 0)
467                         return r;
468
469                 o = (Object*) t;
470         }
471
472         *ret = o;
473         return 0;
474 }
475
476 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
477         uint64_t r;
478
479         assert(f);
480
481         r = le64toh(f->header->tail_entry_seqnum) + 1;
482
483         if (seqnum) {
484                 /* If an external seqnum counter was passed, we update
485                  * both the local and the external one, and set it to
486                  * the maximum of both */
487
488                 if (*seqnum + 1 > r)
489                         r = *seqnum + 1;
490
491                 *seqnum = r;
492         }
493
494         f->header->tail_entry_seqnum = htole64(r);
495
496         if (f->header->head_entry_seqnum == 0)
497                 f->header->head_entry_seqnum = htole64(r);
498
499         return r;
500 }
501
502 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
503         int r;
504         uint64_t p;
505         Object *tail, *o;
506         void *t;
507
508         assert(f);
509         assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
510         assert(size >= sizeof(ObjectHeader));
511         assert(offset);
512         assert(ret);
513
514         r = journal_file_set_online(f);
515         if (r < 0)
516                 return r;
517
518         p = le64toh(f->header->tail_object_offset);
519         if (p == 0)
520                 p = le64toh(f->header->header_size);
521         else {
522                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
523                 if (r < 0)
524                         return r;
525
526                 p += ALIGN64(le64toh(tail->object.size));
527         }
528
529         r = journal_file_allocate(f, p, size);
530         if (r < 0)
531                 return r;
532
533         r = journal_file_move_to(f, type, false, p, size, &t);
534         if (r < 0)
535                 return r;
536
537         o = (Object*) t;
538
539         zero(o->object);
540         o->object.type = type;
541         o->object.size = htole64(size);
542
543         f->header->tail_object_offset = htole64(p);
544         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
545
546         *ret = o;
547         *offset = p;
548
549         return 0;
550 }
551
552 static int journal_file_setup_data_hash_table(JournalFile *f) {
553         uint64_t s, p;
554         Object *o;
555         int r;
556
557         assert(f);
558
559         /* We estimate that we need 1 hash table entry per 768 of
560            journal file and we want to make sure we never get beyond
561            75% fill level. Calculate the hash table size for the
562            maximum file size based on these metrics. */
563
564         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
565         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
566                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
567
568         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
569
570         r = journal_file_append_object(f,
571                                        OBJECT_DATA_HASH_TABLE,
572                                        offsetof(Object, hash_table.items) + s,
573                                        &o, &p);
574         if (r < 0)
575                 return r;
576
577         memzero(o->hash_table.items, s);
578
579         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
580         f->header->data_hash_table_size = htole64(s);
581
582         return 0;
583 }
584
585 static int journal_file_setup_field_hash_table(JournalFile *f) {
586         uint64_t s, p;
587         Object *o;
588         int r;
589
590         assert(f);
591
592         /* We use a fixed size hash table for the fields as this
593          * number should grow very slowly only */
594
595         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
596         r = journal_file_append_object(f,
597                                        OBJECT_FIELD_HASH_TABLE,
598                                        offsetof(Object, hash_table.items) + s,
599                                        &o, &p);
600         if (r < 0)
601                 return r;
602
603         memzero(o->hash_table.items, s);
604
605         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
606         f->header->field_hash_table_size = htole64(s);
607
608         return 0;
609 }
610
611 static int journal_file_map_data_hash_table(JournalFile *f) {
612         uint64_t s, p;
613         void *t;
614         int r;
615
616         assert(f);
617
618         p = le64toh(f->header->data_hash_table_offset);
619         s = le64toh(f->header->data_hash_table_size);
620
621         r = journal_file_move_to(f,
622                                  OBJECT_DATA_HASH_TABLE,
623                                  true,
624                                  p, s,
625                                  &t);
626         if (r < 0)
627                 return r;
628
629         f->data_hash_table = t;
630         return 0;
631 }
632
633 static int journal_file_map_field_hash_table(JournalFile *f) {
634         uint64_t s, p;
635         void *t;
636         int r;
637
638         assert(f);
639
640         p = le64toh(f->header->field_hash_table_offset);
641         s = le64toh(f->header->field_hash_table_size);
642
643         r = journal_file_move_to(f,
644                                  OBJECT_FIELD_HASH_TABLE,
645                                  true,
646                                  p, s,
647                                  &t);
648         if (r < 0)
649                 return r;
650
651         f->field_hash_table = t;
652         return 0;
653 }
654
655 static int journal_file_link_field(
656                 JournalFile *f,
657                 Object *o,
658                 uint64_t offset,
659                 uint64_t hash) {
660
661         uint64_t p, h;
662         int r;
663
664         assert(f);
665         assert(o);
666         assert(offset > 0);
667
668         if (o->object.type != OBJECT_FIELD)
669                 return -EINVAL;
670
671         /* This might alter the window we are looking at */
672
673         o->field.next_hash_offset = o->field.head_data_offset = 0;
674
675         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
676         p = le64toh(f->field_hash_table[h].tail_hash_offset);
677         if (p == 0)
678                 f->field_hash_table[h].head_hash_offset = htole64(offset);
679         else {
680                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
681                 if (r < 0)
682                         return r;
683
684                 o->field.next_hash_offset = htole64(offset);
685         }
686
687         f->field_hash_table[h].tail_hash_offset = htole64(offset);
688
689         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
690                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
691
692         return 0;
693 }
694
695 static int journal_file_link_data(
696                 JournalFile *f,
697                 Object *o,
698                 uint64_t offset,
699                 uint64_t hash) {
700
701         uint64_t p, h;
702         int r;
703
704         assert(f);
705         assert(o);
706         assert(offset > 0);
707
708         if (o->object.type != OBJECT_DATA)
709                 return -EINVAL;
710
711         /* This might alter the window we are looking at */
712
713         o->data.next_hash_offset = o->data.next_field_offset = 0;
714         o->data.entry_offset = o->data.entry_array_offset = 0;
715         o->data.n_entries = 0;
716
717         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
718         p = le64toh(f->data_hash_table[h].tail_hash_offset);
719         if (p == 0)
720                 /* Only entry in the hash table is easy */
721                 f->data_hash_table[h].head_hash_offset = htole64(offset);
722         else {
723                 /* Move back to the previous data object, to patch in
724                  * pointer */
725
726                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
727                 if (r < 0)
728                         return r;
729
730                 o->data.next_hash_offset = htole64(offset);
731         }
732
733         f->data_hash_table[h].tail_hash_offset = htole64(offset);
734
735         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
736                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
737
738         return 0;
739 }
740
741 int journal_file_find_field_object_with_hash(
742                 JournalFile *f,
743                 const void *field, uint64_t size, uint64_t hash,
744                 Object **ret, uint64_t *offset) {
745
746         uint64_t p, osize, h;
747         int r;
748
749         assert(f);
750         assert(field && size > 0);
751
752         osize = offsetof(Object, field.payload) + size;
753
754         if (f->header->field_hash_table_size == 0)
755                 return -EBADMSG;
756
757         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
758         p = le64toh(f->field_hash_table[h].head_hash_offset);
759
760         while (p > 0) {
761                 Object *o;
762
763                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
764                 if (r < 0)
765                         return r;
766
767                 if (le64toh(o->field.hash) == hash &&
768                     le64toh(o->object.size) == osize &&
769                     memcmp(o->field.payload, field, size) == 0) {
770
771                         if (ret)
772                                 *ret = o;
773                         if (offset)
774                                 *offset = p;
775
776                         return 1;
777                 }
778
779                 p = le64toh(o->field.next_hash_offset);
780         }
781
782         return 0;
783 }
784
785 int journal_file_find_field_object(
786                 JournalFile *f,
787                 const void *field, uint64_t size,
788                 Object **ret, uint64_t *offset) {
789
790         uint64_t hash;
791
792         assert(f);
793         assert(field && size > 0);
794
795         hash = hash64(field, size);
796
797         return journal_file_find_field_object_with_hash(f,
798                                                         field, size, hash,
799                                                         ret, offset);
800 }
801
802 int journal_file_find_data_object_with_hash(
803                 JournalFile *f,
804                 const void *data, uint64_t size, uint64_t hash,
805                 Object **ret, uint64_t *offset) {
806
807         uint64_t p, osize, h;
808         int r;
809
810         assert(f);
811         assert(data || size == 0);
812
813         osize = offsetof(Object, data.payload) + size;
814
815         if (f->header->data_hash_table_size == 0)
816                 return -EBADMSG;
817
818         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
819         p = le64toh(f->data_hash_table[h].head_hash_offset);
820
821         while (p > 0) {
822                 Object *o;
823
824                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
825                 if (r < 0)
826                         return r;
827
828                 if (le64toh(o->data.hash) != hash)
829                         goto next;
830
831                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
832 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
833                         uint64_t l;
834                         size_t rsize;
835
836                         l = le64toh(o->object.size);
837                         if (l <= offsetof(Object, data.payload))
838                                 return -EBADMSG;
839
840                         l -= offsetof(Object, data.payload);
841
842                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
843                                             o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
844                         if (r < 0)
845                                 return r;
846
847                         if (rsize == size &&
848                             memcmp(f->compress_buffer, data, size) == 0) {
849
850                                 if (ret)
851                                         *ret = o;
852
853                                 if (offset)
854                                         *offset = p;
855
856                                 return 1;
857                         }
858 #else
859                         return -EPROTONOSUPPORT;
860 #endif
861                 } else if (le64toh(o->object.size) == osize &&
862                            memcmp(o->data.payload, data, size) == 0) {
863
864                         if (ret)
865                                 *ret = o;
866
867                         if (offset)
868                                 *offset = p;
869
870                         return 1;
871                 }
872
873         next:
874                 p = le64toh(o->data.next_hash_offset);
875         }
876
877         return 0;
878 }
879
880 int journal_file_find_data_object(
881                 JournalFile *f,
882                 const void *data, uint64_t size,
883                 Object **ret, uint64_t *offset) {
884
885         uint64_t hash;
886
887         assert(f);
888         assert(data || size == 0);
889
890         hash = hash64(data, size);
891
892         return journal_file_find_data_object_with_hash(f,
893                                                        data, size, hash,
894                                                        ret, offset);
895 }
896
897 static int journal_file_append_field(
898                 JournalFile *f,
899                 const void *field, uint64_t size,
900                 Object **ret, uint64_t *offset) {
901
902         uint64_t hash, p;
903         uint64_t osize;
904         Object *o;
905         int r;
906
907         assert(f);
908         assert(field && size > 0);
909
910         hash = hash64(field, size);
911
912         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
913         if (r < 0)
914                 return r;
915         else if (r > 0) {
916
917                 if (ret)
918                         *ret = o;
919
920                 if (offset)
921                         *offset = p;
922
923                 return 0;
924         }
925
926         osize = offsetof(Object, field.payload) + size;
927         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
928         if (r < 0)
929                 return r;
930
931         o->field.hash = htole64(hash);
932         memcpy(o->field.payload, field, size);
933
934         r = journal_file_link_field(f, o, p, hash);
935         if (r < 0)
936                 return r;
937
938         /* The linking might have altered the window, so let's
939          * refresh our pointer */
940         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
941         if (r < 0)
942                 return r;
943
944 #ifdef HAVE_GCRYPT
945         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
946         if (r < 0)
947                 return r;
948 #endif
949
950         if (ret)
951                 *ret = o;
952
953         if (offset)
954                 *offset = p;
955
956         return 0;
957 }
958
959 static int journal_file_append_data(
960                 JournalFile *f,
961                 const void *data, uint64_t size,
962                 Object **ret, uint64_t *offset) {
963
964         uint64_t hash, p;
965         uint64_t osize;
966         Object *o;
967         int r, compression = 0;
968         const void *eq;
969
970         assert(f);
971         assert(data || size == 0);
972
973         hash = hash64(data, size);
974
975         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
976         if (r < 0)
977                 return r;
978         else if (r > 0) {
979
980                 if (ret)
981                         *ret = o;
982
983                 if (offset)
984                         *offset = p;
985
986                 return 0;
987         }
988
989         osize = offsetof(Object, data.payload) + size;
990         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
991         if (r < 0)
992                 return r;
993
994         o->data.hash = htole64(hash);
995
996 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
997         if (f->compress_xz &&
998             size >= COMPRESSION_SIZE_THRESHOLD) {
999                 size_t rsize;
1000
1001                 compression = compress_blob(data, size, o->data.payload, &rsize);
1002
1003                 if (compression) {
1004                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1005                         o->object.flags |= compression;
1006
1007                         log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1008                                   size, rsize, object_compressed_to_string(compression));
1009                 }
1010         }
1011 #endif
1012
1013         if (!compression && size > 0)
1014                 memcpy(o->data.payload, data, size);
1015
1016         r = journal_file_link_data(f, o, p, hash);
1017         if (r < 0)
1018                 return r;
1019
1020         /* The linking might have altered the window, so let's
1021          * refresh our pointer */
1022         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1023         if (r < 0)
1024                 return r;
1025
1026         if (!data)
1027                 eq = NULL;
1028         else
1029                 eq = memchr(data, '=', size);
1030         if (eq && eq > data) {
1031                 Object *fo = NULL;
1032                 uint64_t fp;
1033
1034                 /* Create field object ... */
1035                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1036                 if (r < 0)
1037                         return r;
1038
1039                 /* ... and link it in. */
1040                 o->data.next_field_offset = fo->field.head_data_offset;
1041                 fo->field.head_data_offset = le64toh(p);
1042         }
1043
1044 #ifdef HAVE_GCRYPT
1045         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1046         if (r < 0)
1047                 return r;
1048 #endif
1049
1050         if (ret)
1051                 *ret = o;
1052
1053         if (offset)
1054                 *offset = p;
1055
1056         return 0;
1057 }
1058
1059 uint64_t journal_file_entry_n_items(Object *o) {
1060         assert(o);
1061
1062         if (o->object.type != OBJECT_ENTRY)
1063                 return 0;
1064
1065         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1066 }
1067
1068 uint64_t journal_file_entry_array_n_items(Object *o) {
1069         assert(o);
1070
1071         if (o->object.type != OBJECT_ENTRY_ARRAY)
1072                 return 0;
1073
1074         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1075 }
1076
1077 uint64_t journal_file_hash_table_n_items(Object *o) {
1078         assert(o);
1079
1080         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1081             o->object.type != OBJECT_FIELD_HASH_TABLE)
1082                 return 0;
1083
1084         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1085 }
1086
1087 static int link_entry_into_array(JournalFile *f,
1088                                  le64_t *first,
1089                                  le64_t *idx,
1090                                  uint64_t p) {
1091         int r;
1092         uint64_t n = 0, ap = 0, q, i, a, hidx;
1093         Object *o;
1094
1095         assert(f);
1096         assert(first);
1097         assert(idx);
1098         assert(p > 0);
1099
1100         a = le64toh(*first);
1101         i = hidx = le64toh(*idx);
1102         while (a > 0) {
1103
1104                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1105                 if (r < 0)
1106                         return r;
1107
1108                 n = journal_file_entry_array_n_items(o);
1109                 if (i < n) {
1110                         o->entry_array.items[i] = htole64(p);
1111                         *idx = htole64(hidx + 1);
1112                         return 0;
1113                 }
1114
1115                 i -= n;
1116                 ap = a;
1117                 a = le64toh(o->entry_array.next_entry_array_offset);
1118         }
1119
1120         if (hidx > n)
1121                 n = (hidx+1) * 2;
1122         else
1123                 n = n * 2;
1124
1125         if (n < 4)
1126                 n = 4;
1127
1128         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1129                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1130                                        &o, &q);
1131         if (r < 0)
1132                 return r;
1133
1134 #ifdef HAVE_GCRYPT
1135         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1136         if (r < 0)
1137                 return r;
1138 #endif
1139
1140         o->entry_array.items[i] = htole64(p);
1141
1142         if (ap == 0)
1143                 *first = htole64(q);
1144         else {
1145                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1146                 if (r < 0)
1147                         return r;
1148
1149                 o->entry_array.next_entry_array_offset = htole64(q);
1150         }
1151
1152         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1153                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1154
1155         *idx = htole64(hidx + 1);
1156
1157         return 0;
1158 }
1159
1160 static int link_entry_into_array_plus_one(JournalFile *f,
1161                                           le64_t *extra,
1162                                           le64_t *first,
1163                                           le64_t *idx,
1164                                           uint64_t p) {
1165
1166         int r;
1167
1168         assert(f);
1169         assert(extra);
1170         assert(first);
1171         assert(idx);
1172         assert(p > 0);
1173
1174         if (*idx == 0)
1175                 *extra = htole64(p);
1176         else {
1177                 le64_t i;
1178
1179                 i = htole64(le64toh(*idx) - 1);
1180                 r = link_entry_into_array(f, first, &i, p);
1181                 if (r < 0)
1182                         return r;
1183         }
1184
1185         *idx = htole64(le64toh(*idx) + 1);
1186         return 0;
1187 }
1188
1189 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1190         uint64_t p;
1191         int r;
1192         assert(f);
1193         assert(o);
1194         assert(offset > 0);
1195
1196         p = le64toh(o->entry.items[i].object_offset);
1197         if (p == 0)
1198                 return -EINVAL;
1199
1200         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1201         if (r < 0)
1202                 return r;
1203
1204         return link_entry_into_array_plus_one(f,
1205                                               &o->data.entry_offset,
1206                                               &o->data.entry_array_offset,
1207                                               &o->data.n_entries,
1208                                               offset);
1209 }
1210
1211 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1212         uint64_t n, i;
1213         int r;
1214
1215         assert(f);
1216         assert(o);
1217         assert(offset > 0);
1218
1219         if (o->object.type != OBJECT_ENTRY)
1220                 return -EINVAL;
1221
1222         __sync_synchronize();
1223
1224         /* Link up the entry itself */
1225         r = link_entry_into_array(f,
1226                                   &f->header->entry_array_offset,
1227                                   &f->header->n_entries,
1228                                   offset);
1229         if (r < 0)
1230                 return r;
1231
1232         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1233
1234         if (f->header->head_entry_realtime == 0)
1235                 f->header->head_entry_realtime = o->entry.realtime;
1236
1237         f->header->tail_entry_realtime = o->entry.realtime;
1238         f->header->tail_entry_monotonic = o->entry.monotonic;
1239
1240         f->tail_entry_monotonic_valid = true;
1241
1242         /* Link up the items */
1243         n = journal_file_entry_n_items(o);
1244         for (i = 0; i < n; i++) {
1245                 r = journal_file_link_entry_item(f, o, offset, i);
1246                 if (r < 0)
1247                         return r;
1248         }
1249
1250         return 0;
1251 }
1252
1253 static int journal_file_append_entry_internal(
1254                 JournalFile *f,
1255                 const dual_timestamp *ts,
1256                 uint64_t xor_hash,
1257                 const EntryItem items[], unsigned n_items,
1258                 uint64_t *seqnum,
1259                 Object **ret, uint64_t *offset) {
1260         uint64_t np;
1261         uint64_t osize;
1262         Object *o;
1263         int r;
1264
1265         assert(f);
1266         assert(items || n_items == 0);
1267         assert(ts);
1268
1269         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1270
1271         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1272         if (r < 0)
1273                 return r;
1274
1275         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1276         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1277         o->entry.realtime = htole64(ts->realtime);
1278         o->entry.monotonic = htole64(ts->monotonic);
1279         o->entry.xor_hash = htole64(xor_hash);
1280         o->entry.boot_id = f->header->boot_id;
1281
1282 #ifdef HAVE_GCRYPT
1283         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1284         if (r < 0)
1285                 return r;
1286 #endif
1287
1288         r = journal_file_link_entry(f, o, np);
1289         if (r < 0)
1290                 return r;
1291
1292         if (ret)
1293                 *ret = o;
1294
1295         if (offset)
1296                 *offset = np;
1297
1298         return 0;
1299 }
1300
1301 void journal_file_post_change(JournalFile *f) {
1302         assert(f);
1303
1304         /* inotify() does not receive IN_MODIFY events from file
1305          * accesses done via mmap(). After each access we hence
1306          * trigger IN_MODIFY by truncating the journal file to its
1307          * current size which triggers IN_MODIFY. */
1308
1309         __sync_synchronize();
1310
1311         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1312                 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1313 }
1314
1315 static int entry_item_cmp(const void *_a, const void *_b) {
1316         const EntryItem *a = _a, *b = _b;
1317
1318         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1319                 return -1;
1320         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1321                 return 1;
1322         return 0;
1323 }
1324
1325 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1326         unsigned i;
1327         EntryItem *items;
1328         int r;
1329         uint64_t xor_hash = 0;
1330         struct dual_timestamp _ts;
1331
1332         assert(f);
1333         assert(iovec || n_iovec == 0);
1334
1335         if (!ts) {
1336                 dual_timestamp_get(&_ts);
1337                 ts = &_ts;
1338         }
1339
1340         if (f->tail_entry_monotonic_valid &&
1341             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1342                 return -EINVAL;
1343
1344 #ifdef HAVE_GCRYPT
1345         r = journal_file_maybe_append_tag(f, ts->realtime);
1346         if (r < 0)
1347                 return r;
1348 #endif
1349
1350         /* alloca() can't take 0, hence let's allocate at least one */
1351         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1352
1353         for (i = 0; i < n_iovec; i++) {
1354                 uint64_t p;
1355                 Object *o;
1356
1357                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1358                 if (r < 0)
1359                         return r;
1360
1361                 xor_hash ^= le64toh(o->data.hash);
1362                 items[i].object_offset = htole64(p);
1363                 items[i].hash = o->data.hash;
1364         }
1365
1366         /* Order by the position on disk, in order to improve seek
1367          * times for rotating media. */
1368         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1369
1370         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1371
1372         /* If the memory mapping triggered a SIGBUS then we return an
1373          * IO error and ignore the error code passed down to us, since
1374          * it is very likely just an effect of a nullified replacement
1375          * mapping page */
1376
1377         if (mmap_cache_got_sigbus(f->mmap, f->fd))
1378                 r = -EIO;
1379
1380         journal_file_post_change(f);
1381
1382         return r;
1383 }
1384
/* Per-chain cache entry remembering how far into a chain of entry arrays a
 * previous lookup got, so that later lookups can skip ahead. */
typedef struct ChainCacheItem {
        /* the array at the beginning of the chain */
        uint64_t first;

        /* the cached array */
        uint64_t array;

        /* the first item in the cached array */
        uint64_t begin;

        /* the total number of items in all arrays before this one in the chain */
        uint64_t total;

        /* the last index we looked at, to optimize locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
1392
1393 static void chain_cache_put(
1394                 OrderedHashmap *h,
1395                 ChainCacheItem *ci,
1396                 uint64_t first,
1397                 uint64_t array,
1398                 uint64_t begin,
1399                 uint64_t total,
1400                 uint64_t last_index) {
1401
1402         if (!ci) {
1403                 /* If the chain item to cache for this chain is the
1404                  * first one it's not worth caching anything */
1405                 if (array == first)
1406                         return;
1407
1408                 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1409                         ci = ordered_hashmap_steal_first(h);
1410                         assert(ci);
1411                 } else {
1412                         ci = new(ChainCacheItem, 1);
1413                         if (!ci)
1414                                 return;
1415                 }
1416
1417                 ci->first = first;
1418
1419                 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1420                         free(ci);
1421                         return;
1422                 }
1423         } else
1424                 assert(ci->first == first);
1425
1426         ci->array = array;
1427         ci->begin = begin;
1428         ci->total = total;
1429         ci->last_index = last_index;
1430 }
1431
1432 static int generic_array_get(
1433                 JournalFile *f,
1434                 uint64_t first,
1435                 uint64_t i,
1436                 Object **ret, uint64_t *offset) {
1437
1438         Object *o;
1439         uint64_t p = 0, a, t = 0;
1440         int r;
1441         ChainCacheItem *ci;
1442
1443         assert(f);
1444
1445         a = first;
1446
1447         /* Try the chain cache first */
1448         ci = ordered_hashmap_get(f->chain_cache, &first);
1449         if (ci && i > ci->total) {
1450                 a = ci->array;
1451                 i -= ci->total;
1452                 t = ci->total;
1453         }
1454
1455         while (a > 0) {
1456                 uint64_t k;
1457
1458                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1459                 if (r < 0)
1460                         return r;
1461
1462                 k = journal_file_entry_array_n_items(o);
1463                 if (i < k) {
1464                         p = le64toh(o->entry_array.items[i]);
1465                         goto found;
1466                 }
1467
1468                 i -= k;
1469                 t += k;
1470                 a = le64toh(o->entry_array.next_entry_array_offset);
1471         }
1472
1473         return 0;
1474
1475 found:
1476         /* Let's cache this item for the next invocation */
1477         chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1478
1479         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1480         if (r < 0)
1481                 return r;
1482
1483         if (ret)
1484                 *ret = o;
1485
1486         if (offset)
1487                 *offset = p;
1488
1489         return 1;
1490 }
1491
1492 static int generic_array_get_plus_one(
1493                 JournalFile *f,
1494                 uint64_t extra,
1495                 uint64_t first,
1496                 uint64_t i,
1497                 Object **ret, uint64_t *offset) {
1498
1499         Object *o;
1500
1501         assert(f);
1502
1503         if (i == 0) {
1504                 int r;
1505
1506                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1507                 if (r < 0)
1508                         return r;
1509
1510                 if (ret)
1511                         *ret = o;
1512
1513                 if (offset)
1514                         *offset = extra;
1515
1516                 return 1;
1517         }
1518
1519         return generic_array_get(f, first, i-1, ret, offset);
1520 }
1521
/* Results of the test_object callbacks used when bisecting, describing
 * where the tested object is located relative to the needle. */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle */
        TEST_LEFT  = 1, /* the tested object is located before the needle */
        TEST_RIGHT = 2  /* the tested object is located after the needle */
};
1527
/* Bisects the chain of entry arrays starting at offset 'first', which holds
 * n items in total, for the entry matching 'needle'.
 *
 * test_object is called with the offset of a candidate entry and the needle,
 * and returns TEST_FOUND, TEST_LEFT or TEST_RIGHT, or a negative error. An
 * exact match is treated like TEST_RIGHT for DIRECTION_DOWN and like
 * TEST_LEFT otherwise. The search hence locates the first item that tests
 * as TEST_RIGHT under that rule; for DIRECTION_DOWN that item is the result,
 * for DIRECTION_UP the item immediately before it is.
 *
 * Returns 1 if an entry was found, in which case the entry object, its
 * offset and its index in the chain are stored in *ret, *offset and *idx
 * where those are non-NULL. Returns 0 if there is no suitable entry,
 * -EBADMSG if an array slot that is looked at contains offset 0, or the
 * negative return value of test_object or of a failing object lookup.
 *
 * NOTE(review): presumably the items are expected to be ordered according
 * to test_object, as bisection requires -- confirm against the callers. */
static int generic_array_bisect(
                JournalFile *f,
                uint64_t first,
                uint64_t n,
                uint64_t needle,
                int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
                direction_t direction,
                Object **ret,
                uint64_t *offset,
                uint64_t *idx) {

        /* a: offset of the array currently looked at, t: number of items in
         * the arrays of the chain before it, i: index within the current
         * array, last_p: last item of the previously visited array */
        uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
        bool subtract_one = false;
        Object *o, *array = NULL;
        int r;
        ChainCacheItem *ci;

        assert(f);
        assert(test_object);

        /* Start with the first array in the chain */
        a = first;

        ci = ordered_hashmap_get(f->chain_cache, &first);
        if (ci && n > ci->total) {
                /* Ah, we have iterated this bisection array chain
                 * previously! Let's see if we can skip ahead in the
                 * chain, as far as the last time. But we can't jump
                 * backwards in the chain, so let's check that
                 * first. */

                r = test_object(f, ci->begin, needle);
                if (r < 0)
                        return r;

                if (r == TEST_LEFT) {
                        /* OK, what we are looking for is right of the
                         * begin of this EntryArray, so let's jump
                         * straight to previously cached array in the
                         * chain */

                        a = ci->array;
                        n -= ci->total;
                        t = ci->total;
                        last_index = ci->last_index;
                }
        }

        while (a > 0) {
                uint64_t left, right, k, lp;

                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
                if (r < 0)
                        return r;

                /* Only the first n slots of the chain are in use, hence
                 * never look beyond them */
                k = journal_file_entry_array_n_items(array);
                right = MIN(k, n);
                if (right <= 0)
                        return 0;

                /* Test the last used item of this array first: only if it
                 * tests as TEST_RIGHT the result is located in this array */
                i = right - 1;
                lp = p = le64toh(array->entry_array.items[i]);
                if (p <= 0)
                        return -EBADMSG;

                r = test_object(f, p, needle);
                if (r < 0)
                        return r;

                if (r == TEST_FOUND)
                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                if (r == TEST_RIGHT) {
                        left = 0;
                        right -= 1;

                        if (last_index != (uint64_t) -1) {
                                assert(last_index <= right);

                                /* If we cached the last index we
                                 * looked at, let's try to not to jump
                                 * too wildly around and see if we can
                                 * limit the range to look at early to
                                 * the immediate neighbors of the last
                                 * index we looked at. */

                                if (last_index > 0) {
                                        uint64_t x = last_index - 1;

                                        p = le64toh(array->entry_array.items[x]);
                                        if (p <= 0)
                                                return -EBADMSG;

                                        r = test_object(f, p, needle);
                                        if (r < 0)
                                                return r;

                                        if (r == TEST_FOUND)
                                                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                        if (r == TEST_RIGHT)
                                                right = x;
                                        else
                                                left = x + 1;
                                }

                                if (last_index < right) {
                                        uint64_t y = last_index + 1;

                                        p = le64toh(array->entry_array.items[y]);
                                        if (p <= 0)
                                                return -EBADMSG;

                                        r = test_object(f, p, needle);
                                        if (r < 0)
                                                return r;

                                        if (r == TEST_FOUND)
                                                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                        if (r == TEST_RIGHT)
                                                right = y;
                                        else
                                                left = y + 1;
                                }
                        }

                        /* Plain bisection of [left, right]: narrow the range
                         * down until it consists of a single index */
                        for (;;) {
                                if (left == right) {
                                        if (direction == DIRECTION_UP)
                                                subtract_one = true;

                                        i = left;
                                        goto found;
                                }

                                assert(left < right);
                                i = (left + right) / 2;

                                p = le64toh(array->entry_array.items[i]);
                                if (p <= 0)
                                        return -EBADMSG;

                                r = test_object(f, p, needle);
                                if (r < 0)
                                        return r;

                                if (r == TEST_FOUND)
                                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                if (r == TEST_RIGHT)
                                        right = i;
                                else
                                        left = i + 1;
                        }
                }

                /* This array holds the last used items of the chain and none
                 * of them tested as TEST_RIGHT: going upwards the very last
                 * item is the result, going downwards there is none. */
                if (k >= n) {
                        if (direction == DIRECTION_UP) {
                                i = n;
                                subtract_one = true;
                                goto found;
                        }

                        return 0;
                }

                /* Remember the last item of this array, it is the result if
                 * we need to step back from index 0 of the next array */
                last_p = lp;

                n -= k;
                t += k;
                last_index = (uint64_t) -1;
                a = le64toh(array->entry_array.next_entry_array_offset);
        }

        return 0;

found:
        /* Stepping back from the very first item of the chain: there is
         * nothing before it */
        if (subtract_one && t == 0 && i == 0)
                return 0;

        /* Let's cache this item for the next invocation */
        chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);

        if (subtract_one && i == 0)
                p = last_p;
        else if (subtract_one)
                p = le64toh(array->entry_array.items[i-1]);
        else
                p = le64toh(array->entry_array.items[i]);

        r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
        if (r < 0)
                return r;

        if (ret)
                *ret = o;

        if (offset)
                *offset = p;

        if (idx)
                *idx = t + i + (subtract_one ? -1 : 0);

        return 1;
}
1734
1735 static int generic_array_bisect_plus_one(
1736                 JournalFile *f,
1737                 uint64_t extra,
1738                 uint64_t first,
1739                 uint64_t n,
1740                 uint64_t needle,
1741                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1742                 direction_t direction,
1743                 Object **ret,
1744                 uint64_t *offset,
1745                 uint64_t *idx) {
1746
1747         int r;
1748         bool step_back = false;
1749         Object *o;
1750
1751         assert(f);
1752         assert(test_object);
1753
1754         if (n <= 0)
1755                 return 0;
1756
1757         /* This bisects the array in object 'first', but first checks
1758          * an extra  */
1759         r = test_object(f, extra, needle);
1760         if (r < 0)
1761                 return r;
1762
1763         if (r == TEST_FOUND)
1764                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1765
1766         /* if we are looking with DIRECTION_UP then we need to first
1767            see if in the actual array there is a matching entry, and
1768            return the last one of that. But if there isn't any we need
1769            to return this one. Hence remember this, and return it
1770            below. */
1771         if (r == TEST_LEFT)
1772                 step_back = direction == DIRECTION_UP;
1773
1774         if (r == TEST_RIGHT) {
1775                 if (direction == DIRECTION_DOWN)
1776                         goto found;
1777                 else
1778                         return 0;
1779         }
1780
1781         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1782
1783         if (r == 0 && step_back)
1784                 goto found;
1785
1786         if (r > 0 && idx)
1787                 (*idx) ++;
1788
1789         return r;
1790
1791 found:
1792         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1793         if (r < 0)
1794                 return r;
1795
1796         if (ret)
1797                 *ret = o;
1798
1799         if (offset)
1800                 *offset = extra;
1801
1802         if (idx)
1803                 *idx = 0;
1804
1805         return 1;
1806 }
1807
1808 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1809         assert(f);
1810         assert(p > 0);
1811
1812         if (p == needle)
1813                 return TEST_FOUND;
1814         else if (p < needle)
1815                 return TEST_LEFT;
1816         else
1817                 return TEST_RIGHT;
1818 }
1819
1820 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1821         Object *o;
1822         int r;
1823
1824         assert(f);
1825         assert(p > 0);
1826
1827         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1828         if (r < 0)
1829                 return r;
1830
1831         if (le64toh(o->entry.seqnum) == needle)
1832                 return TEST_FOUND;
1833         else if (le64toh(o->entry.seqnum) < needle)
1834                 return TEST_LEFT;
1835         else
1836                 return TEST_RIGHT;
1837 }
1838
1839 int journal_file_move_to_entry_by_seqnum(
1840                 JournalFile *f,
1841                 uint64_t seqnum,
1842                 direction_t direction,
1843                 Object **ret,
1844                 uint64_t *offset) {
1845
1846         return generic_array_bisect(f,
1847                                     le64toh(f->header->entry_array_offset),
1848                                     le64toh(f->header->n_entries),
1849                                     seqnum,
1850                                     test_object_seqnum,
1851                                     direction,
1852                                     ret, offset, NULL);
1853 }
1854
1855 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1856         Object *o;
1857         int r;
1858
1859         assert(f);
1860         assert(p > 0);
1861
1862         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1863         if (r < 0)
1864                 return r;
1865
1866         if (le64toh(o->entry.realtime) == needle)
1867                 return TEST_FOUND;
1868         else if (le64toh(o->entry.realtime) < needle)
1869                 return TEST_LEFT;
1870         else
1871                 return TEST_RIGHT;
1872 }
1873
1874 int journal_file_move_to_entry_by_realtime(
1875                 JournalFile *f,
1876                 uint64_t realtime,
1877                 direction_t direction,
1878                 Object **ret,
1879                 uint64_t *offset) {
1880
1881         return generic_array_bisect(f,
1882                                     le64toh(f->header->entry_array_offset),
1883                                     le64toh(f->header->n_entries),
1884                                     realtime,
1885                                     test_object_realtime,
1886                                     direction,
1887                                     ret, offset, NULL);
1888 }
1889
1890 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1891         Object *o;
1892         int r;
1893
1894         assert(f);
1895         assert(p > 0);
1896
1897         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1898         if (r < 0)
1899                 return r;
1900
1901         if (le64toh(o->entry.monotonic) == needle)
1902                 return TEST_FOUND;
1903         else if (le64toh(o->entry.monotonic) < needle)
1904                 return TEST_LEFT;
1905         else
1906                 return TEST_RIGHT;
1907 }
1908
1909 static inline int find_data_object_by_boot_id(
1910                 JournalFile *f,
1911                 sd_id128_t boot_id,
1912                 Object **o,
1913                 uint64_t *b) {
1914         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1915
1916         sd_id128_to_string(boot_id, t + 9);
1917         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1918 }
1919
1920 int journal_file_move_to_entry_by_monotonic(
1921                 JournalFile *f,
1922                 sd_id128_t boot_id,
1923                 uint64_t monotonic,
1924                 direction_t direction,
1925                 Object **ret,
1926                 uint64_t *offset) {
1927
1928         Object *o;
1929         int r;
1930
1931         assert(f);
1932
1933         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1934         if (r < 0)
1935                 return r;
1936         if (r == 0)
1937                 return -ENOENT;
1938
1939         return generic_array_bisect_plus_one(f,
1940                                              le64toh(o->data.entry_offset),
1941                                              le64toh(o->data.entry_array_offset),
1942                                              le64toh(o->data.n_entries),
1943                                              monotonic,
1944                                              test_object_monotonic,
1945                                              direction,
1946                                              ret, offset, NULL);
1947 }
1948
1949 void journal_file_reset_location(JournalFile *f) {
1950         f->location_type = LOCATION_HEAD;
1951         f->current_offset = 0;
1952         f->current_seqnum = 0;
1953         f->current_realtime = 0;
1954         f->current_monotonic = 0;
1955         zero(f->current_boot_id);
1956         f->current_xor_hash = 0;
1957 }
1958
1959 void journal_file_save_location(JournalFile *f, direction_t direction, Object *o, uint64_t offset) {
1960         f->last_direction = direction;
1961         f->location_type = LOCATION_SEEK;
1962         f->current_offset = offset;
1963         f->current_seqnum = le64toh(o->entry.seqnum);
1964         f->current_realtime = le64toh(o->entry.realtime);
1965         f->current_monotonic = le64toh(o->entry.monotonic);
1966         f->current_boot_id = o->entry.boot_id;
1967         f->current_xor_hash = le64toh(o->entry.xor_hash);
1968 }
1969
1970 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
1971         assert(af);
1972         assert(bf);
1973         assert(af->location_type == LOCATION_SEEK);
1974         assert(bf->location_type == LOCATION_SEEK);
1975
1976         /* If contents and timestamps match, these entries are
1977          * identical, even if the seqnum does not match */
1978         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
1979             af->current_monotonic == bf->current_monotonic &&
1980             af->current_realtime == bf->current_realtime &&
1981             af->current_xor_hash == bf->current_xor_hash)
1982                 return 0;
1983
1984         if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
1985
1986                 /* If this is from the same seqnum source, compare
1987                  * seqnums */
1988                 if (af->current_seqnum < bf->current_seqnum)
1989                         return -1;
1990                 if (af->current_seqnum > bf->current_seqnum)
1991                         return 1;
1992
1993                 /* Wow! This is weird, different data but the same
1994                  * seqnums? Something is borked, but let's make the
1995                  * best of it and compare by time. */
1996         }
1997
1998         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
1999
2000                 /* If the boot id matches, compare monotonic time */
2001                 if (af->current_monotonic < bf->current_monotonic)
2002                         return -1;
2003                 if (af->current_monotonic > bf->current_monotonic)
2004                         return 1;
2005         }
2006
2007         /* Otherwise, compare UTC time */
2008         if (af->current_realtime < bf->current_realtime)
2009                 return -1;
2010         if (af->current_realtime > bf->current_realtime)
2011                 return 1;
2012
2013         /* Finally, compare by contents */
2014         if (af->current_xor_hash < bf->current_xor_hash)
2015                 return -1;
2016         if (af->current_xor_hash > bf->current_xor_hash)
2017                 return 1;
2018
2019         return 0;
2020 }
2021
2022 int journal_file_next_entry(
2023                 JournalFile *f,
2024                 uint64_t p,
2025                 direction_t direction,
2026                 Object **ret, uint64_t *offset) {
2027
2028         uint64_t i, n, ofs;
2029         int r;
2030
2031         assert(f);
2032
2033         n = le64toh(f->header->n_entries);
2034         if (n <= 0)
2035                 return 0;
2036
2037         if (p == 0)
2038                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2039         else {
2040                 r = generic_array_bisect(f,
2041                                          le64toh(f->header->entry_array_offset),
2042                                          le64toh(f->header->n_entries),
2043                                          p,
2044                                          test_object_offset,
2045                                          DIRECTION_DOWN,
2046                                          NULL, NULL,
2047                                          &i);
2048                 if (r <= 0)
2049                         return r;
2050
2051                 if (direction == DIRECTION_DOWN) {
2052                         if (i >= n - 1)
2053                                 return 0;
2054
2055                         i++;
2056                 } else {
2057                         if (i <= 0)
2058                                 return 0;
2059
2060                         i--;
2061                 }
2062         }
2063
2064         /* And jump to it */
2065         r = generic_array_get(f,
2066                               le64toh(f->header->entry_array_offset),
2067                               i,
2068                               ret, &ofs);
2069         if (r <= 0)
2070                 return r;
2071
2072         if (p > 0 &&
2073             (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2074                 log_debug("%s: entry array corrupted at entry %"PRIu64,
2075                           f->path, i);
2076                 return -EBADMSG;
2077         }
2078
2079         if (offset)
2080                 *offset = ofs;
2081
2082         return 1;
2083 }
2084
2085 int journal_file_next_entry_for_data(
2086                 JournalFile *f,
2087                 Object *o, uint64_t p,
2088                 uint64_t data_offset,
2089                 direction_t direction,
2090                 Object **ret, uint64_t *offset) {
2091
2092         uint64_t n, i;
2093         int r;
2094         Object *d;
2095
2096         assert(f);
2097         assert(p > 0 || !o);
2098
2099         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2100         if (r < 0)
2101                 return r;
2102
2103         n = le64toh(d->data.n_entries);
2104         if (n <= 0)
2105                 return n;
2106
2107         if (!o)
2108                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2109         else {
2110                 if (o->object.type != OBJECT_ENTRY)
2111                         return -EINVAL;
2112
2113                 r = generic_array_bisect_plus_one(f,
2114                                                   le64toh(d->data.entry_offset),
2115                                                   le64toh(d->data.entry_array_offset),
2116                                                   le64toh(d->data.n_entries),
2117                                                   p,
2118                                                   test_object_offset,
2119                                                   DIRECTION_DOWN,
2120                                                   NULL, NULL,
2121                                                   &i);
2122
2123                 if (r <= 0)
2124                         return r;
2125
2126                 if (direction == DIRECTION_DOWN) {
2127                         if (i >= n - 1)
2128                                 return 0;
2129
2130                         i++;
2131                 } else {
2132                         if (i <= 0)
2133                                 return 0;
2134
2135                         i--;
2136                 }
2137
2138         }
2139
2140         return generic_array_get_plus_one(f,
2141                                           le64toh(d->data.entry_offset),
2142                                           le64toh(d->data.entry_array_offset),
2143                                           i,
2144                                           ret, offset);
2145 }
2146
2147 int journal_file_move_to_entry_by_offset_for_data(
2148                 JournalFile *f,
2149                 uint64_t data_offset,
2150                 uint64_t p,
2151                 direction_t direction,
2152                 Object **ret, uint64_t *offset) {
2153
2154         int r;
2155         Object *d;
2156
2157         assert(f);
2158
2159         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2160         if (r < 0)
2161                 return r;
2162
2163         return generic_array_bisect_plus_one(f,
2164                                              le64toh(d->data.entry_offset),
2165                                              le64toh(d->data.entry_array_offset),
2166                                              le64toh(d->data.n_entries),
2167                                              p,
2168                                              test_object_offset,
2169                                              direction,
2170                                              ret, offset, NULL);
2171 }
2172
2173 int journal_file_move_to_entry_by_monotonic_for_data(
2174                 JournalFile *f,
2175                 uint64_t data_offset,
2176                 sd_id128_t boot_id,
2177                 uint64_t monotonic,
2178                 direction_t direction,
2179                 Object **ret, uint64_t *offset) {
2180
2181         Object *o, *d;
2182         int r;
2183         uint64_t b, z;
2184
2185         assert(f);
2186
2187         /* First, seek by time */
2188         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2189         if (r < 0)
2190                 return r;
2191         if (r == 0)
2192                 return -ENOENT;
2193
2194         r = generic_array_bisect_plus_one(f,
2195                                           le64toh(o->data.entry_offset),
2196                                           le64toh(o->data.entry_array_offset),
2197                                           le64toh(o->data.n_entries),
2198                                           monotonic,
2199                                           test_object_monotonic,
2200                                           direction,
2201                                           NULL, &z, NULL);
2202         if (r <= 0)
2203                 return r;
2204
2205         /* And now, continue seeking until we find an entry that
2206          * exists in both bisection arrays */
2207
2208         for (;;) {
2209                 Object *qo;
2210                 uint64_t p, q;
2211
2212                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2213                 if (r < 0)
2214                         return r;
2215
2216                 r = generic_array_bisect_plus_one(f,
2217                                                   le64toh(d->data.entry_offset),
2218                                                   le64toh(d->data.entry_array_offset),
2219                                                   le64toh(d->data.n_entries),
2220                                                   z,
2221                                                   test_object_offset,
2222                                                   direction,
2223                                                   NULL, &p, NULL);
2224                 if (r <= 0)
2225                         return r;
2226
2227                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2228                 if (r < 0)
2229                         return r;
2230
2231                 r = generic_array_bisect_plus_one(f,
2232                                                   le64toh(o->data.entry_offset),
2233                                                   le64toh(o->data.entry_array_offset),
2234                                                   le64toh(o->data.n_entries),
2235                                                   p,
2236                                                   test_object_offset,
2237                                                   direction,
2238                                                   &qo, &q, NULL);
2239
2240                 if (r <= 0)
2241                         return r;
2242
2243                 if (p == q) {
2244                         if (ret)
2245                                 *ret = qo;
2246                         if (offset)
2247                                 *offset = q;
2248
2249                         return 1;
2250                 }
2251
2252                 z = q;
2253         }
2254 }
2255
2256 int journal_file_move_to_entry_by_seqnum_for_data(
2257                 JournalFile *f,
2258                 uint64_t data_offset,
2259                 uint64_t seqnum,
2260                 direction_t direction,
2261                 Object **ret, uint64_t *offset) {
2262
2263         Object *d;
2264         int r;
2265
2266         assert(f);
2267
2268         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2269         if (r < 0)
2270                 return r;
2271
2272         return generic_array_bisect_plus_one(f,
2273                                              le64toh(d->data.entry_offset),
2274                                              le64toh(d->data.entry_array_offset),
2275                                              le64toh(d->data.n_entries),
2276                                              seqnum,
2277                                              test_object_seqnum,
2278                                              direction,
2279                                              ret, offset, NULL);
2280 }
2281
2282 int journal_file_move_to_entry_by_realtime_for_data(
2283                 JournalFile *f,
2284                 uint64_t data_offset,
2285                 uint64_t realtime,
2286                 direction_t direction,
2287                 Object **ret, uint64_t *offset) {
2288
2289         Object *d;
2290         int r;
2291
2292         assert(f);
2293
2294         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2295         if (r < 0)
2296                 return r;
2297
2298         return generic_array_bisect_plus_one(f,
2299                                              le64toh(d->data.entry_offset),
2300                                              le64toh(d->data.entry_array_offset),
2301                                              le64toh(d->data.n_entries),
2302                                              realtime,
2303                                              test_object_realtime,
2304                                              direction,
2305                                              ret, offset, NULL);
2306 }
2307
2308 void journal_file_dump(JournalFile *f) {
2309         Object *o;
2310         int r;
2311         uint64_t p;
2312
2313         assert(f);
2314
2315         journal_file_print_header(f);
2316
2317         p = le64toh(f->header->header_size);
2318         while (p != 0) {
2319                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2320                 if (r < 0)
2321                         goto fail;
2322
2323                 switch (o->object.type) {
2324
2325                 case OBJECT_UNUSED:
2326                         printf("Type: OBJECT_UNUSED\n");
2327                         break;
2328
2329                 case OBJECT_DATA:
2330                         printf("Type: OBJECT_DATA\n");
2331                         break;
2332
2333                 case OBJECT_FIELD:
2334                         printf("Type: OBJECT_FIELD\n");
2335                         break;
2336
2337                 case OBJECT_ENTRY:
2338                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2339                                le64toh(o->entry.seqnum),
2340                                le64toh(o->entry.monotonic),
2341                                le64toh(o->entry.realtime));
2342                         break;
2343
2344                 case OBJECT_FIELD_HASH_TABLE:
2345                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2346                         break;
2347
2348                 case OBJECT_DATA_HASH_TABLE:
2349                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2350                         break;
2351
2352                 case OBJECT_ENTRY_ARRAY:
2353                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2354                         break;
2355
2356                 case OBJECT_TAG:
2357                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2358                                le64toh(o->tag.seqnum),
2359                                le64toh(o->tag.epoch));
2360                         break;
2361
2362                 default:
2363                         printf("Type: unknown (%u)\n", o->object.type);
2364                         break;
2365                 }
2366
2367                 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2368                         printf("Flags: %s\n",
2369                                object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2370
2371                 if (p == le64toh(f->header->tail_object_offset))
2372                         p = 0;
2373                 else
2374                         p = p + ALIGN64(le64toh(o->object.size));
2375         }
2376
2377         return;
2378 fail:
2379         log_error("File corrupt");
2380 }
2381
2382 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2383         const char *x;
2384
2385         x = format_timestamp(buf, l, t);
2386         if (x)
2387                 return x;
2388         return " --- ";
2389 }
2390
2391 void journal_file_print_header(JournalFile *f) {
2392         char a[33], b[33], c[33], d[33];
2393         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2394         struct stat st;
2395         char bytes[FORMAT_BYTES_MAX];
2396
2397         assert(f);
2398
2399         printf("File Path: %s\n"
2400                "File ID: %s\n"
2401                "Machine ID: %s\n"
2402                "Boot ID: %s\n"
2403                "Sequential Number ID: %s\n"
2404                "State: %s\n"
2405                "Compatible Flags:%s%s\n"
2406                "Incompatible Flags:%s%s%s\n"
2407                "Header size: %"PRIu64"\n"
2408                "Arena size: %"PRIu64"\n"
2409                "Data Hash Table Size: %"PRIu64"\n"
2410                "Field Hash Table Size: %"PRIu64"\n"
2411                "Rotate Suggested: %s\n"
2412                "Head Sequential Number: %"PRIu64"\n"
2413                "Tail Sequential Number: %"PRIu64"\n"
2414                "Head Realtime Timestamp: %s\n"
2415                "Tail Realtime Timestamp: %s\n"
2416                "Tail Monotonic Timestamp: %s\n"
2417                "Objects: %"PRIu64"\n"
2418                "Entry Objects: %"PRIu64"\n",
2419                f->path,
2420                sd_id128_to_string(f->header->file_id, a),
2421                sd_id128_to_string(f->header->machine_id, b),
2422                sd_id128_to_string(f->header->boot_id, c),
2423                sd_id128_to_string(f->header->seqnum_id, d),
2424                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2425                f->header->state == STATE_ONLINE ? "ONLINE" :
2426                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2427                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2428                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2429                JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2430                JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2431                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2432                le64toh(f->header->header_size),
2433                le64toh(f->header->arena_size),
2434                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2435                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2436                yes_no(journal_file_rotate_suggested(f, 0)),
2437                le64toh(f->header->head_entry_seqnum),
2438                le64toh(f->header->tail_entry_seqnum),
2439                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2440                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2441                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2442                le64toh(f->header->n_objects),
2443                le64toh(f->header->n_entries));
2444
2445         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2446                 printf("Data Objects: %"PRIu64"\n"
2447                        "Data Hash Table Fill: %.1f%%\n",
2448                        le64toh(f->header->n_data),
2449                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2450
2451         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2452                 printf("Field Objects: %"PRIu64"\n"
2453                        "Field Hash Table Fill: %.1f%%\n",
2454                        le64toh(f->header->n_fields),
2455                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2456
2457         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2458                 printf("Tag Objects: %"PRIu64"\n",
2459                        le64toh(f->header->n_tags));
2460         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2461                 printf("Entry Array Objects: %"PRIu64"\n",
2462                        le64toh(f->header->n_entry_arrays));
2463
2464         if (fstat(f->fd, &st) >= 0)
2465                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2466 }
2467
2468 int journal_file_open(
2469                 const char *fname,
2470                 int flags,
2471                 mode_t mode,
2472                 bool compress,
2473                 bool seal,
2474                 JournalMetrics *metrics,
2475                 MMapCache *mmap_cache,
2476                 JournalFile *template,
2477                 JournalFile **ret) {
2478
2479         bool newly_created = false;
2480         JournalFile *f;
2481         void *h;
2482         int r;
2483
2484         assert(fname);
2485         assert(ret);
2486
2487         if ((flags & O_ACCMODE) != O_RDONLY &&
2488             (flags & O_ACCMODE) != O_RDWR)
2489                 return -EINVAL;
2490
2491         if (!endswith(fname, ".journal") &&
2492             !endswith(fname, ".journal~"))
2493                 return -EINVAL;
2494
2495         f = new0(JournalFile, 1);
2496         if (!f)
2497                 return -ENOMEM;
2498
2499         f->fd = -1;
2500         f->mode = mode;
2501
2502         f->flags = flags;
2503         f->prot = prot_from_flags(flags);
2504         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2505 #if defined(HAVE_LZ4)
2506         f->compress_lz4 = compress;
2507 #elif defined(HAVE_XZ)
2508         f->compress_xz = compress;
2509 #endif
2510 #ifdef HAVE_GCRYPT
2511         f->seal = seal;
2512 #endif
2513
2514         if (mmap_cache)
2515                 f->mmap = mmap_cache_ref(mmap_cache);
2516         else {
2517                 f->mmap = mmap_cache_new();
2518                 if (!f->mmap) {
2519                         r = -ENOMEM;
2520                         goto fail;
2521                 }
2522         }
2523
2524         f->path = strdup(fname);
2525         if (!f->path) {
2526                 r = -ENOMEM;
2527                 goto fail;
2528         }
2529
2530         f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2531         if (!f->chain_cache) {
2532                 r = -ENOMEM;
2533                 goto fail;
2534         }
2535
2536         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2537         if (f->fd < 0) {
2538                 r = -errno;
2539                 goto fail;
2540         }
2541
2542         if (fstat(f->fd, &f->last_stat) < 0) {
2543                 r = -errno;
2544                 goto fail;
2545         }
2546
2547         if (f->last_stat.st_size == 0 && f->writable) {
2548                 /* Let's attach the creation time to the journal file,
2549                  * so that the vacuuming code knows the age of this
2550                  * file even if the file might end up corrupted one
2551                  * day... Ideally we'd just use the creation time many
2552                  * file systems maintain for each file, but there is
2553                  * currently no usable API to query this, hence let's
2554                  * emulate this via extended attributes. If extended
2555                  * attributes are not supported we'll just skip this,
2556                  * and rely solely on mtime/atime/ctime of the file. */
2557
2558                 fd_setcrtime(f->fd, now(CLOCK_REALTIME));
2559
2560 #ifdef HAVE_GCRYPT
2561                 /* Try to load the FSPRG state, and if we can't, then
2562                  * just don't do sealing */
2563                 if (f->seal) {
2564                         r = journal_file_fss_load(f);
2565                         if (r < 0)
2566                                 f->seal = false;
2567                 }
2568 #endif
2569
2570                 r = journal_file_init_header(f, template);
2571                 if (r < 0)
2572                         goto fail;
2573
2574                 if (fstat(f->fd, &f->last_stat) < 0) {
2575                         r = -errno;
2576                         goto fail;
2577                 }
2578
2579                 newly_created = true;
2580         }
2581
2582         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2583                 r = -EIO;
2584                 goto fail;
2585         }
2586
2587         r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2588         if (r < 0) {
2589                 r = -errno;
2590                 goto fail;
2591         }
2592
2593         f->header = h;
2594
2595         if (!newly_created) {
2596                 r = journal_file_verify_header(f);
2597                 if (r < 0)
2598                         goto fail;
2599         }
2600
2601 #ifdef HAVE_GCRYPT
2602         if (!newly_created && f->writable) {
2603                 r = journal_file_fss_load(f);
2604                 if (r < 0)
2605                         goto fail;
2606         }
2607 #endif
2608
2609         if (f->writable) {
2610                 if (metrics) {
2611                         journal_default_metrics(metrics, f->fd);
2612                         f->metrics = *metrics;
2613                 } else if (template)
2614                         f->metrics = template->metrics;
2615
2616                 r = journal_file_refresh_header(f);
2617                 if (r < 0)
2618                         goto fail;
2619         }
2620
2621 #ifdef HAVE_GCRYPT
2622         r = journal_file_hmac_setup(f);
2623         if (r < 0)
2624                 goto fail;
2625 #endif
2626
2627         if (newly_created) {
2628                 r = journal_file_setup_field_hash_table(f);
2629                 if (r < 0)
2630                         goto fail;
2631
2632                 r = journal_file_setup_data_hash_table(f);
2633                 if (r < 0)
2634                         goto fail;
2635
2636 #ifdef HAVE_GCRYPT
2637                 r = journal_file_append_first_tag(f);
2638                 if (r < 0)
2639                         goto fail;
2640 #endif
2641         }
2642
2643         r = journal_file_map_field_hash_table(f);
2644         if (r < 0)
2645                 goto fail;
2646
2647         r = journal_file_map_data_hash_table(f);
2648         if (r < 0)
2649                 goto fail;
2650
2651         if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2652                 r = -EIO;
2653                 goto fail;
2654         }
2655
2656         *ret = f;
2657         return 0;
2658
2659 fail:
2660         if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2661                 r = -EIO;
2662
2663         journal_file_close(f);
2664
2665         return r;
2666 }
2667
2668 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2669         _cleanup_free_ char *p = NULL;
2670         size_t l;
2671         JournalFile *old_file, *new_file = NULL;
2672         int r;
2673
2674         assert(f);
2675         assert(*f);
2676
2677         old_file = *f;
2678
2679         if (!old_file->writable)
2680                 return -EINVAL;
2681
2682         if (!endswith(old_file->path, ".journal"))
2683                 return -EINVAL;
2684
2685         l = strlen(old_file->path);
2686         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2687                      (int) l - 8, old_file->path,
2688                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2689                      le64toh((*f)->header->head_entry_seqnum),
2690                      le64toh((*f)->header->head_entry_realtime));
2691         if (r < 0)
2692                 return -ENOMEM;
2693
2694         r = rename(old_file->path, p);
2695         if (r < 0)
2696                 return -errno;
2697
2698         old_file->header->state = STATE_ARCHIVED;
2699
2700         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2701         journal_file_close(old_file);
2702
2703         *f = new_file;
2704         return r;
2705 }
2706
2707 int journal_file_open_reliably(
2708                 const char *fname,
2709                 int flags,
2710                 mode_t mode,
2711                 bool compress,
2712                 bool seal,
2713                 JournalMetrics *metrics,
2714                 MMapCache *mmap_cache,
2715                 JournalFile *template,
2716                 JournalFile **ret) {
2717
2718         int r;
2719         size_t l;
2720         _cleanup_free_ char *p = NULL;
2721
2722         r = journal_file_open(fname, flags, mode, compress, seal,
2723                               metrics, mmap_cache, template, ret);
2724         if (r != -EBADMSG && /* corrupted */
2725             r != -ENODATA && /* truncated */
2726             r != -EHOSTDOWN && /* other machine */
2727             r != -EPROTONOSUPPORT && /* incompatible feature */
2728             r != -EBUSY && /* unclean shutdown */
2729             r != -ESHUTDOWN && /* already archived */
2730             r != -EIO /* IO error, including SIGBUS on mmap */)
2731                 return r;
2732
2733         if ((flags & O_ACCMODE) == O_RDONLY)
2734                 return r;
2735
2736         if (!(flags & O_CREAT))
2737                 return r;
2738
2739         if (!endswith(fname, ".journal"))
2740                 return r;
2741
2742         /* The file is corrupted. Rotate it away and try it again (but only once) */
2743
2744         l = strlen(fname);
2745         if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2746                      (int) l - 8, fname,
2747                      (unsigned long long) now(CLOCK_REALTIME),
2748                      random_u64()) < 0)
2749                 return -ENOMEM;
2750
2751         r = rename(fname, p);
2752         if (r < 0)
2753                 return -errno;
2754
2755         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2756
2757         return journal_file_open(fname, flags, mode, compress, seal,
2758                                  metrics, mmap_cache, template, ret);
2759 }
2760
2761 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2762         uint64_t i, n;
2763         uint64_t q, xor_hash = 0;
2764         int r;
2765         EntryItem *items;
2766         dual_timestamp ts;
2767
2768         assert(from);
2769         assert(to);
2770         assert(o);
2771         assert(p);
2772
2773         if (!to->writable)
2774                 return -EPERM;
2775
2776         ts.monotonic = le64toh(o->entry.monotonic);
2777         ts.realtime = le64toh(o->entry.realtime);
2778
2779         n = journal_file_entry_n_items(o);
2780         /* alloca() can't take 0, hence let's allocate at least one */
2781         items = alloca(sizeof(EntryItem) * MAX(1u, n));
2782
2783         for (i = 0; i < n; i++) {
2784                 uint64_t l, h;
2785                 le64_t le_hash;
2786                 size_t t;
2787                 void *data;
2788                 Object *u;
2789
2790                 q = le64toh(o->entry.items[i].object_offset);
2791                 le_hash = o->entry.items[i].hash;
2792
2793                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2794                 if (r < 0)
2795                         return r;
2796
2797                 if (le_hash != o->data.hash)
2798                         return -EBADMSG;
2799
2800                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2801                 t = (size_t) l;
2802
2803                 /* We hit the limit on 32bit machines */
2804                 if ((uint64_t) t != l)
2805                         return -E2BIG;
2806
2807                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2808 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2809                         size_t rsize;
2810
2811                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2812                                             o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2813                         if (r < 0)
2814                                 return r;
2815
2816                         data = from->compress_buffer;
2817                         l = rsize;
2818 #else
2819                         return -EPROTONOSUPPORT;
2820 #endif
2821                 } else
2822                         data = o->data.payload;
2823
2824                 r = journal_file_append_data(to, data, l, &u, &h);
2825                 if (r < 0)
2826                         return r;
2827
2828                 xor_hash ^= le64toh(u->data.hash);
2829                 items[i].object_offset = htole64(h);
2830                 items[i].hash = u->data.hash;
2831
2832                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2833                 if (r < 0)
2834                         return r;
2835         }
2836
2837         r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2838
2839         if (mmap_cache_got_sigbus(to->mmap, to->fd))
2840                 return -EIO;
2841
2842         return r;
2843 }
2844
2845 void journal_default_metrics(JournalMetrics *m, int fd) {
2846         uint64_t fs_size = 0;
2847         struct statvfs ss;
2848         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2849
2850         assert(m);
2851         assert(fd >= 0);
2852
2853         if (fstatvfs(fd, &ss) >= 0)
2854                 fs_size = ss.f_frsize * ss.f_blocks;
2855
2856         if (m->max_use == (uint64_t) -1) {
2857
2858                 if (fs_size > 0) {
2859                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2860
2861                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2862                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2863
2864                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2865                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2866                 } else
2867                         m->max_use = DEFAULT_MAX_USE_LOWER;
2868         } else {
2869                 m->max_use = PAGE_ALIGN(m->max_use);
2870
2871                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2872                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2873         }
2874
2875         if (m->max_size == (uint64_t) -1) {
2876                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2877
2878                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2879                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2880         } else
2881                 m->max_size = PAGE_ALIGN(m->max_size);
2882
2883         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2884                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2885
2886         if (m->max_size*2 > m->max_use)
2887                 m->max_use = m->max_size*2;
2888
2889         if (m->min_size == (uint64_t) -1)
2890                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2891         else {
2892                 m->min_size = PAGE_ALIGN(m->min_size);
2893
2894                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2895                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2896
2897                 if (m->min_size > m->max_size)
2898                         m->max_size = m->min_size;
2899         }
2900
2901         if (m->keep_free == (uint64_t) -1) {
2902
2903                 if (fs_size > 0) {
2904                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2905
2906                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2907                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2908
2909                 } else
2910                         m->keep_free = DEFAULT_KEEP_FREE;
2911         }
2912
2913         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2914                   format_bytes(a, sizeof(a), m->max_use),
2915                   format_bytes(b, sizeof(b), m->max_size),
2916                   format_bytes(c, sizeof(c), m->min_size),
2917                   format_bytes(d, sizeof(d), m->keep_free));
2918 }
2919
2920 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2921         assert(f);
2922         assert(from || to);
2923
2924         if (from) {
2925                 if (f->header->head_entry_realtime == 0)
2926                         return -ENOENT;
2927
2928                 *from = le64toh(f->header->head_entry_realtime);
2929         }
2930
2931         if (to) {
2932                 if (f->header->tail_entry_realtime == 0)
2933                         return -ENOENT;
2934
2935                 *to = le64toh(f->header->tail_entry_realtime);
2936         }
2937
2938         return 1;
2939 }
2940
2941 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2942         Object *o;
2943         uint64_t p;
2944         int r;
2945
2946         assert(f);
2947         assert(from || to);
2948
2949         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2950         if (r <= 0)
2951                 return r;
2952
2953         if (le64toh(o->data.n_entries) <= 0)
2954                 return 0;
2955
2956         if (from) {
2957                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2958                 if (r < 0)
2959                         return r;
2960
2961                 *from = le64toh(o->entry.monotonic);
2962         }
2963
2964         if (to) {
2965                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2966                 if (r < 0)
2967                         return r;
2968
2969                 r = generic_array_get_plus_one(f,
2970                                                le64toh(o->data.entry_offset),
2971                                                le64toh(o->data.entry_array_offset),
2972                                                le64toh(o->data.n_entries)-1,
2973                                                &o, NULL);
2974                 if (r <= 0)
2975                         return r;
2976
2977                 *to = le64toh(o->entry.monotonic);
2978         }
2979
2980         return 1;
2981 }
2982
2983 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2984         assert(f);
2985
2986         /* If we gained new header fields we gained new features,
2987          * hence suggest a rotation */
2988         if (le64toh(f->header->header_size) < sizeof(Header)) {
2989                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2990                 return true;
2991         }
2992
2993         /* Let's check if the hash tables grew over a certain fill
2994          * level (75%, borrowing this value from Java's hash table
2995          * implementation), and if so suggest a rotation. To calculate
2996          * the fill level we need the n_data field, which only exists
2997          * in newer versions. */
2998
2999         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3000                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3001                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3002                                   f->path,
3003                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3004                                   le64toh(f->header->n_data),
3005                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3006                                   (unsigned long long) f->last_stat.st_size,
3007                                   f->last_stat.st_size / le64toh(f->header->n_data));
3008                         return true;
3009                 }
3010
3011         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3012                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3013                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3014                                   f->path,
3015                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3016                                   le64toh(f->header->n_fields),
3017                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3018                         return true;
3019                 }
3020
3021         /* Are the data objects properly indexed by field objects? */
3022         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3023             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3024             le64toh(f->header->n_data) > 0 &&
3025             le64toh(f->header->n_fields) == 0)
3026                 return true;
3027
3028         if (max_file_usec > 0) {
3029                 usec_t t, h;
3030
3031                 h = le64toh(f->header->head_entry_realtime);
3032                 t = now(CLOCK_REALTIME);
3033
3034                 if (h > 0 && t > h + max_file_usec)
3035                         return true;
3036         }
3037
3038         return false;
3039 }