chiark / gitweb /
Fix some format strings for enums, they are signed
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29 #include <linux/fs.h>
30
31 #include "btrfs-util.h"
32 #include "journal-def.h"
33 #include "journal-file.h"
34 #include "journal-authenticate.h"
35 #include "lookup3.h"
36 #include "compress.h"
37 #include "fsprg.h"
38
/* Default sizes, in bytes, of the data and field hash tables */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL * sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL * sizeof(HashItem))

#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* Smallest size a journal file may have */
#define JOURNAL_FILE_SIZE_MIN (4ULL * 1024ULL * 1024ULL)                /* 4 MiB */

/* Lower and upper bounds applied when the max_use value is deduced
 * from the size of the file system */
#define DEFAULT_MAX_USE_LOWER (1ULL * 1024ULL * 1024ULL)                /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL * 1024ULL * 1024ULL * 1024ULL)      /* 4 GiB */

/* Upper bound applied when max_size is deduced from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL * 1024ULL * 1024ULL)             /* 128 MiB */

/* Upper bound applied when the keep_free value is deduced from the
 * size of the file system */
#define DEFAULT_KEEP_FREE_UPPER (4ULL * 1024ULL * 1024ULL * 1024ULL)    /* 4 GiB */

/* The keep_free value used when the file system size cannot be
 * determined */
#define DEFAULT_KEEP_FREE (1024ULL * 1024ULL)                           /* 1 MiB */

/* n_data was the first field added to the header after the initial file
 * format design, hence everything before it is the minimal header */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* Maximum number of entries kept in the entry array chain cache */
#define CHAIN_CACHE_MAX 20

/* Granularity by which the journal file grows whenever new space has to be allocated */
#define FILE_SIZE_INCREASE (8ULL * 1024ULL * 1024ULL)                   /* 8 MiB */

/* Re-run fstat() on the file at least this often, to detect deletions */
#define LAST_STAT_REFRESH_USEC (5 * USEC_PER_SEC)

/* The mmap context used for the header: one above the last defined object type */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX
77
78 static int journal_file_set_online(JournalFile *f) {
79         assert(f);
80
81         if (!f->writable)
82                 return -EPERM;
83
84         if (!(f->fd >= 0 && f->header))
85                 return -EINVAL;
86
87         if (mmap_cache_got_sigbus(f->mmap, f->fd))
88                 return -EIO;
89
90         switch(f->header->state) {
91                 case STATE_ONLINE:
92                         return 0;
93
94                 case STATE_OFFLINE:
95                         f->header->state = STATE_ONLINE;
96                         fsync(f->fd);
97                         return 0;
98
99                 default:
100                         return -EINVAL;
101         }
102 }
103
104 int journal_file_set_offline(JournalFile *f) {
105         assert(f);
106
107         if (!f->writable)
108                 return -EPERM;
109
110         if (!(f->fd >= 0 && f->header))
111                 return -EINVAL;
112
113         if (f->header->state != STATE_ONLINE)
114                 return 0;
115
116         fsync(f->fd);
117
118         if (mmap_cache_got_sigbus(f->mmap, f->fd))
119                 return -EIO;
120
121         f->header->state = STATE_OFFLINE;
122
123         if (mmap_cache_got_sigbus(f->mmap, f->fd))
124                 return -EIO;
125
126         fsync(f->fd);
127
128         return 0;
129 }
130
131 void journal_file_close(JournalFile *f) {
132         assert(f);
133
134 #ifdef HAVE_GCRYPT
135         /* Write the final tag */
136         if (f->seal && f->writable)
137                 journal_file_append_tag(f);
138 #endif
139
140         journal_file_set_offline(f);
141
142         if (f->mmap && f->fd >= 0)
143                 mmap_cache_close_fd(f->mmap, f->fd);
144
145         if (f->fd >= 0 && f->defrag_on_close) {
146
147                 /* Be friendly to btrfs: turn COW back on again now,
148                  * and defragment the file. We won't write to the file
149                  * ever again, hence remove all fragmentation, and
150                  * reenable all the good bits COW usually provides
151                  * (such as data checksumming). */
152
153                 (void) chattr_fd(f->fd, false, FS_NOCOW_FL);
154                 (void) btrfs_defrag_fd(f->fd);
155         }
156
157         safe_close(f->fd);
158         free(f->path);
159
160         if (f->mmap)
161                 mmap_cache_unref(f->mmap);
162
163         ordered_hashmap_free_free(f->chain_cache);
164
165 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
166         free(f->compress_buffer);
167 #endif
168
169 #ifdef HAVE_GCRYPT
170         if (f->fss_file)
171                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
172         else if (f->fsprg_state)
173                 free(f->fsprg_state);
174
175         free(f->fsprg_seed);
176
177         if (f->hmac)
178                 gcry_md_close(f->hmac);
179 #endif
180
181         free(f);
182 }
183
184 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
185         Header h = {};
186         ssize_t k;
187         int r;
188
189         assert(f);
190
191         memcpy(h.signature, HEADER_SIGNATURE, 8);
192         h.header_size = htole64(ALIGN64(sizeof(h)));
193
194         h.incompatible_flags |= htole32(
195                 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
196                 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
197
198         h.compatible_flags = htole32(
199                 f->seal * HEADER_COMPATIBLE_SEALED);
200
201         r = sd_id128_randomize(&h.file_id);
202         if (r < 0)
203                 return r;
204
205         if (template) {
206                 h.seqnum_id = template->header->seqnum_id;
207                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
208         } else
209                 h.seqnum_id = h.file_id;
210
211         k = pwrite(f->fd, &h, sizeof(h), 0);
212         if (k < 0)
213                 return -errno;
214
215         if (k != sizeof(h))
216                 return -EIO;
217
218         return 0;
219 }
220
221 static int journal_file_refresh_header(JournalFile *f) {
222         sd_id128_t boot_id;
223         int r;
224
225         assert(f);
226
227         r = sd_id128_get_machine(&f->header->machine_id);
228         if (r < 0)
229                 return r;
230
231         r = sd_id128_get_boot(&boot_id);
232         if (r < 0)
233                 return r;
234
235         if (sd_id128_equal(boot_id, f->header->boot_id))
236                 f->tail_entry_monotonic_valid = true;
237
238         f->header->boot_id = boot_id;
239
240         r = journal_file_set_online(f);
241
242         /* Sync the online state to disk */
243         fsync(f->fd);
244
245         return r;
246 }
247
248 static int journal_file_verify_header(JournalFile *f) {
249         uint32_t flags;
250
251         assert(f);
252
253         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
254                 return -EBADMSG;
255
256         /* In both read and write mode we refuse to open files with
257          * incompatible flags we don't know */
258         flags = le32toh(f->header->incompatible_flags);
259         if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
260                 if (flags & ~HEADER_INCOMPATIBLE_ANY)
261                         log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
262                                   f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
263                 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
264                 if (flags)
265                         log_debug("Journal file %s uses incompatible flags %"PRIx32
266                                   " disabled at compilation time.", f->path, flags);
267                 return -EPROTONOSUPPORT;
268         }
269
270         /* When open for writing we refuse to open files with
271          * compatible flags, too */
272         flags = le32toh(f->header->compatible_flags);
273         if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
274                 if (flags & ~HEADER_COMPATIBLE_ANY)
275                         log_debug("Journal file %s has unknown compatible flags %"PRIx32,
276                                   f->path, flags & ~HEADER_COMPATIBLE_ANY);
277                 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
278                 if (flags)
279                         log_debug("Journal file %s uses compatible flags %"PRIx32
280                                   " disabled at compilation time.", f->path, flags);
281                 return -EPROTONOSUPPORT;
282         }
283
284         if (f->header->state >= _STATE_MAX)
285                 return -EBADMSG;
286
287         /* The first addition was n_data, so check that we are at least this large */
288         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
289                 return -EBADMSG;
290
291         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
292                 return -EBADMSG;
293
294         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
295                 return -ENODATA;
296
297         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
298                 return -ENODATA;
299
300         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
301             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
302             !VALID64(le64toh(f->header->tail_object_offset)) ||
303             !VALID64(le64toh(f->header->entry_array_offset)))
304                 return -ENODATA;
305
306         if (f->writable) {
307                 uint8_t state;
308                 sd_id128_t machine_id;
309                 int r;
310
311                 r = sd_id128_get_machine(&machine_id);
312                 if (r < 0)
313                         return r;
314
315                 if (!sd_id128_equal(machine_id, f->header->machine_id))
316                         return -EHOSTDOWN;
317
318                 state = f->header->state;
319
320                 if (state == STATE_ONLINE) {
321                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
322                         return -EBUSY;
323                 } else if (state == STATE_ARCHIVED)
324                         return -ESHUTDOWN;
325                 else if (state != STATE_OFFLINE) {
326                         log_debug("Journal file %s has unknown state %i.", f->path, state);
327                         return -EBUSY;
328                 }
329         }
330
331         f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
332         f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
333
334         f->seal = JOURNAL_HEADER_SEALED(f->header);
335
336         return 0;
337 }
338
339 static int journal_file_fstat(JournalFile *f) {
340         assert(f);
341         assert(f->fd >= 0);
342
343         if (fstat(f->fd, &f->last_stat) < 0)
344                 return -errno;
345
346         f->last_stat_usec = now(CLOCK_MONOTONIC);
347
348         /* Refuse appending to files that are already deleted */
349         if (f->last_stat.st_nlink <= 0)
350                 return -EIDRM;
351
352         return 0;
353 }
354
355 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
356         uint64_t old_size, new_size;
357         int r;
358
359         assert(f);
360
361         /* We assume that this file is not sparse, and we know that
362          * for sure, since we always call posix_fallocate()
363          * ourselves */
364
365         if (mmap_cache_got_sigbus(f->mmap, f->fd))
366                 return -EIO;
367
368         old_size =
369                 le64toh(f->header->header_size) +
370                 le64toh(f->header->arena_size);
371
372         new_size = PAGE_ALIGN(offset + size);
373         if (new_size < le64toh(f->header->header_size))
374                 new_size = le64toh(f->header->header_size);
375
376         if (new_size <= old_size) {
377
378                 /* We already pre-allocated enough space, but before
379                  * we write to it, let's check with fstat() if the
380                  * file got deleted, in order make sure we don't throw
381                  * away the data immediately. Don't check fstat() for
382                  * all writes though, but only once ever 10s. */
383
384                 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
385                         return 0;
386
387                 return journal_file_fstat(f);
388         }
389
390         /* Allocate more space. */
391
392         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
393                 return -E2BIG;
394
395         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
396                 struct statvfs svfs;
397
398                 if (fstatvfs(f->fd, &svfs) >= 0) {
399                         uint64_t available;
400
401                         available = svfs.f_bfree * svfs.f_bsize;
402
403                         if (available >= f->metrics.keep_free)
404                                 available -= f->metrics.keep_free;
405                         else
406                                 available = 0;
407
408                         if (new_size - old_size > available)
409                                 return -E2BIG;
410                 }
411         }
412
413         /* Increase by larger blocks at once */
414         new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
415         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
416                 new_size = f->metrics.max_size;
417
418         /* Note that the glibc fallocate() fallback is very
419            inefficient, hence we try to minimize the allocation area
420            as we can. */
421         r = posix_fallocate(f->fd, old_size, new_size - old_size);
422         if (r != 0)
423                 return -r;
424
425         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
426
427         return journal_file_fstat(f);
428 }
429
430 static unsigned type_to_context(ObjectType type) {
431         /* One context for each type, plus one catch-all for the rest */
432         assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
433         assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
434         return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
435 }
436
437 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
438         int r;
439
440         assert(f);
441         assert(ret);
442
443         if (size <= 0)
444                 return -EINVAL;
445
446         /* Avoid SIGBUS on invalid accesses */
447         if (offset + size > (uint64_t) f->last_stat.st_size) {
448                 /* Hmm, out of range? Let's refresh the fstat() data
449                  * first, before we trust that check. */
450
451                 r = journal_file_fstat(f);
452                 if (r < 0)
453                         return r;
454
455                 if (offset + size > (uint64_t) f->last_stat.st_size)
456                         return -EADDRNOTAVAIL;
457         }
458
459         return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
460 }
461
462 static uint64_t minimum_header_size(Object *o) {
463
464         static const uint64_t table[] = {
465                 [OBJECT_DATA] = sizeof(DataObject),
466                 [OBJECT_FIELD] = sizeof(FieldObject),
467                 [OBJECT_ENTRY] = sizeof(EntryObject),
468                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
469                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
470                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
471                 [OBJECT_TAG] = sizeof(TagObject),
472         };
473
474         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
475                 return sizeof(ObjectHeader);
476
477         return table[o->object.type];
478 }
479
480 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
481         int r;
482         void *t;
483         Object *o;
484         uint64_t s;
485
486         assert(f);
487         assert(ret);
488
489         /* Objects may only be located at multiple of 64 bit */
490         if (!VALID64(offset))
491                 return -EFAULT;
492
493         r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
494         if (r < 0)
495                 return r;
496
497         o = (Object*) t;
498         s = le64toh(o->object.size);
499
500         if (s < sizeof(ObjectHeader))
501                 return -EBADMSG;
502
503         if (o->object.type <= OBJECT_UNUSED)
504                 return -EBADMSG;
505
506         if (s < minimum_header_size(o))
507                 return -EBADMSG;
508
509         if (type > OBJECT_UNUSED && o->object.type != type)
510                 return -EBADMSG;
511
512         if (s > sizeof(ObjectHeader)) {
513                 r = journal_file_move_to(f, type, false, offset, s, &t);
514                 if (r < 0)
515                         return r;
516
517                 o = (Object*) t;
518         }
519
520         *ret = o;
521         return 0;
522 }
523
524 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
525         uint64_t r;
526
527         assert(f);
528
529         r = le64toh(f->header->tail_entry_seqnum) + 1;
530
531         if (seqnum) {
532                 /* If an external seqnum counter was passed, we update
533                  * both the local and the external one, and set it to
534                  * the maximum of both */
535
536                 if (*seqnum + 1 > r)
537                         r = *seqnum + 1;
538
539                 *seqnum = r;
540         }
541
542         f->header->tail_entry_seqnum = htole64(r);
543
544         if (f->header->head_entry_seqnum == 0)
545                 f->header->head_entry_seqnum = htole64(r);
546
547         return r;
548 }
549
550 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
551         int r;
552         uint64_t p;
553         Object *tail, *o;
554         void *t;
555
556         assert(f);
557         assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
558         assert(size >= sizeof(ObjectHeader));
559         assert(offset);
560         assert(ret);
561
562         r = journal_file_set_online(f);
563         if (r < 0)
564                 return r;
565
566         p = le64toh(f->header->tail_object_offset);
567         if (p == 0)
568                 p = le64toh(f->header->header_size);
569         else {
570                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
571                 if (r < 0)
572                         return r;
573
574                 p += ALIGN64(le64toh(tail->object.size));
575         }
576
577         r = journal_file_allocate(f, p, size);
578         if (r < 0)
579                 return r;
580
581         r = journal_file_move_to(f, type, false, p, size, &t);
582         if (r < 0)
583                 return r;
584
585         o = (Object*) t;
586
587         zero(o->object);
588         o->object.type = type;
589         o->object.size = htole64(size);
590
591         f->header->tail_object_offset = htole64(p);
592         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
593
594         *ret = o;
595         *offset = p;
596
597         return 0;
598 }
599
600 static int journal_file_setup_data_hash_table(JournalFile *f) {
601         uint64_t s, p;
602         Object *o;
603         int r;
604
605         assert(f);
606
607         /* We estimate that we need 1 hash table entry per 768 of
608            journal file and we want to make sure we never get beyond
609            75% fill level. Calculate the hash table size for the
610            maximum file size based on these metrics. */
611
612         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
613         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
614                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
615
616         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
617
618         r = journal_file_append_object(f,
619                                        OBJECT_DATA_HASH_TABLE,
620                                        offsetof(Object, hash_table.items) + s,
621                                        &o, &p);
622         if (r < 0)
623                 return r;
624
625         memzero(o->hash_table.items, s);
626
627         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
628         f->header->data_hash_table_size = htole64(s);
629
630         return 0;
631 }
632
633 static int journal_file_setup_field_hash_table(JournalFile *f) {
634         uint64_t s, p;
635         Object *o;
636         int r;
637
638         assert(f);
639
640         /* We use a fixed size hash table for the fields as this
641          * number should grow very slowly only */
642
643         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
644         r = journal_file_append_object(f,
645                                        OBJECT_FIELD_HASH_TABLE,
646                                        offsetof(Object, hash_table.items) + s,
647                                        &o, &p);
648         if (r < 0)
649                 return r;
650
651         memzero(o->hash_table.items, s);
652
653         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
654         f->header->field_hash_table_size = htole64(s);
655
656         return 0;
657 }
658
659 static int journal_file_map_data_hash_table(JournalFile *f) {
660         uint64_t s, p;
661         void *t;
662         int r;
663
664         assert(f);
665
666         p = le64toh(f->header->data_hash_table_offset);
667         s = le64toh(f->header->data_hash_table_size);
668
669         r = journal_file_move_to(f,
670                                  OBJECT_DATA_HASH_TABLE,
671                                  true,
672                                  p, s,
673                                  &t);
674         if (r < 0)
675                 return r;
676
677         f->data_hash_table = t;
678         return 0;
679 }
680
681 static int journal_file_map_field_hash_table(JournalFile *f) {
682         uint64_t s, p;
683         void *t;
684         int r;
685
686         assert(f);
687
688         p = le64toh(f->header->field_hash_table_offset);
689         s = le64toh(f->header->field_hash_table_size);
690
691         r = journal_file_move_to(f,
692                                  OBJECT_FIELD_HASH_TABLE,
693                                  true,
694                                  p, s,
695                                  &t);
696         if (r < 0)
697                 return r;
698
699         f->field_hash_table = t;
700         return 0;
701 }
702
703 static int journal_file_link_field(
704                 JournalFile *f,
705                 Object *o,
706                 uint64_t offset,
707                 uint64_t hash) {
708
709         uint64_t p, h, m;
710         int r;
711
712         assert(f);
713         assert(o);
714         assert(offset > 0);
715
716         if (o->object.type != OBJECT_FIELD)
717                 return -EINVAL;
718
719         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
720         if (m <= 0)
721                 return -EBADMSG;
722
723         /* This might alter the window we are looking at */
724         o->field.next_hash_offset = o->field.head_data_offset = 0;
725
726         h = hash % m;
727         p = le64toh(f->field_hash_table[h].tail_hash_offset);
728         if (p == 0)
729                 f->field_hash_table[h].head_hash_offset = htole64(offset);
730         else {
731                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
732                 if (r < 0)
733                         return r;
734
735                 o->field.next_hash_offset = htole64(offset);
736         }
737
738         f->field_hash_table[h].tail_hash_offset = htole64(offset);
739
740         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
741                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
742
743         return 0;
744 }
745
746 static int journal_file_link_data(
747                 JournalFile *f,
748                 Object *o,
749                 uint64_t offset,
750                 uint64_t hash) {
751
752         uint64_t p, h, m;
753         int r;
754
755         assert(f);
756         assert(o);
757         assert(offset > 0);
758
759         if (o->object.type != OBJECT_DATA)
760                 return -EINVAL;
761
762         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
763         if (m <= 0)
764                 return -EBADMSG;
765
766         /* This might alter the window we are looking at */
767         o->data.next_hash_offset = o->data.next_field_offset = 0;
768         o->data.entry_offset = o->data.entry_array_offset = 0;
769         o->data.n_entries = 0;
770
771         h = hash % m;
772         p = le64toh(f->data_hash_table[h].tail_hash_offset);
773         if (p == 0)
774                 /* Only entry in the hash table is easy */
775                 f->data_hash_table[h].head_hash_offset = htole64(offset);
776         else {
777                 /* Move back to the previous data object, to patch in
778                  * pointer */
779
780                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
781                 if (r < 0)
782                         return r;
783
784                 o->data.next_hash_offset = htole64(offset);
785         }
786
787         f->data_hash_table[h].tail_hash_offset = htole64(offset);
788
789         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
790                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
791
792         return 0;
793 }
794
795 int journal_file_find_field_object_with_hash(
796                 JournalFile *f,
797                 const void *field, uint64_t size, uint64_t hash,
798                 Object **ret, uint64_t *offset) {
799
800         uint64_t p, osize, h, m;
801         int r;
802
803         assert(f);
804         assert(field && size > 0);
805
806         osize = offsetof(Object, field.payload) + size;
807
808         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
809
810         if (m <= 0)
811                 return -EBADMSG;
812
813         h = hash % m;
814         p = le64toh(f->field_hash_table[h].head_hash_offset);
815
816         while (p > 0) {
817                 Object *o;
818
819                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
820                 if (r < 0)
821                         return r;
822
823                 if (le64toh(o->field.hash) == hash &&
824                     le64toh(o->object.size) == osize &&
825                     memcmp(o->field.payload, field, size) == 0) {
826
827                         if (ret)
828                                 *ret = o;
829                         if (offset)
830                                 *offset = p;
831
832                         return 1;
833                 }
834
835                 p = le64toh(o->field.next_hash_offset);
836         }
837
838         return 0;
839 }
840
841 int journal_file_find_field_object(
842                 JournalFile *f,
843                 const void *field, uint64_t size,
844                 Object **ret, uint64_t *offset) {
845
846         uint64_t hash;
847
848         assert(f);
849         assert(field && size > 0);
850
851         hash = hash64(field, size);
852
853         return journal_file_find_field_object_with_hash(f,
854                                                         field, size, hash,
855                                                         ret, offset);
856 }
857
858 int journal_file_find_data_object_with_hash(
859                 JournalFile *f,
860                 const void *data, uint64_t size, uint64_t hash,
861                 Object **ret, uint64_t *offset) {
862
863         uint64_t p, osize, h, m;
864         int r;
865
866         assert(f);
867         assert(data || size == 0);
868
869         osize = offsetof(Object, data.payload) + size;
870
871         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
872         if (m <= 0)
873                 return -EBADMSG;
874
875         h = hash % m;
876         p = le64toh(f->data_hash_table[h].head_hash_offset);
877
878         while (p > 0) {
879                 Object *o;
880
881                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
882                 if (r < 0)
883                         return r;
884
885                 if (le64toh(o->data.hash) != hash)
886                         goto next;
887
888                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
889 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
890                         uint64_t l;
891                         size_t rsize;
892
893                         l = le64toh(o->object.size);
894                         if (l <= offsetof(Object, data.payload))
895                                 return -EBADMSG;
896
897                         l -= offsetof(Object, data.payload);
898
899                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
900                                             o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
901                         if (r < 0)
902                                 return r;
903
904                         if (rsize == size &&
905                             memcmp(f->compress_buffer, data, size) == 0) {
906
907                                 if (ret)
908                                         *ret = o;
909
910                                 if (offset)
911                                         *offset = p;
912
913                                 return 1;
914                         }
915 #else
916                         return -EPROTONOSUPPORT;
917 #endif
918                 } else if (le64toh(o->object.size) == osize &&
919                            memcmp(o->data.payload, data, size) == 0) {
920
921                         if (ret)
922                                 *ret = o;
923
924                         if (offset)
925                                 *offset = p;
926
927                         return 1;
928                 }
929
930         next:
931                 p = le64toh(o->data.next_hash_offset);
932         }
933
934         return 0;
935 }
936
937 int journal_file_find_data_object(
938                 JournalFile *f,
939                 const void *data, uint64_t size,
940                 Object **ret, uint64_t *offset) {
941
942         uint64_t hash;
943
944         assert(f);
945         assert(data || size == 0);
946
947         hash = hash64(data, size);
948
949         return journal_file_find_data_object_with_hash(f,
950                                                        data, size, hash,
951                                                        ret, offset);
952 }
953
954 static int journal_file_append_field(
955                 JournalFile *f,
956                 const void *field, uint64_t size,
957                 Object **ret, uint64_t *offset) {
958
959         uint64_t hash, p;
960         uint64_t osize;
961         Object *o;
962         int r;
963
964         assert(f);
965         assert(field && size > 0);
966
967         hash = hash64(field, size);
968
969         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
970         if (r < 0)
971                 return r;
972         else if (r > 0) {
973
974                 if (ret)
975                         *ret = o;
976
977                 if (offset)
978                         *offset = p;
979
980                 return 0;
981         }
982
983         osize = offsetof(Object, field.payload) + size;
984         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
985         if (r < 0)
986                 return r;
987
988         o->field.hash = htole64(hash);
989         memcpy(o->field.payload, field, size);
990
991         r = journal_file_link_field(f, o, p, hash);
992         if (r < 0)
993                 return r;
994
995         /* The linking might have altered the window, so let's
996          * refresh our pointer */
997         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
998         if (r < 0)
999                 return r;
1000
1001 #ifdef HAVE_GCRYPT
1002         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1003         if (r < 0)
1004                 return r;
1005 #endif
1006
1007         if (ret)
1008                 *ret = o;
1009
1010         if (offset)
1011                 *offset = p;
1012
1013         return 0;
1014 }
1015
1016 static int journal_file_append_data(
1017                 JournalFile *f,
1018                 const void *data, uint64_t size,
1019                 Object **ret, uint64_t *offset) {
1020
1021         uint64_t hash, p;
1022         uint64_t osize;
1023         Object *o;
1024         int r, compression = 0;
1025         const void *eq;
1026
1027         assert(f);
1028         assert(data || size == 0);
1029
1030         hash = hash64(data, size);
1031
1032         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1033         if (r < 0)
1034                 return r;
1035         else if (r > 0) {
1036
1037                 if (ret)
1038                         *ret = o;
1039
1040                 if (offset)
1041                         *offset = p;
1042
1043                 return 0;
1044         }
1045
1046         osize = offsetof(Object, data.payload) + size;
1047         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1048         if (r < 0)
1049                 return r;
1050
1051         o->data.hash = htole64(hash);
1052
1053 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1054         if (f->compress_xz &&
1055             size >= COMPRESSION_SIZE_THRESHOLD) {
1056                 size_t rsize;
1057
1058                 compression = compress_blob(data, size, o->data.payload, &rsize);
1059
1060                 if (compression) {
1061                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1062                         o->object.flags |= compression;
1063
1064                         log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1065                                   size, rsize, object_compressed_to_string(compression));
1066                 }
1067         }
1068 #endif
1069
1070         if (!compression && size > 0)
1071                 memcpy(o->data.payload, data, size);
1072
1073         r = journal_file_link_data(f, o, p, hash);
1074         if (r < 0)
1075                 return r;
1076
1077         /* The linking might have altered the window, so let's
1078          * refresh our pointer */
1079         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1080         if (r < 0)
1081                 return r;
1082
1083         if (!data)
1084                 eq = NULL;
1085         else
1086                 eq = memchr(data, '=', size);
1087         if (eq && eq > data) {
1088                 Object *fo = NULL;
1089                 uint64_t fp;
1090
1091                 /* Create field object ... */
1092                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1093                 if (r < 0)
1094                         return r;
1095
1096                 /* ... and link it in. */
1097                 o->data.next_field_offset = fo->field.head_data_offset;
1098                 fo->field.head_data_offset = le64toh(p);
1099         }
1100
1101 #ifdef HAVE_GCRYPT
1102         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1103         if (r < 0)
1104                 return r;
1105 #endif
1106
1107         if (ret)
1108                 *ret = o;
1109
1110         if (offset)
1111                 *offset = p;
1112
1113         return 0;
1114 }
1115
1116 uint64_t journal_file_entry_n_items(Object *o) {
1117         assert(o);
1118
1119         if (o->object.type != OBJECT_ENTRY)
1120                 return 0;
1121
1122         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1123 }
1124
1125 uint64_t journal_file_entry_array_n_items(Object *o) {
1126         assert(o);
1127
1128         if (o->object.type != OBJECT_ENTRY_ARRAY)
1129                 return 0;
1130
1131         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1132 }
1133
1134 uint64_t journal_file_hash_table_n_items(Object *o) {
1135         assert(o);
1136
1137         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1138             o->object.type != OBJECT_FIELD_HASH_TABLE)
1139                 return 0;
1140
1141         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1142 }
1143
1144 static int link_entry_into_array(JournalFile *f,
1145                                  le64_t *first,
1146                                  le64_t *idx,
1147                                  uint64_t p) {
1148         int r;
1149         uint64_t n = 0, ap = 0, q, i, a, hidx;
1150         Object *o;
1151
1152         assert(f);
1153         assert(first);
1154         assert(idx);
1155         assert(p > 0);
1156
1157         a = le64toh(*first);
1158         i = hidx = le64toh(*idx);
1159         while (a > 0) {
1160
1161                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1162                 if (r < 0)
1163                         return r;
1164
1165                 n = journal_file_entry_array_n_items(o);
1166                 if (i < n) {
1167                         o->entry_array.items[i] = htole64(p);
1168                         *idx = htole64(hidx + 1);
1169                         return 0;
1170                 }
1171
1172                 i -= n;
1173                 ap = a;
1174                 a = le64toh(o->entry_array.next_entry_array_offset);
1175         }
1176
1177         if (hidx > n)
1178                 n = (hidx+1) * 2;
1179         else
1180                 n = n * 2;
1181
1182         if (n < 4)
1183                 n = 4;
1184
1185         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1186                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1187                                        &o, &q);
1188         if (r < 0)
1189                 return r;
1190
1191 #ifdef HAVE_GCRYPT
1192         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1193         if (r < 0)
1194                 return r;
1195 #endif
1196
1197         o->entry_array.items[i] = htole64(p);
1198
1199         if (ap == 0)
1200                 *first = htole64(q);
1201         else {
1202                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1203                 if (r < 0)
1204                         return r;
1205
1206                 o->entry_array.next_entry_array_offset = htole64(q);
1207         }
1208
1209         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1210                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1211
1212         *idx = htole64(hidx + 1);
1213
1214         return 0;
1215 }
1216
1217 static int link_entry_into_array_plus_one(JournalFile *f,
1218                                           le64_t *extra,
1219                                           le64_t *first,
1220                                           le64_t *idx,
1221                                           uint64_t p) {
1222
1223         int r;
1224
1225         assert(f);
1226         assert(extra);
1227         assert(first);
1228         assert(idx);
1229         assert(p > 0);
1230
1231         if (*idx == 0)
1232                 *extra = htole64(p);
1233         else {
1234                 le64_t i;
1235
1236                 i = htole64(le64toh(*idx) - 1);
1237                 r = link_entry_into_array(f, first, &i, p);
1238                 if (r < 0)
1239                         return r;
1240         }
1241
1242         *idx = htole64(le64toh(*idx) + 1);
1243         return 0;
1244 }
1245
1246 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1247         uint64_t p;
1248         int r;
1249         assert(f);
1250         assert(o);
1251         assert(offset > 0);
1252
1253         p = le64toh(o->entry.items[i].object_offset);
1254         if (p == 0)
1255                 return -EINVAL;
1256
1257         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1258         if (r < 0)
1259                 return r;
1260
1261         return link_entry_into_array_plus_one(f,
1262                                               &o->data.entry_offset,
1263                                               &o->data.entry_array_offset,
1264                                               &o->data.n_entries,
1265                                               offset);
1266 }
1267
1268 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1269         uint64_t n, i;
1270         int r;
1271
1272         assert(f);
1273         assert(o);
1274         assert(offset > 0);
1275
1276         if (o->object.type != OBJECT_ENTRY)
1277                 return -EINVAL;
1278
1279         __sync_synchronize();
1280
1281         /* Link up the entry itself */
1282         r = link_entry_into_array(f,
1283                                   &f->header->entry_array_offset,
1284                                   &f->header->n_entries,
1285                                   offset);
1286         if (r < 0)
1287                 return r;
1288
1289         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1290
1291         if (f->header->head_entry_realtime == 0)
1292                 f->header->head_entry_realtime = o->entry.realtime;
1293
1294         f->header->tail_entry_realtime = o->entry.realtime;
1295         f->header->tail_entry_monotonic = o->entry.monotonic;
1296
1297         f->tail_entry_monotonic_valid = true;
1298
1299         /* Link up the items */
1300         n = journal_file_entry_n_items(o);
1301         for (i = 0; i < n; i++) {
1302                 r = journal_file_link_entry_item(f, o, offset, i);
1303                 if (r < 0)
1304                         return r;
1305         }
1306
1307         return 0;
1308 }
1309
1310 static int journal_file_append_entry_internal(
1311                 JournalFile *f,
1312                 const dual_timestamp *ts,
1313                 uint64_t xor_hash,
1314                 const EntryItem items[], unsigned n_items,
1315                 uint64_t *seqnum,
1316                 Object **ret, uint64_t *offset) {
1317         uint64_t np;
1318         uint64_t osize;
1319         Object *o;
1320         int r;
1321
1322         assert(f);
1323         assert(items || n_items == 0);
1324         assert(ts);
1325
1326         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1327
1328         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1329         if (r < 0)
1330                 return r;
1331
1332         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1333         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1334         o->entry.realtime = htole64(ts->realtime);
1335         o->entry.monotonic = htole64(ts->monotonic);
1336         o->entry.xor_hash = htole64(xor_hash);
1337         o->entry.boot_id = f->header->boot_id;
1338
1339 #ifdef HAVE_GCRYPT
1340         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1341         if (r < 0)
1342                 return r;
1343 #endif
1344
1345         r = journal_file_link_entry(f, o, np);
1346         if (r < 0)
1347                 return r;
1348
1349         if (ret)
1350                 *ret = o;
1351
1352         if (offset)
1353                 *offset = np;
1354
1355         return 0;
1356 }
1357
1358 void journal_file_post_change(JournalFile *f) {
1359         assert(f);
1360
1361         /* inotify() does not receive IN_MODIFY events from file
1362          * accesses done via mmap(). After each access we hence
1363          * trigger IN_MODIFY by truncating the journal file to its
1364          * current size which triggers IN_MODIFY. */
1365
1366         __sync_synchronize();
1367
1368         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1369                 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1370 }
1371
1372 static int entry_item_cmp(const void *_a, const void *_b) {
1373         const EntryItem *a = _a, *b = _b;
1374
1375         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1376                 return -1;
1377         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1378                 return 1;
1379         return 0;
1380 }
1381
1382 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1383         unsigned i;
1384         EntryItem *items;
1385         int r;
1386         uint64_t xor_hash = 0;
1387         struct dual_timestamp _ts;
1388
1389         assert(f);
1390         assert(iovec || n_iovec == 0);
1391
1392         if (!ts) {
1393                 dual_timestamp_get(&_ts);
1394                 ts = &_ts;
1395         }
1396
1397         if (f->tail_entry_monotonic_valid &&
1398             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1399                 return -EINVAL;
1400
1401 #ifdef HAVE_GCRYPT
1402         r = journal_file_maybe_append_tag(f, ts->realtime);
1403         if (r < 0)
1404                 return r;
1405 #endif
1406
1407         /* alloca() can't take 0, hence let's allocate at least one */
1408         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1409
1410         for (i = 0; i < n_iovec; i++) {
1411                 uint64_t p;
1412                 Object *o;
1413
1414                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1415                 if (r < 0)
1416                         return r;
1417
1418                 xor_hash ^= le64toh(o->data.hash);
1419                 items[i].object_offset = htole64(p);
1420                 items[i].hash = o->data.hash;
1421         }
1422
1423         /* Order by the position on disk, in order to improve seek
1424          * times for rotating media. */
1425         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1426
1427         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1428
1429         /* If the memory mapping triggered a SIGBUS then we return an
1430          * IO error and ignore the error code passed down to us, since
1431          * it is very likely just an effect of a nullified replacement
1432          * mapping page */
1433
1434         if (mmap_cache_got_sigbus(f->mmap, f->fd))
1435                 r = -EIO;
1436
1437         journal_file_post_change(f);
1438
1439         return r;
1440 }
1441
/* Remembers how far into a chain of entry arrays a previous lookup got,
 * so that later lookups on the same chain can skip ahead. Items are
 * kept in an OrderedHashmap keyed by the address of their 'first'
 * member, see chain_cache_put(). */
1442 typedef struct ChainCacheItem {
1443         uint64_t first; /* offset of the array at the beginning of the chain; also the hashmap key */
1444         uint64_t array; /* offset of the cached array */
1445         uint64_t begin; /* the first item stored in the cached array (host byte order) */
1446         uint64_t total; /* the total number of items in all arrays before this one in the chain */
1447         uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting; (uint64_t) -1 if none */
1448 } ChainCacheItem;
1449
1450 static void chain_cache_put(
1451                 OrderedHashmap *h,
1452                 ChainCacheItem *ci,
1453                 uint64_t first,
1454                 uint64_t array,
1455                 uint64_t begin,
1456                 uint64_t total,
1457                 uint64_t last_index) {
1458
1459         if (!ci) {
1460                 /* If the chain item to cache for this chain is the
1461                  * first one it's not worth caching anything */
1462                 if (array == first)
1463                         return;
1464
1465                 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1466                         ci = ordered_hashmap_steal_first(h);
1467                         assert(ci);
1468                 } else {
1469                         ci = new(ChainCacheItem, 1);
1470                         if (!ci)
1471                                 return;
1472                 }
1473
1474                 ci->first = first;
1475
1476                 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1477                         free(ci);
1478                         return;
1479                 }
1480         } else
1481                 assert(ci->first == first);
1482
1483         ci->array = array;
1484         ci->begin = begin;
1485         ci->total = total;
1486         ci->last_index = last_index;
1487 }
1488
1489 static int generic_array_get(
1490                 JournalFile *f,
1491                 uint64_t first,
1492                 uint64_t i,
1493                 Object **ret, uint64_t *offset) {
1494
1495         Object *o;
1496         uint64_t p = 0, a, t = 0;
1497         int r;
1498         ChainCacheItem *ci;
1499
1500         assert(f);
1501
1502         a = first;
1503
1504         /* Try the chain cache first */
1505         ci = ordered_hashmap_get(f->chain_cache, &first);
1506         if (ci && i > ci->total) {
1507                 a = ci->array;
1508                 i -= ci->total;
1509                 t = ci->total;
1510         }
1511
1512         while (a > 0) {
1513                 uint64_t k;
1514
1515                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1516                 if (r < 0)
1517                         return r;
1518
1519                 k = journal_file_entry_array_n_items(o);
1520                 if (i < k) {
1521                         p = le64toh(o->entry_array.items[i]);
1522                         goto found;
1523                 }
1524
1525                 i -= k;
1526                 t += k;
1527                 a = le64toh(o->entry_array.next_entry_array_offset);
1528         }
1529
1530         return 0;
1531
1532 found:
1533         /* Let's cache this item for the next invocation */
1534         chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1535
1536         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1537         if (r < 0)
1538                 return r;
1539
1540         if (ret)
1541                 *ret = o;
1542
1543         if (offset)
1544                 *offset = p;
1545
1546         return 1;
1547 }
1548
1549 static int generic_array_get_plus_one(
1550                 JournalFile *f,
1551                 uint64_t extra,
1552                 uint64_t first,
1553                 uint64_t i,
1554                 Object **ret, uint64_t *offset) {
1555
1556         Object *o;
1557
1558         assert(f);
1559
1560         if (i == 0) {
1561                 int r;
1562
1563                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1564                 if (r < 0)
1565                         return r;
1566
1567                 if (ret)
1568                         *ret = o;
1569
1570                 if (offset)
1571                         *offset = extra;
1572
1573                 return 1;
1574         }
1575
1576         return generic_array_get(f, first, i-1, ret, offset);
1577 }
1578
/* Verdicts returned by the test_object() callbacks handed to the
 * bisection functions; they describe where the tested object lies
 * relative to the needle. Negative return values signal errors. */
1579 enum {
1580         TEST_FOUND,     /* the tested object matches the needle */
1581         TEST_LEFT,      /* the tested object sorts before the needle */
1582         TEST_RIGHT      /* the tested object sorts after the needle */
1583 };
1584
/* Bisects the first 'n' elements of the chain of entry arrays starting
 * at offset 'first', using test_object() to compare the entry behind
 * each probed element against 'needle'.
 *
 * A TEST_FOUND verdict is folded into TEST_RIGHT for DIRECTION_DOWN and
 * into TEST_LEFT otherwise. The search therefore converges on the first
 * element whose verdict is TEST_RIGHT: with DIRECTION_DOWN this is the
 * first element that matches or lies beyond the needle, with
 * DIRECTION_UP the element just before the convergence point is
 * reported instead (see 'subtract_one').
 *
 * Returns 1 and fills in the optional 'ret' (entry object), 'offset'
 * (its file offset) and 'idx' (its index within the whole chain) on
 * success, 0 if no suitable element exists, -EBADMSG if a probed array
 * slot holds offset 0, or any negative error returned by test_object()
 * or journal_file_move_to_object(). */
1585 static int generic_array_bisect(
1586                 JournalFile *f,
1587                 uint64_t first,
1588                 uint64_t n,
1589                 uint64_t needle,
1590                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1591                 direction_t direction,
1592                 Object **ret,
1593                 uint64_t *offset,
1594                 uint64_t *idx) {
1595
             /* a: offset of the array being looked at; t: number of
              * elements in the arrays already skipped; i: index within
              * the current array; last_p: last element of the previous
              * array; last_index: cached hint, (uint64_t) -1 if none. */
1596         uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1597         bool subtract_one = false;
1598         Object *o, *array = NULL;
1599         int r;
1600         ChainCacheItem *ci;
1601
1602         assert(f);
1603         assert(test_object);
1604
1605         /* Start with the first array in the chain */
1606         a = first;
1607
1608         ci = ordered_hashmap_get(f->chain_cache, &first);
1609         if (ci && n > ci->total) {
1610                 /* Ah, we have iterated this bisection array chain
1611                  * previously! Let's see if we can skip ahead in the
1612                  * chain, as far as the last time. But we can't jump
1613                  * backwards in the chain, so let's check that
1614                  * first. */
1615
1616                 r = test_object(f, ci->begin, needle);
1617                 if (r < 0)
1618                         return r;
1619
1620                 if (r == TEST_LEFT) {
1621                         /* OK, what we are looking for is right of the
1622                          * begin of this EntryArray, so let's jump
1623                          * straight to previously cached array in the
1624                          * chain */
1625
1626                         a = ci->array;
1627                         n -= ci->total;
1628                         t = ci->total;
1629                         last_index = ci->last_index;
1630                 }
1631         }
1632
1633         while (a > 0) {
1634                 uint64_t left, right, k, lp;
1635
1636                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1637                 if (r < 0)
1638                         return r;
1639
1640                 k = journal_file_entry_array_n_items(array);
1641                 right = MIN(k, n);
1642                 if (right <= 0)
1643                         return 0;
1644
                     /* Probe the last element of this array that is
                      * still within the first n elements; its verdict
                      * tells us whether the answer lies in this array
                      * or in a later one. */
1645                 i = right - 1;
1646                 lp = p = le64toh(array->entry_array.items[i]);
1647                 if (p <= 0)
1648                         return -EBADMSG;
1649
1650                 r = test_object(f, p, needle);
1651                 if (r < 0)
1652                         return r;
1653
1654                 if (r == TEST_FOUND)
1655                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1656
1657                 if (r == TEST_RIGHT) {
                             /* The convergence point is in this array:
                              * bisect the range [left, right]. */
1658                         left = 0;
1659                         right -= 1;
1660
1661                         if (last_index != (uint64_t) -1) {
1662                                 assert(last_index <= right);
1663
1664                                 /* If we cached the last index we
1665                                  * looked at, let's try to not to jump
1666                                  * too wildly around and see if we can
1667                                  * limit the range to look at early to
1668                                  * the immediate neighbors of the last
1669                                  * index we looked at. */
1670
1671                                 if (last_index > 0) {
1672                                         uint64_t x = last_index - 1;
1673
1674                                         p = le64toh(array->entry_array.items[x]);
1675                                         if (p <= 0)
1676                                                 return -EBADMSG;
1677
1678                                         r = test_object(f, p, needle);
1679                                         if (r < 0)
1680                                                 return r;
1681
1682                                         if (r == TEST_FOUND)
1683                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1684
1685                                         if (r == TEST_RIGHT)
1686                                                 right = x;
1687                                         else
1688                                                 left = x + 1;
1689                                 }
1690
1691                                 if (last_index < right) {
1692                                         uint64_t y = last_index + 1;
1693
1694                                         p = le64toh(array->entry_array.items[y]);
1695                                         if (p <= 0)
1696                                                 return -EBADMSG;
1697
1698                                         r = test_object(f, p, needle);
1699                                         if (r < 0)
1700                                                 return r;
1701
1702                                         if (r == TEST_FOUND)
1703                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1704
1705                                         if (r == TEST_RIGHT)
1706                                                 right = y;
1707                                         else
1708                                                 left = y + 1;
1709                                 }
1710                         }
1711
1712                         for (;;) {
1713                                 if (left == right) {
                                             /* Converged. When searching
                                              * upwards the element before
                                              * this one is the answer. */
1714                                         if (direction == DIRECTION_UP)
1715                                                 subtract_one = true;
1716
1717                                         i = left;
1718                                         goto found;
1719                                 }
1720
1721                                 assert(left < right);
1722                                 i = (left + right) / 2;
1723
1724                                 p = le64toh(array->entry_array.items[i]);
1725                                 if (p <= 0)
1726                                         return -EBADMSG;
1727
1728                                 r = test_object(f, p, needle);
1729                                 if (r < 0)
1730                                         return r;
1731
1732                                 if (r == TEST_FOUND)
1733                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1734
1735                                 if (r == TEST_RIGHT)
1736                                         right = i;
1737                                 else
1738                                         left = i + 1;
1739                         }
1740                 }
1741
                     /* The probed element did not test TEST_RIGHT. If
                      * this array covers the remaining n elements there
                      * is nothing further to look at: searching upwards
                      * the last of the n elements is reported, searching
                      * downwards nothing is. */
1742                 if (k >= n) {
1743                         if (direction == DIRECTION_UP) {
1744                                 i = n;
1745                                 subtract_one = true;
1746                                 goto found;
1747                         }
1748
1749                         return 0;
1750                 }
1751
                     /* Remember this array's last element, in case the
                      * answer turns out to be the element right before
                      * the start of the next array. */
1752                 last_p = lp;
1753
1754                 n -= k;
1755                 t += k;
1756                 last_index = (uint64_t) -1;
1757                 a = le64toh(array->entry_array.next_entry_array_offset);
1758         }
1759
1760         return 0;
1761
1762 found:
             /* Stepping back from the very first element of the chain
              * leaves nothing to return. */
1763         if (subtract_one && t == 0 && i == 0)
1764                 return 0;
1765
1766         /* Let's cache this item for the next invocation */
1767         chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1768
1769         if (subtract_one && i == 0)
1770                 p = last_p;
1771         else if (subtract_one)
1772                 p = le64toh(array->entry_array.items[i-1]);
1773         else
1774                 p = le64toh(array->entry_array.items[i]);
1775
1776         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1777         if (r < 0)
1778                 return r;
1779
1780         if (ret)
1781                 *ret = o;
1782
1783         if (offset)
1784                 *offset = p;
1785
1786         if (idx)
1787                 *idx = t + i + (subtract_one ? -1 : 0);
1788
1789         return 1;
1790 }
1791
1792 static int generic_array_bisect_plus_one(
1793                 JournalFile *f,
1794                 uint64_t extra,
1795                 uint64_t first,
1796                 uint64_t n,
1797                 uint64_t needle,
1798                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1799                 direction_t direction,
1800                 Object **ret,
1801                 uint64_t *offset,
1802                 uint64_t *idx) {
1803
1804         int r;
1805         bool step_back = false;
1806         Object *o;
1807
1808         assert(f);
1809         assert(test_object);
1810
1811         if (n <= 0)
1812                 return 0;
1813
1814         /* This bisects the array in object 'first', but first checks
1815          * an extra  */
1816         r = test_object(f, extra, needle);
1817         if (r < 0)
1818                 return r;
1819
1820         if (r == TEST_FOUND)
1821                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1822
1823         /* if we are looking with DIRECTION_UP then we need to first
1824            see if in the actual array there is a matching entry, and
1825            return the last one of that. But if there isn't any we need
1826            to return this one. Hence remember this, and return it
1827            below. */
1828         if (r == TEST_LEFT)
1829                 step_back = direction == DIRECTION_UP;
1830
1831         if (r == TEST_RIGHT) {
1832                 if (direction == DIRECTION_DOWN)
1833                         goto found;
1834                 else
1835                         return 0;
1836         }
1837
1838         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1839
1840         if (r == 0 && step_back)
1841                 goto found;
1842
1843         if (r > 0 && idx)
1844                 (*idx) ++;
1845
1846         return r;
1847
1848 found:
1849         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1850         if (r < 0)
1851                 return r;
1852
1853         if (ret)
1854                 *ret = o;
1855
1856         if (offset)
1857                 *offset = extra;
1858
1859         if (idx)
1860                 *idx = 0;
1861
1862         return 1;
1863 }
1864
1865 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1866         assert(f);
1867         assert(p > 0);
1868
1869         if (p == needle)
1870                 return TEST_FOUND;
1871         else if (p < needle)
1872                 return TEST_LEFT;
1873         else
1874                 return TEST_RIGHT;
1875 }
1876
1877 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1878         Object *o;
1879         int r;
1880
1881         assert(f);
1882         assert(p > 0);
1883
1884         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1885         if (r < 0)
1886                 return r;
1887
1888         if (le64toh(o->entry.seqnum) == needle)
1889                 return TEST_FOUND;
1890         else if (le64toh(o->entry.seqnum) < needle)
1891                 return TEST_LEFT;
1892         else
1893                 return TEST_RIGHT;
1894 }
1895
1896 int journal_file_move_to_entry_by_seqnum(
1897                 JournalFile *f,
1898                 uint64_t seqnum,
1899                 direction_t direction,
1900                 Object **ret,
1901                 uint64_t *offset) {
1902
1903         return generic_array_bisect(f,
1904                                     le64toh(f->header->entry_array_offset),
1905                                     le64toh(f->header->n_entries),
1906                                     seqnum,
1907                                     test_object_seqnum,
1908                                     direction,
1909                                     ret, offset, NULL);
1910 }
1911
1912 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1913         Object *o;
1914         int r;
1915
1916         assert(f);
1917         assert(p > 0);
1918
1919         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1920         if (r < 0)
1921                 return r;
1922
1923         if (le64toh(o->entry.realtime) == needle)
1924                 return TEST_FOUND;
1925         else if (le64toh(o->entry.realtime) < needle)
1926                 return TEST_LEFT;
1927         else
1928                 return TEST_RIGHT;
1929 }
1930
1931 int journal_file_move_to_entry_by_realtime(
1932                 JournalFile *f,
1933                 uint64_t realtime,
1934                 direction_t direction,
1935                 Object **ret,
1936                 uint64_t *offset) {
1937
1938         return generic_array_bisect(f,
1939                                     le64toh(f->header->entry_array_offset),
1940                                     le64toh(f->header->n_entries),
1941                                     realtime,
1942                                     test_object_realtime,
1943                                     direction,
1944                                     ret, offset, NULL);
1945 }
1946
1947 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1948         Object *o;
1949         int r;
1950
1951         assert(f);
1952         assert(p > 0);
1953
1954         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1955         if (r < 0)
1956                 return r;
1957
1958         if (le64toh(o->entry.monotonic) == needle)
1959                 return TEST_FOUND;
1960         else if (le64toh(o->entry.monotonic) < needle)
1961                 return TEST_LEFT;
1962         else
1963                 return TEST_RIGHT;
1964 }
1965
1966 static inline int find_data_object_by_boot_id(
1967                 JournalFile *f,
1968                 sd_id128_t boot_id,
1969                 Object **o,
1970                 uint64_t *b) {
1971         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1972
1973         sd_id128_to_string(boot_id, t + 9);
1974         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1975 }
1976
1977 int journal_file_move_to_entry_by_monotonic(
1978                 JournalFile *f,
1979                 sd_id128_t boot_id,
1980                 uint64_t monotonic,
1981                 direction_t direction,
1982                 Object **ret,
1983                 uint64_t *offset) {
1984
1985         Object *o;
1986         int r;
1987
1988         assert(f);
1989
1990         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1991         if (r < 0)
1992                 return r;
1993         if (r == 0)
1994                 return -ENOENT;
1995
1996         return generic_array_bisect_plus_one(f,
1997                                              le64toh(o->data.entry_offset),
1998                                              le64toh(o->data.entry_array_offset),
1999                                              le64toh(o->data.n_entries),
2000                                              monotonic,
2001                                              test_object_monotonic,
2002                                              direction,
2003                                              ret, offset, NULL);
2004 }
2005
2006 void journal_file_reset_location(JournalFile *f) {
2007         f->location_type = LOCATION_HEAD;
2008         f->current_offset = 0;
2009         f->current_seqnum = 0;
2010         f->current_realtime = 0;
2011         f->current_monotonic = 0;
2012         zero(f->current_boot_id);
2013         f->current_xor_hash = 0;
2014 }
2015
2016 void journal_file_save_location(JournalFile *f, direction_t direction, Object *o, uint64_t offset) {
2017         f->last_direction = direction;
2018         f->location_type = LOCATION_SEEK;
2019         f->current_offset = offset;
2020         f->current_seqnum = le64toh(o->entry.seqnum);
2021         f->current_realtime = le64toh(o->entry.realtime);
2022         f->current_monotonic = le64toh(o->entry.monotonic);
2023         f->current_boot_id = o->entry.boot_id;
2024         f->current_xor_hash = le64toh(o->entry.xor_hash);
2025 }
2026
2027 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2028         assert(af);
2029         assert(bf);
2030         assert(af->location_type == LOCATION_SEEK);
2031         assert(bf->location_type == LOCATION_SEEK);
2032
2033         /* If contents and timestamps match, these entries are
2034          * identical, even if the seqnum does not match */
2035         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2036             af->current_monotonic == bf->current_monotonic &&
2037             af->current_realtime == bf->current_realtime &&
2038             af->current_xor_hash == bf->current_xor_hash)
2039                 return 0;
2040
2041         if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2042
2043                 /* If this is from the same seqnum source, compare
2044                  * seqnums */
2045                 if (af->current_seqnum < bf->current_seqnum)
2046                         return -1;
2047                 if (af->current_seqnum > bf->current_seqnum)
2048                         return 1;
2049
2050                 /* Wow! This is weird, different data but the same
2051                  * seqnums? Something is borked, but let's make the
2052                  * best of it and compare by time. */
2053         }
2054
2055         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2056
2057                 /* If the boot id matches, compare monotonic time */
2058                 if (af->current_monotonic < bf->current_monotonic)
2059                         return -1;
2060                 if (af->current_monotonic > bf->current_monotonic)
2061                         return 1;
2062         }
2063
2064         /* Otherwise, compare UTC time */
2065         if (af->current_realtime < bf->current_realtime)
2066                 return -1;
2067         if (af->current_realtime > bf->current_realtime)
2068                 return 1;
2069
2070         /* Finally, compare by contents */
2071         if (af->current_xor_hash < bf->current_xor_hash)
2072                 return -1;
2073         if (af->current_xor_hash > bf->current_xor_hash)
2074                 return 1;
2075
2076         return 0;
2077 }
2078
2079 int journal_file_next_entry(
2080                 JournalFile *f,
2081                 uint64_t p,
2082                 direction_t direction,
2083                 Object **ret, uint64_t *offset) {
2084
2085         uint64_t i, n, ofs;
2086         int r;
2087
2088         assert(f);
2089
2090         n = le64toh(f->header->n_entries);
2091         if (n <= 0)
2092                 return 0;
2093
2094         if (p == 0)
2095                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2096         else {
2097                 r = generic_array_bisect(f,
2098                                          le64toh(f->header->entry_array_offset),
2099                                          le64toh(f->header->n_entries),
2100                                          p,
2101                                          test_object_offset,
2102                                          DIRECTION_DOWN,
2103                                          NULL, NULL,
2104                                          &i);
2105                 if (r <= 0)
2106                         return r;
2107
2108                 if (direction == DIRECTION_DOWN) {
2109                         if (i >= n - 1)
2110                                 return 0;
2111
2112                         i++;
2113                 } else {
2114                         if (i <= 0)
2115                                 return 0;
2116
2117                         i--;
2118                 }
2119         }
2120
2121         /* And jump to it */
2122         r = generic_array_get(f,
2123                               le64toh(f->header->entry_array_offset),
2124                               i,
2125                               ret, &ofs);
2126         if (r <= 0)
2127                 return r;
2128
2129         if (p > 0 &&
2130             (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2131                 log_debug("%s: entry array corrupted at entry %"PRIu64,
2132                           f->path, i);
2133                 return -EBADMSG;
2134         }
2135
2136         if (offset)
2137                 *offset = ofs;
2138
2139         return 1;
2140 }
2141
2142 int journal_file_next_entry_for_data(
2143                 JournalFile *f,
2144                 Object *o, uint64_t p,
2145                 uint64_t data_offset,
2146                 direction_t direction,
2147                 Object **ret, uint64_t *offset) {
2148
2149         uint64_t n, i;
2150         int r;
2151         Object *d;
2152
2153         assert(f);
2154         assert(p > 0 || !o);
2155
2156         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2157         if (r < 0)
2158                 return r;
2159
2160         n = le64toh(d->data.n_entries);
2161         if (n <= 0)
2162                 return n;
2163
2164         if (!o)
2165                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2166         else {
2167                 if (o->object.type != OBJECT_ENTRY)
2168                         return -EINVAL;
2169
2170                 r = generic_array_bisect_plus_one(f,
2171                                                   le64toh(d->data.entry_offset),
2172                                                   le64toh(d->data.entry_array_offset),
2173                                                   le64toh(d->data.n_entries),
2174                                                   p,
2175                                                   test_object_offset,
2176                                                   DIRECTION_DOWN,
2177                                                   NULL, NULL,
2178                                                   &i);
2179
2180                 if (r <= 0)
2181                         return r;
2182
2183                 if (direction == DIRECTION_DOWN) {
2184                         if (i >= n - 1)
2185                                 return 0;
2186
2187                         i++;
2188                 } else {
2189                         if (i <= 0)
2190                                 return 0;
2191
2192                         i--;
2193                 }
2194
2195         }
2196
2197         return generic_array_get_plus_one(f,
2198                                           le64toh(d->data.entry_offset),
2199                                           le64toh(d->data.entry_array_offset),
2200                                           i,
2201                                           ret, offset);
2202 }
2203
2204 int journal_file_move_to_entry_by_offset_for_data(
2205                 JournalFile *f,
2206                 uint64_t data_offset,
2207                 uint64_t p,
2208                 direction_t direction,
2209                 Object **ret, uint64_t *offset) {
2210
2211         int r;
2212         Object *d;
2213
2214         assert(f);
2215
2216         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2217         if (r < 0)
2218                 return r;
2219
2220         return generic_array_bisect_plus_one(f,
2221                                              le64toh(d->data.entry_offset),
2222                                              le64toh(d->data.entry_array_offset),
2223                                              le64toh(d->data.n_entries),
2224                                              p,
2225                                              test_object_offset,
2226                                              direction,
2227                                              ret, offset, NULL);
2228 }
2229
2230 int journal_file_move_to_entry_by_monotonic_for_data(
2231                 JournalFile *f,
2232                 uint64_t data_offset,
2233                 sd_id128_t boot_id,
2234                 uint64_t monotonic,
2235                 direction_t direction,
2236                 Object **ret, uint64_t *offset) {
2237
2238         Object *o, *d;
2239         int r;
2240         uint64_t b, z;
2241
2242         assert(f);
2243
2244         /* First, seek by time */
2245         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2246         if (r < 0)
2247                 return r;
2248         if (r == 0)
2249                 return -ENOENT;
2250
2251         r = generic_array_bisect_plus_one(f,
2252                                           le64toh(o->data.entry_offset),
2253                                           le64toh(o->data.entry_array_offset),
2254                                           le64toh(o->data.n_entries),
2255                                           monotonic,
2256                                           test_object_monotonic,
2257                                           direction,
2258                                           NULL, &z, NULL);
2259         if (r <= 0)
2260                 return r;
2261
2262         /* And now, continue seeking until we find an entry that
2263          * exists in both bisection arrays */
2264
2265         for (;;) {
2266                 Object *qo;
2267                 uint64_t p, q;
2268
2269                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2270                 if (r < 0)
2271                         return r;
2272
2273                 r = generic_array_bisect_plus_one(f,
2274                                                   le64toh(d->data.entry_offset),
2275                                                   le64toh(d->data.entry_array_offset),
2276                                                   le64toh(d->data.n_entries),
2277                                                   z,
2278                                                   test_object_offset,
2279                                                   direction,
2280                                                   NULL, &p, NULL);
2281                 if (r <= 0)
2282                         return r;
2283
2284                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2285                 if (r < 0)
2286                         return r;
2287
2288                 r = generic_array_bisect_plus_one(f,
2289                                                   le64toh(o->data.entry_offset),
2290                                                   le64toh(o->data.entry_array_offset),
2291                                                   le64toh(o->data.n_entries),
2292                                                   p,
2293                                                   test_object_offset,
2294                                                   direction,
2295                                                   &qo, &q, NULL);
2296
2297                 if (r <= 0)
2298                         return r;
2299
2300                 if (p == q) {
2301                         if (ret)
2302                                 *ret = qo;
2303                         if (offset)
2304                                 *offset = q;
2305
2306                         return 1;
2307                 }
2308
2309                 z = q;
2310         }
2311 }
2312
2313 int journal_file_move_to_entry_by_seqnum_for_data(
2314                 JournalFile *f,
2315                 uint64_t data_offset,
2316                 uint64_t seqnum,
2317                 direction_t direction,
2318                 Object **ret, uint64_t *offset) {
2319
2320         Object *d;
2321         int r;
2322
2323         assert(f);
2324
2325         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2326         if (r < 0)
2327                 return r;
2328
2329         return generic_array_bisect_plus_one(f,
2330                                              le64toh(d->data.entry_offset),
2331                                              le64toh(d->data.entry_array_offset),
2332                                              le64toh(d->data.n_entries),
2333                                              seqnum,
2334                                              test_object_seqnum,
2335                                              direction,
2336                                              ret, offset, NULL);
2337 }
2338
2339 int journal_file_move_to_entry_by_realtime_for_data(
2340                 JournalFile *f,
2341                 uint64_t data_offset,
2342                 uint64_t realtime,
2343                 direction_t direction,
2344                 Object **ret, uint64_t *offset) {
2345
2346         Object *d;
2347         int r;
2348
2349         assert(f);
2350
2351         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2352         if (r < 0)
2353                 return r;
2354
2355         return generic_array_bisect_plus_one(f,
2356                                              le64toh(d->data.entry_offset),
2357                                              le64toh(d->data.entry_array_offset),
2358                                              le64toh(d->data.n_entries),
2359                                              realtime,
2360                                              test_object_realtime,
2361                                              direction,
2362                                              ret, offset, NULL);
2363 }
2364
2365 void journal_file_dump(JournalFile *f) {
2366         Object *o;
2367         int r;
2368         uint64_t p;
2369
2370         assert(f);
2371
2372         journal_file_print_header(f);
2373
2374         p = le64toh(f->header->header_size);
2375         while (p != 0) {
2376                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2377                 if (r < 0)
2378                         goto fail;
2379
2380                 switch (o->object.type) {
2381
2382                 case OBJECT_UNUSED:
2383                         printf("Type: OBJECT_UNUSED\n");
2384                         break;
2385
2386                 case OBJECT_DATA:
2387                         printf("Type: OBJECT_DATA\n");
2388                         break;
2389
2390                 case OBJECT_FIELD:
2391                         printf("Type: OBJECT_FIELD\n");
2392                         break;
2393
2394                 case OBJECT_ENTRY:
2395                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2396                                le64toh(o->entry.seqnum),
2397                                le64toh(o->entry.monotonic),
2398                                le64toh(o->entry.realtime));
2399                         break;
2400
2401                 case OBJECT_FIELD_HASH_TABLE:
2402                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2403                         break;
2404
2405                 case OBJECT_DATA_HASH_TABLE:
2406                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2407                         break;
2408
2409                 case OBJECT_ENTRY_ARRAY:
2410                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2411                         break;
2412
2413                 case OBJECT_TAG:
2414                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2415                                le64toh(o->tag.seqnum),
2416                                le64toh(o->tag.epoch));
2417                         break;
2418
2419                 default:
2420                         printf("Type: unknown (%i)\n", o->object.type);
2421                         break;
2422                 }
2423
2424                 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2425                         printf("Flags: %s\n",
2426                                object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2427
2428                 if (p == le64toh(f->header->tail_object_offset))
2429                         p = 0;
2430                 else
2431                         p = p + ALIGN64(le64toh(o->object.size));
2432         }
2433
2434         return;
2435 fail:
2436         log_error("File corrupt");
2437 }
2438
2439 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2440         const char *x;
2441
2442         x = format_timestamp(buf, l, t);
2443         if (x)
2444                 return x;
2445         return " --- ";
2446 }
2447
2448 void journal_file_print_header(JournalFile *f) {
2449         char a[33], b[33], c[33], d[33];
2450         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2451         struct stat st;
2452         char bytes[FORMAT_BYTES_MAX];
2453
2454         assert(f);
2455
2456         printf("File Path: %s\n"
2457                "File ID: %s\n"
2458                "Machine ID: %s\n"
2459                "Boot ID: %s\n"
2460                "Sequential Number ID: %s\n"
2461                "State: %s\n"
2462                "Compatible Flags:%s%s\n"
2463                "Incompatible Flags:%s%s%s\n"
2464                "Header size: %"PRIu64"\n"
2465                "Arena size: %"PRIu64"\n"
2466                "Data Hash Table Size: %"PRIu64"\n"
2467                "Field Hash Table Size: %"PRIu64"\n"
2468                "Rotate Suggested: %s\n"
2469                "Head Sequential Number: %"PRIu64"\n"
2470                "Tail Sequential Number: %"PRIu64"\n"
2471                "Head Realtime Timestamp: %s\n"
2472                "Tail Realtime Timestamp: %s\n"
2473                "Tail Monotonic Timestamp: %s\n"
2474                "Objects: %"PRIu64"\n"
2475                "Entry Objects: %"PRIu64"\n",
2476                f->path,
2477                sd_id128_to_string(f->header->file_id, a),
2478                sd_id128_to_string(f->header->machine_id, b),
2479                sd_id128_to_string(f->header->boot_id, c),
2480                sd_id128_to_string(f->header->seqnum_id, d),
2481                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2482                f->header->state == STATE_ONLINE ? "ONLINE" :
2483                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2484                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2485                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2486                JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2487                JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2488                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2489                le64toh(f->header->header_size),
2490                le64toh(f->header->arena_size),
2491                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2492                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2493                yes_no(journal_file_rotate_suggested(f, 0)),
2494                le64toh(f->header->head_entry_seqnum),
2495                le64toh(f->header->tail_entry_seqnum),
2496                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2497                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2498                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2499                le64toh(f->header->n_objects),
2500                le64toh(f->header->n_entries));
2501
2502         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2503                 printf("Data Objects: %"PRIu64"\n"
2504                        "Data Hash Table Fill: %.1f%%\n",
2505                        le64toh(f->header->n_data),
2506                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2507
2508         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2509                 printf("Field Objects: %"PRIu64"\n"
2510                        "Field Hash Table Fill: %.1f%%\n",
2511                        le64toh(f->header->n_fields),
2512                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2513
2514         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2515                 printf("Tag Objects: %"PRIu64"\n",
2516                        le64toh(f->header->n_tags));
2517         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2518                 printf("Entry Array Objects: %"PRIu64"\n",
2519                        le64toh(f->header->n_entry_arrays));
2520
2521         if (fstat(f->fd, &st) >= 0)
2522                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2523 }
2524
2525 int journal_file_open(
2526                 const char *fname,
2527                 int flags,
2528                 mode_t mode,
2529                 bool compress,
2530                 bool seal,
2531                 JournalMetrics *metrics,
2532                 MMapCache *mmap_cache,
2533                 JournalFile *template,
2534                 JournalFile **ret) {
2535
2536         bool newly_created = false;
2537         JournalFile *f;
2538         void *h;
2539         int r;
2540
2541         assert(fname);
2542         assert(ret);
2543
2544         if ((flags & O_ACCMODE) != O_RDONLY &&
2545             (flags & O_ACCMODE) != O_RDWR)
2546                 return -EINVAL;
2547
2548         if (!endswith(fname, ".journal") &&
2549             !endswith(fname, ".journal~"))
2550                 return -EINVAL;
2551
2552         f = new0(JournalFile, 1);
2553         if (!f)
2554                 return -ENOMEM;
2555
2556         f->fd = -1;
2557         f->mode = mode;
2558
2559         f->flags = flags;
2560         f->prot = prot_from_flags(flags);
2561         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2562 #if defined(HAVE_LZ4)
2563         f->compress_lz4 = compress;
2564 #elif defined(HAVE_XZ)
2565         f->compress_xz = compress;
2566 #endif
2567 #ifdef HAVE_GCRYPT
2568         f->seal = seal;
2569 #endif
2570
2571         if (mmap_cache)
2572                 f->mmap = mmap_cache_ref(mmap_cache);
2573         else {
2574                 f->mmap = mmap_cache_new();
2575                 if (!f->mmap) {
2576                         r = -ENOMEM;
2577                         goto fail;
2578                 }
2579         }
2580
2581         f->path = strdup(fname);
2582         if (!f->path) {
2583                 r = -ENOMEM;
2584                 goto fail;
2585         }
2586
2587         f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2588         if (!f->chain_cache) {
2589                 r = -ENOMEM;
2590                 goto fail;
2591         }
2592
2593         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2594         if (f->fd < 0) {
2595                 r = -errno;
2596                 goto fail;
2597         }
2598
2599         r = journal_file_fstat(f);
2600         if (r < 0)
2601                 goto fail;
2602
2603         if (f->last_stat.st_size == 0 && f->writable) {
2604
2605                 /* Before we write anything, turn off COW logic. Given
2606                  * our write pattern that is quite unfriendly to COW
2607                  * file systems this should greatly improve
2608                  * performance on COW file systems, such as btrfs, at
2609                  * the expense of data integrity features (which
2610                  * shouldn't be too bad, given that we do our own
2611                  * checksumming). */
2612                 r = chattr_fd(f->fd, true, FS_NOCOW_FL);
2613                 if (r < 0)
2614                         log_warning_errno(errno, "Failed to set file attributes: %m");
2615
2616                 /* Let's attach the creation time to the journal file,
2617                  * so that the vacuuming code knows the age of this
2618                  * file even if the file might end up corrupted one
2619                  * day... Ideally we'd just use the creation time many
2620                  * file systems maintain for each file, but there is
2621                  * currently no usable API to query this, hence let's
2622                  * emulate this via extended attributes. If extended
2623                  * attributes are not supported we'll just skip this,
2624                  * and rely solely on mtime/atime/ctime of the file. */
2625
2626                 fd_setcrtime(f->fd, 0);
2627
2628 #ifdef HAVE_GCRYPT
2629                 /* Try to load the FSPRG state, and if we can't, then
2630                  * just don't do sealing */
2631                 if (f->seal) {
2632                         r = journal_file_fss_load(f);
2633                         if (r < 0)
2634                                 f->seal = false;
2635                 }
2636 #endif
2637
2638                 r = journal_file_init_header(f, template);
2639                 if (r < 0)
2640                         goto fail;
2641
2642                 r = journal_file_fstat(f);
2643                 if (r < 0)
2644                         goto fail;
2645
2646                 newly_created = true;
2647         }
2648
2649         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2650                 r = -EIO;
2651                 goto fail;
2652         }
2653
2654         r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2655         if (r < 0) {
2656                 r = -errno;
2657                 goto fail;
2658         }
2659
2660         f->header = h;
2661
2662         if (!newly_created) {
2663                 r = journal_file_verify_header(f);
2664                 if (r < 0)
2665                         goto fail;
2666         }
2667
2668 #ifdef HAVE_GCRYPT
2669         if (!newly_created && f->writable) {
2670                 r = journal_file_fss_load(f);
2671                 if (r < 0)
2672                         goto fail;
2673         }
2674 #endif
2675
2676         if (f->writable) {
2677                 if (metrics) {
2678                         journal_default_metrics(metrics, f->fd);
2679                         f->metrics = *metrics;
2680                 } else if (template)
2681                         f->metrics = template->metrics;
2682
2683                 r = journal_file_refresh_header(f);
2684                 if (r < 0)
2685                         goto fail;
2686         }
2687
2688 #ifdef HAVE_GCRYPT
2689         r = journal_file_hmac_setup(f);
2690         if (r < 0)
2691                 goto fail;
2692 #endif
2693
2694         if (newly_created) {
2695                 r = journal_file_setup_field_hash_table(f);
2696                 if (r < 0)
2697                         goto fail;
2698
2699                 r = journal_file_setup_data_hash_table(f);
2700                 if (r < 0)
2701                         goto fail;
2702
2703 #ifdef HAVE_GCRYPT
2704                 r = journal_file_append_first_tag(f);
2705                 if (r < 0)
2706                         goto fail;
2707 #endif
2708         }
2709
2710         r = journal_file_map_field_hash_table(f);
2711         if (r < 0)
2712                 goto fail;
2713
2714         r = journal_file_map_data_hash_table(f);
2715         if (r < 0)
2716                 goto fail;
2717
2718         if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2719                 r = -EIO;
2720                 goto fail;
2721         }
2722
2723         *ret = f;
2724         return 0;
2725
2726 fail:
2727         if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2728                 r = -EIO;
2729
2730         journal_file_close(f);
2731
2732         return r;
2733 }
2734
2735 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2736         _cleanup_free_ char *p = NULL;
2737         size_t l;
2738         JournalFile *old_file, *new_file = NULL;
2739         int r;
2740
2741         assert(f);
2742         assert(*f);
2743
2744         old_file = *f;
2745
2746         if (!old_file->writable)
2747                 return -EINVAL;
2748
2749         if (!endswith(old_file->path, ".journal"))
2750                 return -EINVAL;
2751
2752         l = strlen(old_file->path);
2753         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2754                      (int) l - 8, old_file->path,
2755                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2756                      le64toh((*f)->header->head_entry_seqnum),
2757                      le64toh((*f)->header->head_entry_realtime));
2758         if (r < 0)
2759                 return -ENOMEM;
2760
2761         /* Try to rename the file to the archived version. If the file
2762          * already was deleted, we'll get ENOENT, let's ignore that
2763          * case. */
2764         r = rename(old_file->path, p);
2765         if (r < 0 && errno != ENOENT)
2766                 return -errno;
2767
2768         old_file->header->state = STATE_ARCHIVED;
2769
2770         /* Currently, btrfs is not very good with out write patterns
2771          * and fragments heavily. Let's defrag our journal files when
2772          * we archive them */
2773         old_file->defrag_on_close = true;
2774
2775         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2776         journal_file_close(old_file);
2777
2778         *f = new_file;
2779         return r;
2780 }
2781
2782 int journal_file_open_reliably(
2783                 const char *fname,
2784                 int flags,
2785                 mode_t mode,
2786                 bool compress,
2787                 bool seal,
2788                 JournalMetrics *metrics,
2789                 MMapCache *mmap_cache,
2790                 JournalFile *template,
2791                 JournalFile **ret) {
2792
2793         int r;
2794         size_t l;
2795         _cleanup_free_ char *p = NULL;
2796
2797         r = journal_file_open(fname, flags, mode, compress, seal,
2798                               metrics, mmap_cache, template, ret);
2799         if (r != -EBADMSG && /* corrupted */
2800             r != -ENODATA && /* truncated */
2801             r != -EHOSTDOWN && /* other machine */
2802             r != -EPROTONOSUPPORT && /* incompatible feature */
2803             r != -EBUSY && /* unclean shutdown */
2804             r != -ESHUTDOWN && /* already archived */
2805             r != -EIO && /* IO error, including SIGBUS on mmap */
2806             r != -EIDRM /* File has been deleted */)
2807                 return r;
2808
2809         if ((flags & O_ACCMODE) == O_RDONLY)
2810                 return r;
2811
2812         if (!(flags & O_CREAT))
2813                 return r;
2814
2815         if (!endswith(fname, ".journal"))
2816                 return r;
2817
2818         /* The file is corrupted. Rotate it away and try it again (but only once) */
2819
2820         l = strlen(fname);
2821         if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2822                      (int) l - 8, fname,
2823                      (unsigned long long) now(CLOCK_REALTIME),
2824                      random_u64()) < 0)
2825                 return -ENOMEM;
2826
2827         r = rename(fname, p);
2828         if (r < 0)
2829                 return -errno;
2830
2831         /* btrfs doesn't cope well with our write pattern and
2832          * fragments heavily. Let's defrag all files we rotate */
2833
2834         (void) chattr_path(p, false, FS_NOCOW_FL);
2835         (void) btrfs_defrag(p);
2836
2837         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2838
2839         return journal_file_open(fname, flags, mode, compress, seal,
2840                                  metrics, mmap_cache, template, ret);
2841 }
2842
2843 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2844         uint64_t i, n;
2845         uint64_t q, xor_hash = 0;
2846         int r;
2847         EntryItem *items;
2848         dual_timestamp ts;
2849
2850         assert(from);
2851         assert(to);
2852         assert(o);
2853         assert(p);
2854
2855         if (!to->writable)
2856                 return -EPERM;
2857
2858         ts.monotonic = le64toh(o->entry.monotonic);
2859         ts.realtime = le64toh(o->entry.realtime);
2860
2861         n = journal_file_entry_n_items(o);
2862         /* alloca() can't take 0, hence let's allocate at least one */
2863         items = alloca(sizeof(EntryItem) * MAX(1u, n));
2864
2865         for (i = 0; i < n; i++) {
2866                 uint64_t l, h;
2867                 le64_t le_hash;
2868                 size_t t;
2869                 void *data;
2870                 Object *u;
2871
2872                 q = le64toh(o->entry.items[i].object_offset);
2873                 le_hash = o->entry.items[i].hash;
2874
2875                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2876                 if (r < 0)
2877                         return r;
2878
2879                 if (le_hash != o->data.hash)
2880                         return -EBADMSG;
2881
2882                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2883                 t = (size_t) l;
2884
2885                 /* We hit the limit on 32bit machines */
2886                 if ((uint64_t) t != l)
2887                         return -E2BIG;
2888
2889                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2890 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2891                         size_t rsize;
2892
2893                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2894                                             o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2895                         if (r < 0)
2896                                 return r;
2897
2898                         data = from->compress_buffer;
2899                         l = rsize;
2900 #else
2901                         return -EPROTONOSUPPORT;
2902 #endif
2903                 } else
2904                         data = o->data.payload;
2905
2906                 r = journal_file_append_data(to, data, l, &u, &h);
2907                 if (r < 0)
2908                         return r;
2909
2910                 xor_hash ^= le64toh(u->data.hash);
2911                 items[i].object_offset = htole64(h);
2912                 items[i].hash = u->data.hash;
2913
2914                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2915                 if (r < 0)
2916                         return r;
2917         }
2918
2919         r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2920
2921         if (mmap_cache_got_sigbus(to->mmap, to->fd))
2922                 return -EIO;
2923
2924         return r;
2925 }
2926
2927 void journal_default_metrics(JournalMetrics *m, int fd) {
2928         uint64_t fs_size = 0;
2929         struct statvfs ss;
2930         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2931
2932         assert(m);
2933         assert(fd >= 0);
2934
2935         if (fstatvfs(fd, &ss) >= 0)
2936                 fs_size = ss.f_frsize * ss.f_blocks;
2937
2938         if (m->max_use == (uint64_t) -1) {
2939
2940                 if (fs_size > 0) {
2941                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2942
2943                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2944                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2945
2946                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2947                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2948                 } else
2949                         m->max_use = DEFAULT_MAX_USE_LOWER;
2950         } else {
2951                 m->max_use = PAGE_ALIGN(m->max_use);
2952
2953                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2954                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2955         }
2956
2957         if (m->max_size == (uint64_t) -1) {
2958                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2959
2960                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2961                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2962         } else
2963                 m->max_size = PAGE_ALIGN(m->max_size);
2964
2965         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2966                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2967
2968         if (m->max_size*2 > m->max_use)
2969                 m->max_use = m->max_size*2;
2970
2971         if (m->min_size == (uint64_t) -1)
2972                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2973         else {
2974                 m->min_size = PAGE_ALIGN(m->min_size);
2975
2976                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2977                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2978
2979                 if (m->min_size > m->max_size)
2980                         m->max_size = m->min_size;
2981         }
2982
2983         if (m->keep_free == (uint64_t) -1) {
2984
2985                 if (fs_size > 0) {
2986                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2987
2988                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2989                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2990
2991                 } else
2992                         m->keep_free = DEFAULT_KEEP_FREE;
2993         }
2994
2995         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2996                   format_bytes(a, sizeof(a), m->max_use),
2997                   format_bytes(b, sizeof(b), m->max_size),
2998                   format_bytes(c, sizeof(c), m->min_size),
2999                   format_bytes(d, sizeof(d), m->keep_free));
3000 }
3001
3002 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3003         assert(f);
3004         assert(from || to);
3005
3006         if (from) {
3007                 if (f->header->head_entry_realtime == 0)
3008                         return -ENOENT;
3009
3010                 *from = le64toh(f->header->head_entry_realtime);
3011         }
3012
3013         if (to) {
3014                 if (f->header->tail_entry_realtime == 0)
3015                         return -ENOENT;
3016
3017                 *to = le64toh(f->header->tail_entry_realtime);
3018         }
3019
3020         return 1;
3021 }
3022
3023 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3024         Object *o;
3025         uint64_t p;
3026         int r;
3027
3028         assert(f);
3029         assert(from || to);
3030
3031         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3032         if (r <= 0)
3033                 return r;
3034
3035         if (le64toh(o->data.n_entries) <= 0)
3036                 return 0;
3037
3038         if (from) {
3039                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3040                 if (r < 0)
3041                         return r;
3042
3043                 *from = le64toh(o->entry.monotonic);
3044         }
3045
3046         if (to) {
3047                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3048                 if (r < 0)
3049                         return r;
3050
3051                 r = generic_array_get_plus_one(f,
3052                                                le64toh(o->data.entry_offset),
3053                                                le64toh(o->data.entry_array_offset),
3054                                                le64toh(o->data.n_entries)-1,
3055                                                &o, NULL);
3056                 if (r <= 0)
3057                         return r;
3058
3059                 *to = le64toh(o->entry.monotonic);
3060         }
3061
3062         return 1;
3063 }
3064
3065 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3066         assert(f);
3067
3068         /* If we gained new header fields we gained new features,
3069          * hence suggest a rotation */
3070         if (le64toh(f->header->header_size) < sizeof(Header)) {
3071                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3072                 return true;
3073         }
3074
3075         /* Let's check if the hash tables grew over a certain fill
3076          * level (75%, borrowing this value from Java's hash table
3077          * implementation), and if so suggest a rotation. To calculate
3078          * the fill level we need the n_data field, which only exists
3079          * in newer versions. */
3080
3081         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3082                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3083                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3084                                   f->path,
3085                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3086                                   le64toh(f->header->n_data),
3087                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3088                                   (unsigned long long) f->last_stat.st_size,
3089                                   f->last_stat.st_size / le64toh(f->header->n_data));
3090                         return true;
3091                 }
3092
3093         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3094                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3095                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3096                                   f->path,
3097                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3098                                   le64toh(f->header->n_fields),
3099                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3100                         return true;
3101                 }
3102
3103         /* Are the data objects properly indexed by field objects? */
3104         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3105             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3106             le64toh(f->header->n_data) > 0 &&
3107             le64toh(f->header->n_fields) == 0)
3108                 return true;
3109
3110         if (max_file_usec > 0) {
3111                 usec_t t, h;
3112
3113                 h = le64toh(f->header->head_entry_realtime);
3114                 t = now(CLOCK_REALTIME);
3115
3116                 if (h > 0 && t > h + max_file_usec)
3117                         return true;
3118         }
3119
3120         return false;
3121 }