chiark / gitweb /
2845e05ce0adcb7333e71c3c85cd5433a6727d12
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29 #include <linux/fs.h>
30
31 #include "btrfs-util.h"
32 #include "journal-def.h"
33 #include "journal-file.h"
34 #include "journal-authenticate.h"
35 #include "lookup3.h"
36 #include "compress.h"
37 #include "fsprg.h"
38
39 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
40 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
41
42 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
43
44 /* This is the minimum journal file size */
45 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL)           /* 4 MiB */
46
47 /* These are the lower and upper bounds if we deduce the max_use value
48  * from the file system size */
49 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
50 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
51
52 /* This is the upper bound if we deduce max_size from max_use */
53 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
54
55 /* This is the upper bound if we deduce the keep_free value from the
56  * file system size */
57 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
58
/* This is the keep_free value used when we can't determine the file
 * system size */
61 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MB */
62
63 /* n_data was the first entry we added after the initial file format design */
64 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
65
66 /* How many entries to keep in the entry array chain cache at max */
67 #define CHAIN_CACHE_MAX 20
68
69 /* How much to increase the journal file size at once each time we allocate something new. */
70 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8MB */
71
72 /* Reread fstat() of the file for detecting deletions at least this often */
73 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
74
/* The mmap context to use for the header: we pick the one just above the last defined object type */
76 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
77
78 static int journal_file_set_online(JournalFile *f) {
79         assert(f);
80
81         if (!f->writable)
82                 return -EPERM;
83
84         if (!(f->fd >= 0 && f->header))
85                 return -EINVAL;
86
87         if (mmap_cache_got_sigbus(f->mmap, f->fd))
88                 return -EIO;
89
90         switch(f->header->state) {
91                 case STATE_ONLINE:
92                         return 0;
93
94                 case STATE_OFFLINE:
95                         f->header->state = STATE_ONLINE;
96                         fsync(f->fd);
97                         return 0;
98
99                 default:
100                         return -EINVAL;
101         }
102 }
103
104 int journal_file_set_offline(JournalFile *f) {
105         assert(f);
106
107         if (!f->writable)
108                 return -EPERM;
109
110         if (!(f->fd >= 0 && f->header))
111                 return -EINVAL;
112
113         if (f->header->state != STATE_ONLINE)
114                 return 0;
115
116         fsync(f->fd);
117
118         if (mmap_cache_got_sigbus(f->mmap, f->fd))
119                 return -EIO;
120
121         f->header->state = STATE_OFFLINE;
122
123         if (mmap_cache_got_sigbus(f->mmap, f->fd))
124                 return -EIO;
125
126         fsync(f->fd);
127
128         return 0;
129 }
130
131 void journal_file_close(JournalFile *f) {
132         assert(f);
133
134 #ifdef HAVE_GCRYPT
135         /* Write the final tag */
136         if (f->seal && f->writable)
137                 journal_file_append_tag(f);
138 #endif
139
140         journal_file_set_offline(f);
141
142         if (f->mmap && f->fd >= 0)
143                 mmap_cache_close_fd(f->mmap, f->fd);
144
145         if (f->fd >= 0 && f->defrag_on_close) {
146
147                 /* Be friendly to btrfs: turn COW back on again now,
148                  * and defragment the file. We won't write to the file
149                  * ever again, hence remove all fragmentation, and
150                  * reenable all the good bits COW usually provides
151                  * (such as data checksumming). */
152
153                 (void) chattr_fd(f->fd, false, FS_NOCOW_FL);
154                 (void) btrfs_defrag_fd(f->fd);
155         }
156
157         safe_close(f->fd);
158         free(f->path);
159
160         if (f->mmap)
161                 mmap_cache_unref(f->mmap);
162
163         ordered_hashmap_free_free(f->chain_cache);
164
165 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
166         free(f->compress_buffer);
167 #endif
168
169 #ifdef HAVE_GCRYPT
170         if (f->fss_file)
171                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
172         else if (f->fsprg_state)
173                 free(f->fsprg_state);
174
175         free(f->fsprg_seed);
176
177         if (f->hmac)
178                 gcry_md_close(f->hmac);
179 #endif
180
181         free(f);
182 }
183
184 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
185         Header h = {};
186         ssize_t k;
187         int r;
188
189         assert(f);
190
191         memcpy(h.signature, HEADER_SIGNATURE, 8);
192         h.header_size = htole64(ALIGN64(sizeof(h)));
193
194         h.incompatible_flags |= htole32(
195                 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
196                 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
197
198         h.compatible_flags = htole32(
199                 f->seal * HEADER_COMPATIBLE_SEALED);
200
201         r = sd_id128_randomize(&h.file_id);
202         if (r < 0)
203                 return r;
204
205         if (template) {
206                 h.seqnum_id = template->header->seqnum_id;
207                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
208         } else
209                 h.seqnum_id = h.file_id;
210
211         k = pwrite(f->fd, &h, sizeof(h), 0);
212         if (k < 0)
213                 return -errno;
214
215         if (k != sizeof(h))
216                 return -EIO;
217
218         return 0;
219 }
220
221 static int journal_file_refresh_header(JournalFile *f) {
222         sd_id128_t boot_id;
223         int r;
224
225         assert(f);
226
227         r = sd_id128_get_machine(&f->header->machine_id);
228         if (r < 0)
229                 return r;
230
231         r = sd_id128_get_boot(&boot_id);
232         if (r < 0)
233                 return r;
234
235         if (sd_id128_equal(boot_id, f->header->boot_id))
236                 f->tail_entry_monotonic_valid = true;
237
238         f->header->boot_id = boot_id;
239
240         r = journal_file_set_online(f);
241
242         /* Sync the online state to disk */
243         fsync(f->fd);
244
245         return r;
246 }
247
248 static int journal_file_verify_header(JournalFile *f) {
249         uint32_t flags;
250
251         assert(f);
252
253         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
254                 return -EBADMSG;
255
256         /* In both read and write mode we refuse to open files with
257          * incompatible flags we don't know */
258         flags = le32toh(f->header->incompatible_flags);
259         if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
260                 if (flags & ~HEADER_INCOMPATIBLE_ANY)
261                         log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
262                                   f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
263                 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
264                 if (flags)
265                         log_debug("Journal file %s uses incompatible flags %"PRIx32
266                                   " disabled at compilation time.", f->path, flags);
267                 return -EPROTONOSUPPORT;
268         }
269
270         /* When open for writing we refuse to open files with
271          * compatible flags, too */
272         flags = le32toh(f->header->compatible_flags);
273         if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
274                 if (flags & ~HEADER_COMPATIBLE_ANY)
275                         log_debug("Journal file %s has unknown compatible flags %"PRIx32,
276                                   f->path, flags & ~HEADER_COMPATIBLE_ANY);
277                 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
278                 if (flags)
279                         log_debug("Journal file %s uses compatible flags %"PRIx32
280                                   " disabled at compilation time.", f->path, flags);
281                 return -EPROTONOSUPPORT;
282         }
283
284         if (f->header->state >= _STATE_MAX)
285                 return -EBADMSG;
286
287         /* The first addition was n_data, so check that we are at least this large */
288         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
289                 return -EBADMSG;
290
291         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
292                 return -EBADMSG;
293
294         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
295                 return -ENODATA;
296
297         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
298                 return -ENODATA;
299
300         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
301             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
302             !VALID64(le64toh(f->header->tail_object_offset)) ||
303             !VALID64(le64toh(f->header->entry_array_offset)))
304                 return -ENODATA;
305
306         if (f->writable) {
307                 uint8_t state;
308                 sd_id128_t machine_id;
309                 int r;
310
311                 r = sd_id128_get_machine(&machine_id);
312                 if (r < 0)
313                         return r;
314
315                 if (!sd_id128_equal(machine_id, f->header->machine_id))
316                         return -EHOSTDOWN;
317
318                 state = f->header->state;
319
320                 if (state == STATE_ONLINE) {
321                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
322                         return -EBUSY;
323                 } else if (state == STATE_ARCHIVED)
324                         return -ESHUTDOWN;
325                 else if (state != STATE_OFFLINE) {
326                         log_debug("Journal file %s has unknown state %i.", f->path, state);
327                         return -EBUSY;
328                 }
329         }
330
331         f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
332         f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
333
334         f->seal = JOURNAL_HEADER_SEALED(f->header);
335
336         return 0;
337 }
338
339 static int journal_file_fstat(JournalFile *f) {
340         assert(f);
341         assert(f->fd >= 0);
342
343         if (fstat(f->fd, &f->last_stat) < 0)
344                 return -errno;
345
346         f->last_stat_usec = now(CLOCK_MONOTONIC);
347
348         /* Refuse appending to files that are already deleted */
349         if (f->last_stat.st_nlink <= 0)
350                 return -EIDRM;
351
352         return 0;
353 }
354
355 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
356         uint64_t old_size, new_size;
357         int r;
358
359         assert(f);
360
361         /* We assume that this file is not sparse, and we know that
362          * for sure, since we always call posix_fallocate()
363          * ourselves */
364
365         if (mmap_cache_got_sigbus(f->mmap, f->fd))
366                 return -EIO;
367
368         old_size =
369                 le64toh(f->header->header_size) +
370                 le64toh(f->header->arena_size);
371
372         new_size = PAGE_ALIGN(offset + size);
373         if (new_size < le64toh(f->header->header_size))
374                 new_size = le64toh(f->header->header_size);
375
376         if (new_size <= old_size) {
377
378                 /* We already pre-allocated enough space, but before
379                  * we write to it, let's check with fstat() if the
380                  * file got deleted, in order make sure we don't throw
381                  * away the data immediately. Don't check fstat() for
382                  * all writes though, but only once ever 10s. */
383
384                 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
385                         return 0;
386
387                 return journal_file_fstat(f);
388         }
389
390         /* Allocate more space. */
391
392         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
393                 return -E2BIG;
394
395         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
396                 struct statvfs svfs;
397
398                 if (fstatvfs(f->fd, &svfs) >= 0) {
399                         uint64_t available;
400
401                         available = svfs.f_bfree * svfs.f_bsize;
402
403                         if (available >= f->metrics.keep_free)
404                                 available -= f->metrics.keep_free;
405                         else
406                                 available = 0;
407
408                         if (new_size - old_size > available)
409                                 return -E2BIG;
410                 }
411         }
412
413         /* Increase by larger blocks at once */
414         new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
415         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
416                 new_size = f->metrics.max_size;
417
418         /* Note that the glibc fallocate() fallback is very
419            inefficient, hence we try to minimize the allocation area
420            as we can. */
421         r = posix_fallocate(f->fd, old_size, new_size - old_size);
422         if (r != 0)
423                 return -r;
424
425         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
426
427         return journal_file_fstat(f);
428 }
429
430 static unsigned type_to_context(ObjectType type) {
431         /* One context for each type, plus one catch-all for the rest */
432         assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
433         assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
434         return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
435 }
436
437 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
438         int r;
439
440         assert(f);
441         assert(ret);
442
443         if (size <= 0)
444                 return -EINVAL;
445
446         /* Avoid SIGBUS on invalid accesses */
447         if (offset + size > (uint64_t) f->last_stat.st_size) {
448                 /* Hmm, out of range? Let's refresh the fstat() data
449                  * first, before we trust that check. */
450
451                 r = journal_file_fstat(f);
452                 if (r < 0)
453                         return r;
454
455                 if (offset + size > (uint64_t) f->last_stat.st_size)
456                         return -EADDRNOTAVAIL;
457         }
458
459         return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
460 }
461
462 static uint64_t minimum_header_size(Object *o) {
463
464         static const uint64_t table[] = {
465                 [OBJECT_DATA] = sizeof(DataObject),
466                 [OBJECT_FIELD] = sizeof(FieldObject),
467                 [OBJECT_ENTRY] = sizeof(EntryObject),
468                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
469                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
470                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
471                 [OBJECT_TAG] = sizeof(TagObject),
472         };
473
474         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
475                 return sizeof(ObjectHeader);
476
477         return table[o->object.type];
478 }
479
480 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
481         int r;
482         void *t;
483         Object *o;
484         uint64_t s;
485
486         assert(f);
487         assert(ret);
488
489         /* Objects may only be located at multiple of 64 bit */
490         if (!VALID64(offset))
491                 return -EFAULT;
492
493         r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
494         if (r < 0)
495                 return r;
496
497         o = (Object*) t;
498         s = le64toh(o->object.size);
499
500         if (s < sizeof(ObjectHeader))
501                 return -EBADMSG;
502
503         if (o->object.type <= OBJECT_UNUSED)
504                 return -EBADMSG;
505
506         if (s < minimum_header_size(o))
507                 return -EBADMSG;
508
509         if (type > OBJECT_UNUSED && o->object.type != type)
510                 return -EBADMSG;
511
512         if (s > sizeof(ObjectHeader)) {
513                 r = journal_file_move_to(f, type, false, offset, s, &t);
514                 if (r < 0)
515                         return r;
516
517                 o = (Object*) t;
518         }
519
520         *ret = o;
521         return 0;
522 }
523
524 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
525         uint64_t r;
526
527         assert(f);
528
529         r = le64toh(f->header->tail_entry_seqnum) + 1;
530
531         if (seqnum) {
532                 /* If an external seqnum counter was passed, we update
533                  * both the local and the external one, and set it to
534                  * the maximum of both */
535
536                 if (*seqnum + 1 > r)
537                         r = *seqnum + 1;
538
539                 *seqnum = r;
540         }
541
542         f->header->tail_entry_seqnum = htole64(r);
543
544         if (f->header->head_entry_seqnum == 0)
545                 f->header->head_entry_seqnum = htole64(r);
546
547         return r;
548 }
549
550 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
551         int r;
552         uint64_t p;
553         Object *tail, *o;
554         void *t;
555
556         assert(f);
557         assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
558         assert(size >= sizeof(ObjectHeader));
559         assert(offset);
560         assert(ret);
561
562         r = journal_file_set_online(f);
563         if (r < 0)
564                 return r;
565
566         p = le64toh(f->header->tail_object_offset);
567         if (p == 0)
568                 p = le64toh(f->header->header_size);
569         else {
570                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
571                 if (r < 0)
572                         return r;
573
574                 p += ALIGN64(le64toh(tail->object.size));
575         }
576
577         r = journal_file_allocate(f, p, size);
578         if (r < 0)
579                 return r;
580
581         r = journal_file_move_to(f, type, false, p, size, &t);
582         if (r < 0)
583                 return r;
584
585         o = (Object*) t;
586
587         zero(o->object);
588         o->object.type = type;
589         o->object.size = htole64(size);
590
591         f->header->tail_object_offset = htole64(p);
592         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
593
594         *ret = o;
595         *offset = p;
596
597         return 0;
598 }
599
600 static int journal_file_setup_data_hash_table(JournalFile *f) {
601         uint64_t s, p;
602         Object *o;
603         int r;
604
605         assert(f);
606
607         /* We estimate that we need 1 hash table entry per 768 of
608            journal file and we want to make sure we never get beyond
609            75% fill level. Calculate the hash table size for the
610            maximum file size based on these metrics. */
611
612         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
613         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
614                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
615
616         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
617
618         r = journal_file_append_object(f,
619                                        OBJECT_DATA_HASH_TABLE,
620                                        offsetof(Object, hash_table.items) + s,
621                                        &o, &p);
622         if (r < 0)
623                 return r;
624
625         memzero(o->hash_table.items, s);
626
627         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
628         f->header->data_hash_table_size = htole64(s);
629
630         return 0;
631 }
632
633 static int journal_file_setup_field_hash_table(JournalFile *f) {
634         uint64_t s, p;
635         Object *o;
636         int r;
637
638         assert(f);
639
640         /* We use a fixed size hash table for the fields as this
641          * number should grow very slowly only */
642
643         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
644         r = journal_file_append_object(f,
645                                        OBJECT_FIELD_HASH_TABLE,
646                                        offsetof(Object, hash_table.items) + s,
647                                        &o, &p);
648         if (r < 0)
649                 return r;
650
651         memzero(o->hash_table.items, s);
652
653         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
654         f->header->field_hash_table_size = htole64(s);
655
656         return 0;
657 }
658
659 static int journal_file_map_data_hash_table(JournalFile *f) {
660         uint64_t s, p;
661         void *t;
662         int r;
663
664         assert(f);
665
666         p = le64toh(f->header->data_hash_table_offset);
667         s = le64toh(f->header->data_hash_table_size);
668
669         r = journal_file_move_to(f,
670                                  OBJECT_DATA_HASH_TABLE,
671                                  true,
672                                  p, s,
673                                  &t);
674         if (r < 0)
675                 return r;
676
677         f->data_hash_table = t;
678         return 0;
679 }
680
681 static int journal_file_map_field_hash_table(JournalFile *f) {
682         uint64_t s, p;
683         void *t;
684         int r;
685
686         assert(f);
687
688         p = le64toh(f->header->field_hash_table_offset);
689         s = le64toh(f->header->field_hash_table_size);
690
691         r = journal_file_move_to(f,
692                                  OBJECT_FIELD_HASH_TABLE,
693                                  true,
694                                  p, s,
695                                  &t);
696         if (r < 0)
697                 return r;
698
699         f->field_hash_table = t;
700         return 0;
701 }
702
703 static int journal_file_link_field(
704                 JournalFile *f,
705                 Object *o,
706                 uint64_t offset,
707                 uint64_t hash) {
708
709         uint64_t p, h, m;
710         int r;
711
712         assert(f);
713         assert(o);
714         assert(offset > 0);
715
716         if (o->object.type != OBJECT_FIELD)
717                 return -EINVAL;
718
719         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
720         if (m <= 0)
721                 return -EBADMSG;
722
723         /* This might alter the window we are looking at */
724         o->field.next_hash_offset = o->field.head_data_offset = 0;
725
726         h = hash % m;
727         p = le64toh(f->field_hash_table[h].tail_hash_offset);
728         if (p == 0)
729                 f->field_hash_table[h].head_hash_offset = htole64(offset);
730         else {
731                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
732                 if (r < 0)
733                         return r;
734
735                 o->field.next_hash_offset = htole64(offset);
736         }
737
738         f->field_hash_table[h].tail_hash_offset = htole64(offset);
739
740         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
741                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
742
743         return 0;
744 }
745
746 static int journal_file_link_data(
747                 JournalFile *f,
748                 Object *o,
749                 uint64_t offset,
750                 uint64_t hash) {
751
752         uint64_t p, h, m;
753         int r;
754
755         assert(f);
756         assert(o);
757         assert(offset > 0);
758
759         if (o->object.type != OBJECT_DATA)
760                 return -EINVAL;
761
762         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
763         if (m <= 0)
764                 return -EBADMSG;
765
766         /* This might alter the window we are looking at */
767         o->data.next_hash_offset = o->data.next_field_offset = 0;
768         o->data.entry_offset = o->data.entry_array_offset = 0;
769         o->data.n_entries = 0;
770
771         h = hash % m;
772         p = le64toh(f->data_hash_table[h].tail_hash_offset);
773         if (p == 0)
774                 /* Only entry in the hash table is easy */
775                 f->data_hash_table[h].head_hash_offset = htole64(offset);
776         else {
777                 /* Move back to the previous data object, to patch in
778                  * pointer */
779
780                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
781                 if (r < 0)
782                         return r;
783
784                 o->data.next_hash_offset = htole64(offset);
785         }
786
787         f->data_hash_table[h].tail_hash_offset = htole64(offset);
788
789         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
790                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
791
792         return 0;
793 }
794
795 int journal_file_find_field_object_with_hash(
796                 JournalFile *f,
797                 const void *field, uint64_t size, uint64_t hash,
798                 Object **ret, uint64_t *offset) {
799
800         uint64_t p, osize, h, m;
801         int r;
802
803         assert(f);
804         assert(field && size > 0);
805
806         osize = offsetof(Object, field.payload) + size;
807
808         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
809
810         if (m <= 0)
811                 return -EBADMSG;
812
813         h = hash % m;
814         p = le64toh(f->field_hash_table[h].head_hash_offset);
815
816         while (p > 0) {
817                 Object *o;
818
819                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
820                 if (r < 0)
821                         return r;
822
823                 if (le64toh(o->field.hash) == hash &&
824                     le64toh(o->object.size) == osize &&
825                     memcmp(o->field.payload, field, size) == 0) {
826
827                         if (ret)
828                                 *ret = o;
829                         if (offset)
830                                 *offset = p;
831
832                         return 1;
833                 }
834
835                 p = le64toh(o->field.next_hash_offset);
836         }
837
838         return 0;
839 }
840
841 int journal_file_find_field_object(
842                 JournalFile *f,
843                 const void *field, uint64_t size,
844                 Object **ret, uint64_t *offset) {
845
846         uint64_t hash;
847
848         assert(f);
849         assert(field && size > 0);
850
851         hash = hash64(field, size);
852
853         return journal_file_find_field_object_with_hash(f,
854                                                         field, size, hash,
855                                                         ret, offset);
856 }
857
858 int journal_file_find_data_object_with_hash(
859                 JournalFile *f,
860                 const void *data, uint64_t size, uint64_t hash,
861                 Object **ret, uint64_t *offset) {
862
863         uint64_t p, osize, h, m;
864         int r;
865
866         assert(f);
867         assert(data || size == 0);
868
869         osize = offsetof(Object, data.payload) + size;
870
871         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
872         if (m <= 0)
873                 return -EBADMSG;
874
875         h = hash % m;
876         p = le64toh(f->data_hash_table[h].head_hash_offset);
877
878         while (p > 0) {
879                 Object *o;
880
881                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
882                 if (r < 0)
883                         return r;
884
885                 if (le64toh(o->data.hash) != hash)
886                         goto next;
887
888                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
889 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
890                         uint64_t l;
891                         size_t rsize;
892
893                         l = le64toh(o->object.size);
894                         if (l <= offsetof(Object, data.payload))
895                                 return -EBADMSG;
896
897                         l -= offsetof(Object, data.payload);
898
899                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
900                                             o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
901                         if (r < 0)
902                                 return r;
903
904                         if (rsize == size &&
905                             memcmp(f->compress_buffer, data, size) == 0) {
906
907                                 if (ret)
908                                         *ret = o;
909
910                                 if (offset)
911                                         *offset = p;
912
913                                 return 1;
914                         }
915 #else
916                         return -EPROTONOSUPPORT;
917 #endif
918                 } else if (le64toh(o->object.size) == osize &&
919                            memcmp(o->data.payload, data, size) == 0) {
920
921                         if (ret)
922                                 *ret = o;
923
924                         if (offset)
925                                 *offset = p;
926
927                         return 1;
928                 }
929
930         next:
931                 p = le64toh(o->data.next_hash_offset);
932         }
933
934         return 0;
935 }
936
937 int journal_file_find_data_object(
938                 JournalFile *f,
939                 const void *data, uint64_t size,
940                 Object **ret, uint64_t *offset) {
941
942         uint64_t hash;
943
944         assert(f);
945         assert(data || size == 0);
946
947         hash = hash64(data, size);
948
949         return journal_file_find_data_object_with_hash(f,
950                                                        data, size, hash,
951                                                        ret, offset);
952 }
953
954 static int journal_file_append_field(
955                 JournalFile *f,
956                 const void *field, uint64_t size,
957                 Object **ret, uint64_t *offset) {
958
959         uint64_t hash, p;
960         uint64_t osize;
961         Object *o;
962         int r;
963
964         assert(f);
965         assert(field && size > 0);
966
967         hash = hash64(field, size);
968
969         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
970         if (r < 0)
971                 return r;
972         else if (r > 0) {
973
974                 if (ret)
975                         *ret = o;
976
977                 if (offset)
978                         *offset = p;
979
980                 return 0;
981         }
982
983         osize = offsetof(Object, field.payload) + size;
984         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
985         if (r < 0)
986                 return r;
987
988         o->field.hash = htole64(hash);
989         memcpy(o->field.payload, field, size);
990
991         r = journal_file_link_field(f, o, p, hash);
992         if (r < 0)
993                 return r;
994
995         /* The linking might have altered the window, so let's
996          * refresh our pointer */
997         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
998         if (r < 0)
999                 return r;
1000
1001 #ifdef HAVE_GCRYPT
1002         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1003         if (r < 0)
1004                 return r;
1005 #endif
1006
1007         if (ret)
1008                 *ret = o;
1009
1010         if (offset)
1011                 *offset = p;
1012
1013         return 0;
1014 }
1015
1016 static int journal_file_append_data(
1017                 JournalFile *f,
1018                 const void *data, uint64_t size,
1019                 Object **ret, uint64_t *offset) {
1020
1021         uint64_t hash, p;
1022         uint64_t osize;
1023         Object *o;
1024         int r, compression = 0;
1025         const void *eq;
1026
1027         assert(f);
1028         assert(data || size == 0);
1029
1030         hash = hash64(data, size);
1031
1032         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1033         if (r < 0)
1034                 return r;
1035         else if (r > 0) {
1036
1037                 if (ret)
1038                         *ret = o;
1039
1040                 if (offset)
1041                         *offset = p;
1042
1043                 return 0;
1044         }
1045
1046         osize = offsetof(Object, data.payload) + size;
1047         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1048         if (r < 0)
1049                 return r;
1050
1051         o->data.hash = htole64(hash);
1052
1053 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1054         if (f->compress_xz &&
1055             size >= COMPRESSION_SIZE_THRESHOLD) {
1056                 size_t rsize;
1057
1058                 compression = compress_blob(data, size, o->data.payload, &rsize);
1059
1060                 if (compression) {
1061                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1062                         o->object.flags |= compression;
1063
1064                         log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1065                                   size, rsize, object_compressed_to_string(compression));
1066                 }
1067         }
1068 #endif
1069
1070         if (!compression && size > 0)
1071                 memcpy(o->data.payload, data, size);
1072
1073         r = journal_file_link_data(f, o, p, hash);
1074         if (r < 0)
1075                 return r;
1076
1077         /* The linking might have altered the window, so let's
1078          * refresh our pointer */
1079         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1080         if (r < 0)
1081                 return r;
1082
1083         if (!data)
1084                 eq = NULL;
1085         else
1086                 eq = memchr(data, '=', size);
1087         if (eq && eq > data) {
1088                 Object *fo = NULL;
1089                 uint64_t fp;
1090
1091                 /* Create field object ... */
1092                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1093                 if (r < 0)
1094                         return r;
1095
1096                 /* ... and link it in. */
1097                 o->data.next_field_offset = fo->field.head_data_offset;
1098                 fo->field.head_data_offset = le64toh(p);
1099         }
1100
1101 #ifdef HAVE_GCRYPT
1102         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1103         if (r < 0)
1104                 return r;
1105 #endif
1106
1107         if (ret)
1108                 *ret = o;
1109
1110         if (offset)
1111                 *offset = p;
1112
1113         return 0;
1114 }
1115
1116 uint64_t journal_file_entry_n_items(Object *o) {
1117         assert(o);
1118
1119         if (o->object.type != OBJECT_ENTRY)
1120                 return 0;
1121
1122         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1123 }
1124
1125 uint64_t journal_file_entry_array_n_items(Object *o) {
1126         assert(o);
1127
1128         if (o->object.type != OBJECT_ENTRY_ARRAY)
1129                 return 0;
1130
1131         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1132 }
1133
1134 uint64_t journal_file_hash_table_n_items(Object *o) {
1135         assert(o);
1136
1137         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1138             o->object.type != OBJECT_FIELD_HASH_TABLE)
1139                 return 0;
1140
1141         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1142 }
1143
1144 static int link_entry_into_array(JournalFile *f,
1145                                  le64_t *first,
1146                                  le64_t *idx,
1147                                  uint64_t p) {
1148         int r;
1149         uint64_t n = 0, ap = 0, q, i, a, hidx;
1150         Object *o;
1151
1152         assert(f);
1153         assert(first);
1154         assert(idx);
1155         assert(p > 0);
1156
1157         a = le64toh(*first);
1158         i = hidx = le64toh(*idx);
1159         while (a > 0) {
1160
1161                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1162                 if (r < 0)
1163                         return r;
1164
1165                 n = journal_file_entry_array_n_items(o);
1166                 if (i < n) {
1167                         o->entry_array.items[i] = htole64(p);
1168                         *idx = htole64(hidx + 1);
1169                         return 0;
1170                 }
1171
1172                 i -= n;
1173                 ap = a;
1174                 a = le64toh(o->entry_array.next_entry_array_offset);
1175         }
1176
1177         if (hidx > n)
1178                 n = (hidx+1) * 2;
1179         else
1180                 n = n * 2;
1181
1182         if (n < 4)
1183                 n = 4;
1184
1185         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1186                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1187                                        &o, &q);
1188         if (r < 0)
1189                 return r;
1190
1191 #ifdef HAVE_GCRYPT
1192         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1193         if (r < 0)
1194                 return r;
1195 #endif
1196
1197         o->entry_array.items[i] = htole64(p);
1198
1199         if (ap == 0)
1200                 *first = htole64(q);
1201         else {
1202                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1203                 if (r < 0)
1204                         return r;
1205
1206                 o->entry_array.next_entry_array_offset = htole64(q);
1207         }
1208
1209         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1210                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1211
1212         *idx = htole64(hidx + 1);
1213
1214         return 0;
1215 }
1216
1217 static int link_entry_into_array_plus_one(JournalFile *f,
1218                                           le64_t *extra,
1219                                           le64_t *first,
1220                                           le64_t *idx,
1221                                           uint64_t p) {
1222
1223         int r;
1224
1225         assert(f);
1226         assert(extra);
1227         assert(first);
1228         assert(idx);
1229         assert(p > 0);
1230
1231         if (*idx == 0)
1232                 *extra = htole64(p);
1233         else {
1234                 le64_t i;
1235
1236                 i = htole64(le64toh(*idx) - 1);
1237                 r = link_entry_into_array(f, first, &i, p);
1238                 if (r < 0)
1239                         return r;
1240         }
1241
1242         *idx = htole64(le64toh(*idx) + 1);
1243         return 0;
1244 }
1245
1246 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1247         uint64_t p;
1248         int r;
1249         assert(f);
1250         assert(o);
1251         assert(offset > 0);
1252
1253         p = le64toh(o->entry.items[i].object_offset);
1254         if (p == 0)
1255                 return -EINVAL;
1256
1257         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1258         if (r < 0)
1259                 return r;
1260
1261         return link_entry_into_array_plus_one(f,
1262                                               &o->data.entry_offset,
1263                                               &o->data.entry_array_offset,
1264                                               &o->data.n_entries,
1265                                               offset);
1266 }
1267
1268 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1269         uint64_t n, i;
1270         int r;
1271
1272         assert(f);
1273         assert(o);
1274         assert(offset > 0);
1275
1276         if (o->object.type != OBJECT_ENTRY)
1277                 return -EINVAL;
1278
1279         __sync_synchronize();
1280
1281         /* Link up the entry itself */
1282         r = link_entry_into_array(f,
1283                                   &f->header->entry_array_offset,
1284                                   &f->header->n_entries,
1285                                   offset);
1286         if (r < 0)
1287                 return r;
1288
1289         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1290
1291         if (f->header->head_entry_realtime == 0)
1292                 f->header->head_entry_realtime = o->entry.realtime;
1293
1294         f->header->tail_entry_realtime = o->entry.realtime;
1295         f->header->tail_entry_monotonic = o->entry.monotonic;
1296
1297         f->tail_entry_monotonic_valid = true;
1298
1299         /* Link up the items */
1300         n = journal_file_entry_n_items(o);
1301         for (i = 0; i < n; i++) {
1302                 r = journal_file_link_entry_item(f, o, offset, i);
1303                 if (r < 0)
1304                         return r;
1305         }
1306
1307         return 0;
1308 }
1309
1310 static int journal_file_append_entry_internal(
1311                 JournalFile *f,
1312                 const dual_timestamp *ts,
1313                 uint64_t xor_hash,
1314                 const EntryItem items[], unsigned n_items,
1315                 uint64_t *seqnum,
1316                 Object **ret, uint64_t *offset) {
1317         uint64_t np;
1318         uint64_t osize;
1319         Object *o;
1320         int r;
1321
1322         assert(f);
1323         assert(items || n_items == 0);
1324         assert(ts);
1325
1326         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1327
1328         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1329         if (r < 0)
1330                 return r;
1331
1332         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1333         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1334         o->entry.realtime = htole64(ts->realtime);
1335         o->entry.monotonic = htole64(ts->monotonic);
1336         o->entry.xor_hash = htole64(xor_hash);
1337         o->entry.boot_id = f->header->boot_id;
1338
1339 #ifdef HAVE_GCRYPT
1340         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1341         if (r < 0)
1342                 return r;
1343 #endif
1344
1345         r = journal_file_link_entry(f, o, np);
1346         if (r < 0)
1347                 return r;
1348
1349         if (ret)
1350                 *ret = o;
1351
1352         if (offset)
1353                 *offset = np;
1354
1355         return 0;
1356 }
1357
1358 void journal_file_post_change(JournalFile *f) {
1359         assert(f);
1360
1361         /* inotify() does not receive IN_MODIFY events from file
1362          * accesses done via mmap(). After each access we hence
1363          * trigger IN_MODIFY by truncating the journal file to its
1364          * current size which triggers IN_MODIFY. */
1365
1366         __sync_synchronize();
1367
1368         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1369                 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1370 }
1371
1372 static int entry_item_cmp(const void *_a, const void *_b) {
1373         const EntryItem *a = _a, *b = _b;
1374
1375         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1376                 return -1;
1377         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1378                 return 1;
1379         return 0;
1380 }
1381
1382 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1383         unsigned i;
1384         EntryItem *items;
1385         int r;
1386         uint64_t xor_hash = 0;
1387         struct dual_timestamp _ts;
1388
1389         assert(f);
1390         assert(iovec || n_iovec == 0);
1391
1392         if (!ts) {
1393                 dual_timestamp_get(&_ts);
1394                 ts = &_ts;
1395         }
1396
1397         if (f->tail_entry_monotonic_valid &&
1398             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1399                 return -EINVAL;
1400
1401 #ifdef HAVE_GCRYPT
1402         r = journal_file_maybe_append_tag(f, ts->realtime);
1403         if (r < 0)
1404                 return r;
1405 #endif
1406
1407         /* alloca() can't take 0, hence let's allocate at least one */
1408         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1409
1410         for (i = 0; i < n_iovec; i++) {
1411                 uint64_t p;
1412                 Object *o;
1413
1414                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1415                 if (r < 0)
1416                         return r;
1417
1418                 xor_hash ^= le64toh(o->data.hash);
1419                 items[i].object_offset = htole64(p);
1420                 items[i].hash = o->data.hash;
1421         }
1422
1423         /* Order by the position on disk, in order to improve seek
1424          * times for rotating media. */
1425         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1426
1427         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1428
1429         /* If the memory mapping triggered a SIGBUS then we return an
1430          * IO error and ignore the error code passed down to us, since
1431          * it is very likely just an effect of a nullified replacement
1432          * mapping page */
1433
1434         if (mmap_cache_got_sigbus(f->mmap, f->fd))
1435                 r = -EIO;
1436
1437         journal_file_post_change(f);
1438
1439         return r;
1440 }
1441
/* One cached position inside a chain of entry array objects. */
typedef struct ChainCacheItem {
        /* Offset of the array the chain starts with; also used as
         * the key in the chain cache */
        uint64_t first;

        /* Offset of the array this item remembers */
        uint64_t array;

        /* First item stored in the remembered array */
        uint64_t begin;

        /* Number of items in all arrays of the chain that precede
         * the remembered one */
        uint64_t total;

        /* Index we looked at last, so that a later bisection can
         * start out close to it */
        uint64_t last_index;
} ChainCacheItem;
1449
1450 static void chain_cache_put(
1451                 OrderedHashmap *h,
1452                 ChainCacheItem *ci,
1453                 uint64_t first,
1454                 uint64_t array,
1455                 uint64_t begin,
1456                 uint64_t total,
1457                 uint64_t last_index) {
1458
1459         if (!ci) {
1460                 /* If the chain item to cache for this chain is the
1461                  * first one it's not worth caching anything */
1462                 if (array == first)
1463                         return;
1464
1465                 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1466                         ci = ordered_hashmap_steal_first(h);
1467                         assert(ci);
1468                 } else {
1469                         ci = new(ChainCacheItem, 1);
1470                         if (!ci)
1471                                 return;
1472                 }
1473
1474                 ci->first = first;
1475
1476                 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1477                         free(ci);
1478                         return;
1479                 }
1480         } else
1481                 assert(ci->first == first);
1482
1483         ci->array = array;
1484         ci->begin = begin;
1485         ci->total = total;
1486         ci->last_index = last_index;
1487 }
1488
1489 static int generic_array_get(
1490                 JournalFile *f,
1491                 uint64_t first,
1492                 uint64_t i,
1493                 Object **ret, uint64_t *offset) {
1494
1495         Object *o;
1496         uint64_t p = 0, a, t = 0;
1497         int r;
1498         ChainCacheItem *ci;
1499
1500         assert(f);
1501
1502         a = first;
1503
1504         /* Try the chain cache first */
1505         ci = ordered_hashmap_get(f->chain_cache, &first);
1506         if (ci && i > ci->total) {
1507                 a = ci->array;
1508                 i -= ci->total;
1509                 t = ci->total;
1510         }
1511
1512         while (a > 0) {
1513                 uint64_t k;
1514
1515                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1516                 if (r < 0)
1517                         return r;
1518
1519                 k = journal_file_entry_array_n_items(o);
1520                 if (i < k) {
1521                         p = le64toh(o->entry_array.items[i]);
1522                         goto found;
1523                 }
1524
1525                 i -= k;
1526                 t += k;
1527                 a = le64toh(o->entry_array.next_entry_array_offset);
1528         }
1529
1530         return 0;
1531
1532 found:
1533         /* Let's cache this item for the next invocation */
1534         chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1535
1536         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1537         if (r < 0)
1538                 return r;
1539
1540         if (ret)
1541                 *ret = o;
1542
1543         if (offset)
1544                 *offset = p;
1545
1546         return 1;
1547 }
1548
1549 static int generic_array_get_plus_one(
1550                 JournalFile *f,
1551                 uint64_t extra,
1552                 uint64_t first,
1553                 uint64_t i,
1554                 Object **ret, uint64_t *offset) {
1555
1556         Object *o;
1557
1558         assert(f);
1559
1560         if (i == 0) {
1561                 int r;
1562
1563                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1564                 if (r < 0)
1565                         return r;
1566
1567                 if (ret)
1568                         *ret = o;
1569
1570                 if (offset)
1571                         *offset = extra;
1572
1573                 return 1;
1574         }
1575
1576         return generic_array_get(f, first, i-1, ret, offset);
1577 }
1578
/* Results of the test_object callbacks used when bisecting */
enum {
        TEST_FOUND = 0,
        TEST_LEFT  = 1,
        TEST_RIGHT = 2
};
1584
1585 static int generic_array_bisect(
1586                 JournalFile *f,
1587                 uint64_t first,
1588                 uint64_t n,
1589                 uint64_t needle,
1590                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1591                 direction_t direction,
1592                 Object **ret,
1593                 uint64_t *offset,
1594                 uint64_t *idx) {
1595
1596         uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1597         bool subtract_one = false;
1598         Object *o, *array = NULL;
1599         int r;
1600         ChainCacheItem *ci;
1601
1602         assert(f);
1603         assert(test_object);
1604
1605         /* Start with the first array in the chain */
1606         a = first;
1607
1608         ci = ordered_hashmap_get(f->chain_cache, &first);
1609         if (ci && n > ci->total) {
1610                 /* Ah, we have iterated this bisection array chain
1611                  * previously! Let's see if we can skip ahead in the
1612                  * chain, as far as the last time. But we can't jump
1613                  * backwards in the chain, so let's check that
1614                  * first. */
1615
1616                 r = test_object(f, ci->begin, needle);
1617                 if (r < 0)
1618                         return r;
1619
1620                 if (r == TEST_LEFT) {
1621                         /* OK, what we are looking for is right of the
1622                          * begin of this EntryArray, so let's jump
1623                          * straight to previously cached array in the
1624                          * chain */
1625
1626                         a = ci->array;
1627                         n -= ci->total;
1628                         t = ci->total;
1629                         last_index = ci->last_index;
1630                 }
1631         }
1632
1633         while (a > 0) {
1634                 uint64_t left, right, k, lp;
1635
1636                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1637                 if (r < 0)
1638                         return r;
1639
1640                 k = journal_file_entry_array_n_items(array);
1641                 right = MIN(k, n);
1642                 if (right <= 0)
1643                         return 0;
1644
1645                 i = right - 1;
1646                 lp = p = le64toh(array->entry_array.items[i]);
1647                 if (p <= 0)
1648                         return -EBADMSG;
1649
1650                 r = test_object(f, p, needle);
1651                 if (r < 0)
1652                         return r;
1653
1654                 if (r == TEST_FOUND)
1655                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1656
1657                 if (r == TEST_RIGHT) {
1658                         left = 0;
1659                         right -= 1;
1660
1661                         if (last_index != (uint64_t) -1) {
1662                                 assert(last_index <= right);
1663
1664                                 /* If we cached the last index we
1665                                  * looked at, let's try to not to jump
1666                                  * too wildly around and see if we can
1667                                  * limit the range to look at early to
1668                                  * the immediate neighbors of the last
1669                                  * index we looked at. */
1670
1671                                 if (last_index > 0) {
1672                                         uint64_t x = last_index - 1;
1673
1674                                         p = le64toh(array->entry_array.items[x]);
1675                                         if (p <= 0)
1676                                                 return -EBADMSG;
1677
1678                                         r = test_object(f, p, needle);
1679                                         if (r < 0)
1680                                                 return r;
1681
1682                                         if (r == TEST_FOUND)
1683                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1684
1685                                         if (r == TEST_RIGHT)
1686                                                 right = x;
1687                                         else
1688                                                 left = x + 1;
1689                                 }
1690
1691                                 if (last_index < right) {
1692                                         uint64_t y = last_index + 1;
1693
1694                                         p = le64toh(array->entry_array.items[y]);
1695                                         if (p <= 0)
1696                                                 return -EBADMSG;
1697
1698                                         r = test_object(f, p, needle);
1699                                         if (r < 0)
1700                                                 return r;
1701
1702                                         if (r == TEST_FOUND)
1703                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1704
1705                                         if (r == TEST_RIGHT)
1706                                                 right = y;
1707                                         else
1708                                                 left = y + 1;
1709                                 }
1710                         }
1711
1712                         for (;;) {
1713                                 if (left == right) {
1714                                         if (direction == DIRECTION_UP)
1715                                                 subtract_one = true;
1716
1717                                         i = left;
1718                                         goto found;
1719                                 }
1720
1721                                 assert(left < right);
1722                                 i = (left + right) / 2;
1723
1724                                 p = le64toh(array->entry_array.items[i]);
1725                                 if (p <= 0)
1726                                         return -EBADMSG;
1727
1728                                 r = test_object(f, p, needle);
1729                                 if (r < 0)
1730                                         return r;
1731
1732                                 if (r == TEST_FOUND)
1733                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1734
1735                                 if (r == TEST_RIGHT)
1736                                         right = i;
1737                                 else
1738                                         left = i + 1;
1739                         }
1740                 }
1741
1742                 if (k >= n) {
1743                         if (direction == DIRECTION_UP) {
1744                                 i = n;
1745                                 subtract_one = true;
1746                                 goto found;
1747                         }
1748
1749                         return 0;
1750                 }
1751
1752                 last_p = lp;
1753
1754                 n -= k;
1755                 t += k;
1756                 last_index = (uint64_t) -1;
1757                 a = le64toh(array->entry_array.next_entry_array_offset);
1758         }
1759
1760         return 0;
1761
1762 found:
1763         if (subtract_one && t == 0 && i == 0)
1764                 return 0;
1765
1766         /* Let's cache this item for the next invocation */
1767         chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1768
1769         if (subtract_one && i == 0)
1770                 p = last_p;
1771         else if (subtract_one)
1772                 p = le64toh(array->entry_array.items[i-1]);
1773         else
1774                 p = le64toh(array->entry_array.items[i]);
1775
1776         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1777         if (r < 0)
1778                 return r;
1779
1780         if (ret)
1781                 *ret = o;
1782
1783         if (offset)
1784                 *offset = p;
1785
1786         if (idx)
1787                 *idx = t + i + (subtract_one ? -1 : 0);
1788
1789         return 1;
1790 }
1791
1792 static int generic_array_bisect_plus_one(
1793                 JournalFile *f,
1794                 uint64_t extra,
1795                 uint64_t first,
1796                 uint64_t n,
1797                 uint64_t needle,
1798                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1799                 direction_t direction,
1800                 Object **ret,
1801                 uint64_t *offset,
1802                 uint64_t *idx) {
1803
1804         int r;
1805         bool step_back = false;
1806         Object *o;
1807
1808         assert(f);
1809         assert(test_object);
1810
1811         if (n <= 0)
1812                 return 0;
1813
1814         /* This bisects the array in object 'first', but first checks
1815          * an extra  */
1816         r = test_object(f, extra, needle);
1817         if (r < 0)
1818                 return r;
1819
1820         if (r == TEST_FOUND)
1821                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1822
1823         /* if we are looking with DIRECTION_UP then we need to first
1824            see if in the actual array there is a matching entry, and
1825            return the last one of that. But if there isn't any we need
1826            to return this one. Hence remember this, and return it
1827            below. */
1828         if (r == TEST_LEFT)
1829                 step_back = direction == DIRECTION_UP;
1830
1831         if (r == TEST_RIGHT) {
1832                 if (direction == DIRECTION_DOWN)
1833                         goto found;
1834                 else
1835                         return 0;
1836         }
1837
1838         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1839
1840         if (r == 0 && step_back)
1841                 goto found;
1842
1843         if (r > 0 && idx)
1844                 (*idx) ++;
1845
1846         return r;
1847
1848 found:
1849         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1850         if (r < 0)
1851                 return r;
1852
1853         if (ret)
1854                 *ret = o;
1855
1856         if (offset)
1857                 *offset = extra;
1858
1859         if (idx)
1860                 *idx = 0;
1861
1862         return 1;
1863 }
1864
1865 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1866         assert(f);
1867         assert(p > 0);
1868
1869         if (p == needle)
1870                 return TEST_FOUND;
1871         else if (p < needle)
1872                 return TEST_LEFT;
1873         else
1874                 return TEST_RIGHT;
1875 }
1876
1877 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1878         Object *o;
1879         int r;
1880
1881         assert(f);
1882         assert(p > 0);
1883
1884         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1885         if (r < 0)
1886                 return r;
1887
1888         if (le64toh(o->entry.seqnum) == needle)
1889                 return TEST_FOUND;
1890         else if (le64toh(o->entry.seqnum) < needle)
1891                 return TEST_LEFT;
1892         else
1893                 return TEST_RIGHT;
1894 }
1895
1896 int journal_file_move_to_entry_by_seqnum(
1897                 JournalFile *f,
1898                 uint64_t seqnum,
1899                 direction_t direction,
1900                 Object **ret,
1901                 uint64_t *offset) {
1902
1903         return generic_array_bisect(f,
1904                                     le64toh(f->header->entry_array_offset),
1905                                     le64toh(f->header->n_entries),
1906                                     seqnum,
1907                                     test_object_seqnum,
1908                                     direction,
1909                                     ret, offset, NULL);
1910 }
1911
1912 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1913         Object *o;
1914         int r;
1915
1916         assert(f);
1917         assert(p > 0);
1918
1919         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1920         if (r < 0)
1921                 return r;
1922
1923         if (le64toh(o->entry.realtime) == needle)
1924                 return TEST_FOUND;
1925         else if (le64toh(o->entry.realtime) < needle)
1926                 return TEST_LEFT;
1927         else
1928                 return TEST_RIGHT;
1929 }
1930
1931 int journal_file_move_to_entry_by_realtime(
1932                 JournalFile *f,
1933                 uint64_t realtime,
1934                 direction_t direction,
1935                 Object **ret,
1936                 uint64_t *offset) {
1937
1938         return generic_array_bisect(f,
1939                                     le64toh(f->header->entry_array_offset),
1940                                     le64toh(f->header->n_entries),
1941                                     realtime,
1942                                     test_object_realtime,
1943                                     direction,
1944                                     ret, offset, NULL);
1945 }
1946
1947 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1948         Object *o;
1949         int r;
1950
1951         assert(f);
1952         assert(p > 0);
1953
1954         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1955         if (r < 0)
1956                 return r;
1957
1958         if (le64toh(o->entry.monotonic) == needle)
1959                 return TEST_FOUND;
1960         else if (le64toh(o->entry.monotonic) < needle)
1961                 return TEST_LEFT;
1962         else
1963                 return TEST_RIGHT;
1964 }
1965
1966 static int find_data_object_by_boot_id(
1967                 JournalFile *f,
1968                 sd_id128_t boot_id,
1969                 Object **o,
1970                 uint64_t *b) {
1971
1972         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1973
1974         sd_id128_to_string(boot_id, t + 9);
1975         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1976 }
1977
1978 int journal_file_move_to_entry_by_monotonic(
1979                 JournalFile *f,
1980                 sd_id128_t boot_id,
1981                 uint64_t monotonic,
1982                 direction_t direction,
1983                 Object **ret,
1984                 uint64_t *offset) {
1985
1986         Object *o;
1987         int r;
1988
1989         assert(f);
1990
1991         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1992         if (r < 0)
1993                 return r;
1994         if (r == 0)
1995                 return -ENOENT;
1996
1997         return generic_array_bisect_plus_one(f,
1998                                              le64toh(o->data.entry_offset),
1999                                              le64toh(o->data.entry_array_offset),
2000                                              le64toh(o->data.n_entries),
2001                                              monotonic,
2002                                              test_object_monotonic,
2003                                              direction,
2004                                              ret, offset, NULL);
2005 }
2006
2007 void journal_file_reset_location(JournalFile *f) {
2008         f->location_type = LOCATION_HEAD;
2009         f->current_offset = 0;
2010         f->current_seqnum = 0;
2011         f->current_realtime = 0;
2012         f->current_monotonic = 0;
2013         zero(f->current_boot_id);
2014         f->current_xor_hash = 0;
2015 }
2016
2017 void journal_file_save_location(JournalFile *f, direction_t direction, Object *o, uint64_t offset) {
2018         f->last_direction = direction;
2019         f->location_type = LOCATION_SEEK;
2020         f->current_offset = offset;
2021         f->current_seqnum = le64toh(o->entry.seqnum);
2022         f->current_realtime = le64toh(o->entry.realtime);
2023         f->current_monotonic = le64toh(o->entry.monotonic);
2024         f->current_boot_id = o->entry.boot_id;
2025         f->current_xor_hash = le64toh(o->entry.xor_hash);
2026 }
2027
2028 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2029         assert(af);
2030         assert(bf);
2031         assert(af->location_type == LOCATION_SEEK);
2032         assert(bf->location_type == LOCATION_SEEK);
2033
2034         /* If contents and timestamps match, these entries are
2035          * identical, even if the seqnum does not match */
2036         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2037             af->current_monotonic == bf->current_monotonic &&
2038             af->current_realtime == bf->current_realtime &&
2039             af->current_xor_hash == bf->current_xor_hash)
2040                 return 0;
2041
2042         if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2043
2044                 /* If this is from the same seqnum source, compare
2045                  * seqnums */
2046                 if (af->current_seqnum < bf->current_seqnum)
2047                         return -1;
2048                 if (af->current_seqnum > bf->current_seqnum)
2049                         return 1;
2050
2051                 /* Wow! This is weird, different data but the same
2052                  * seqnums? Something is borked, but let's make the
2053                  * best of it and compare by time. */
2054         }
2055
2056         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2057
2058                 /* If the boot id matches, compare monotonic time */
2059                 if (af->current_monotonic < bf->current_monotonic)
2060                         return -1;
2061                 if (af->current_monotonic > bf->current_monotonic)
2062                         return 1;
2063         }
2064
2065         /* Otherwise, compare UTC time */
2066         if (af->current_realtime < bf->current_realtime)
2067                 return -1;
2068         if (af->current_realtime > bf->current_realtime)
2069                 return 1;
2070
2071         /* Finally, compare by contents */
2072         if (af->current_xor_hash < bf->current_xor_hash)
2073                 return -1;
2074         if (af->current_xor_hash > bf->current_xor_hash)
2075                 return 1;
2076
2077         return 0;
2078 }
2079
2080 int journal_file_next_entry(
2081                 JournalFile *f,
2082                 uint64_t p,
2083                 direction_t direction,
2084                 Object **ret, uint64_t *offset) {
2085
2086         uint64_t i, n, ofs;
2087         int r;
2088
2089         assert(f);
2090
2091         n = le64toh(f->header->n_entries);
2092         if (n <= 0)
2093                 return 0;
2094
2095         if (p == 0)
2096                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2097         else {
2098                 r = generic_array_bisect(f,
2099                                          le64toh(f->header->entry_array_offset),
2100                                          le64toh(f->header->n_entries),
2101                                          p,
2102                                          test_object_offset,
2103                                          DIRECTION_DOWN,
2104                                          NULL, NULL,
2105                                          &i);
2106                 if (r <= 0)
2107                         return r;
2108
2109                 if (direction == DIRECTION_DOWN) {
2110                         if (i >= n - 1)
2111                                 return 0;
2112
2113                         i++;
2114                 } else {
2115                         if (i <= 0)
2116                                 return 0;
2117
2118                         i--;
2119                 }
2120         }
2121
2122         /* And jump to it */
2123         r = generic_array_get(f,
2124                               le64toh(f->header->entry_array_offset),
2125                               i,
2126                               ret, &ofs);
2127         if (r <= 0)
2128                 return r;
2129
2130         if (p > 0 &&
2131             (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2132                 log_debug("%s: entry array corrupted at entry %"PRIu64,
2133                           f->path, i);
2134                 return -EBADMSG;
2135         }
2136
2137         if (offset)
2138                 *offset = ofs;
2139
2140         return 1;
2141 }
2142
2143 int journal_file_next_entry_for_data(
2144                 JournalFile *f,
2145                 Object *o, uint64_t p,
2146                 uint64_t data_offset,
2147                 direction_t direction,
2148                 Object **ret, uint64_t *offset) {
2149
2150         uint64_t n, i;
2151         int r;
2152         Object *d;
2153
2154         assert(f);
2155         assert(p > 0 || !o);
2156
2157         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2158         if (r < 0)
2159                 return r;
2160
2161         n = le64toh(d->data.n_entries);
2162         if (n <= 0)
2163                 return n;
2164
2165         if (!o)
2166                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2167         else {
2168                 if (o->object.type != OBJECT_ENTRY)
2169                         return -EINVAL;
2170
2171                 r = generic_array_bisect_plus_one(f,
2172                                                   le64toh(d->data.entry_offset),
2173                                                   le64toh(d->data.entry_array_offset),
2174                                                   le64toh(d->data.n_entries),
2175                                                   p,
2176                                                   test_object_offset,
2177                                                   DIRECTION_DOWN,
2178                                                   NULL, NULL,
2179                                                   &i);
2180
2181                 if (r <= 0)
2182                         return r;
2183
2184                 if (direction == DIRECTION_DOWN) {
2185                         if (i >= n - 1)
2186                                 return 0;
2187
2188                         i++;
2189                 } else {
2190                         if (i <= 0)
2191                                 return 0;
2192
2193                         i--;
2194                 }
2195
2196         }
2197
2198         return generic_array_get_plus_one(f,
2199                                           le64toh(d->data.entry_offset),
2200                                           le64toh(d->data.entry_array_offset),
2201                                           i,
2202                                           ret, offset);
2203 }
2204
2205 int journal_file_move_to_entry_by_offset_for_data(
2206                 JournalFile *f,
2207                 uint64_t data_offset,
2208                 uint64_t p,
2209                 direction_t direction,
2210                 Object **ret, uint64_t *offset) {
2211
2212         int r;
2213         Object *d;
2214
2215         assert(f);
2216
2217         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2218         if (r < 0)
2219                 return r;
2220
2221         return generic_array_bisect_plus_one(f,
2222                                              le64toh(d->data.entry_offset),
2223                                              le64toh(d->data.entry_array_offset),
2224                                              le64toh(d->data.n_entries),
2225                                              p,
2226                                              test_object_offset,
2227                                              direction,
2228                                              ret, offset, NULL);
2229 }
2230
2231 int journal_file_move_to_entry_by_monotonic_for_data(
2232                 JournalFile *f,
2233                 uint64_t data_offset,
2234                 sd_id128_t boot_id,
2235                 uint64_t monotonic,
2236                 direction_t direction,
2237                 Object **ret, uint64_t *offset) {
2238
2239         Object *o, *d;
2240         int r;
2241         uint64_t b, z;
2242
2243         assert(f);
2244
2245         /* First, seek by time */
2246         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2247         if (r < 0)
2248                 return r;
2249         if (r == 0)
2250                 return -ENOENT;
2251
2252         r = generic_array_bisect_plus_one(f,
2253                                           le64toh(o->data.entry_offset),
2254                                           le64toh(o->data.entry_array_offset),
2255                                           le64toh(o->data.n_entries),
2256                                           monotonic,
2257                                           test_object_monotonic,
2258                                           direction,
2259                                           NULL, &z, NULL);
2260         if (r <= 0)
2261                 return r;
2262
2263         /* And now, continue seeking until we find an entry that
2264          * exists in both bisection arrays */
2265
2266         for (;;) {
2267                 Object *qo;
2268                 uint64_t p, q;
2269
2270                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2271                 if (r < 0)
2272                         return r;
2273
2274                 r = generic_array_bisect_plus_one(f,
2275                                                   le64toh(d->data.entry_offset),
2276                                                   le64toh(d->data.entry_array_offset),
2277                                                   le64toh(d->data.n_entries),
2278                                                   z,
2279                                                   test_object_offset,
2280                                                   direction,
2281                                                   NULL, &p, NULL);
2282                 if (r <= 0)
2283                         return r;
2284
2285                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2286                 if (r < 0)
2287                         return r;
2288
2289                 r = generic_array_bisect_plus_one(f,
2290                                                   le64toh(o->data.entry_offset),
2291                                                   le64toh(o->data.entry_array_offset),
2292                                                   le64toh(o->data.n_entries),
2293                                                   p,
2294                                                   test_object_offset,
2295                                                   direction,
2296                                                   &qo, &q, NULL);
2297
2298                 if (r <= 0)
2299                         return r;
2300
2301                 if (p == q) {
2302                         if (ret)
2303                                 *ret = qo;
2304                         if (offset)
2305                                 *offset = q;
2306
2307                         return 1;
2308                 }
2309
2310                 z = q;
2311         }
2312 }
2313
2314 int journal_file_move_to_entry_by_seqnum_for_data(
2315                 JournalFile *f,
2316                 uint64_t data_offset,
2317                 uint64_t seqnum,
2318                 direction_t direction,
2319                 Object **ret, uint64_t *offset) {
2320
2321         Object *d;
2322         int r;
2323
2324         assert(f);
2325
2326         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2327         if (r < 0)
2328                 return r;
2329
2330         return generic_array_bisect_plus_one(f,
2331                                              le64toh(d->data.entry_offset),
2332                                              le64toh(d->data.entry_array_offset),
2333                                              le64toh(d->data.n_entries),
2334                                              seqnum,
2335                                              test_object_seqnum,
2336                                              direction,
2337                                              ret, offset, NULL);
2338 }
2339
2340 int journal_file_move_to_entry_by_realtime_for_data(
2341                 JournalFile *f,
2342                 uint64_t data_offset,
2343                 uint64_t realtime,
2344                 direction_t direction,
2345                 Object **ret, uint64_t *offset) {
2346
2347         Object *d;
2348         int r;
2349
2350         assert(f);
2351
2352         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2353         if (r < 0)
2354                 return r;
2355
2356         return generic_array_bisect_plus_one(f,
2357                                              le64toh(d->data.entry_offset),
2358                                              le64toh(d->data.entry_array_offset),
2359                                              le64toh(d->data.n_entries),
2360                                              realtime,
2361                                              test_object_realtime,
2362                                              direction,
2363                                              ret, offset, NULL);
2364 }
2365
2366 void journal_file_dump(JournalFile *f) {
2367         Object *o;
2368         int r;
2369         uint64_t p;
2370
2371         assert(f);
2372
2373         journal_file_print_header(f);
2374
2375         p = le64toh(f->header->header_size);
2376         while (p != 0) {
2377                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2378                 if (r < 0)
2379                         goto fail;
2380
2381                 switch (o->object.type) {
2382
2383                 case OBJECT_UNUSED:
2384                         printf("Type: OBJECT_UNUSED\n");
2385                         break;
2386
2387                 case OBJECT_DATA:
2388                         printf("Type: OBJECT_DATA\n");
2389                         break;
2390
2391                 case OBJECT_FIELD:
2392                         printf("Type: OBJECT_FIELD\n");
2393                         break;
2394
2395                 case OBJECT_ENTRY:
2396                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2397                                le64toh(o->entry.seqnum),
2398                                le64toh(o->entry.monotonic),
2399                                le64toh(o->entry.realtime));
2400                         break;
2401
2402                 case OBJECT_FIELD_HASH_TABLE:
2403                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2404                         break;
2405
2406                 case OBJECT_DATA_HASH_TABLE:
2407                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2408                         break;
2409
2410                 case OBJECT_ENTRY_ARRAY:
2411                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2412                         break;
2413
2414                 case OBJECT_TAG:
2415                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2416                                le64toh(o->tag.seqnum),
2417                                le64toh(o->tag.epoch));
2418                         break;
2419
2420                 default:
2421                         printf("Type: unknown (%i)\n", o->object.type);
2422                         break;
2423                 }
2424
2425                 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2426                         printf("Flags: %s\n",
2427                                object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2428
2429                 if (p == le64toh(f->header->tail_object_offset))
2430                         p = 0;
2431                 else
2432                         p = p + ALIGN64(le64toh(o->object.size));
2433         }
2434
2435         return;
2436 fail:
2437         log_error("File corrupt");
2438 }
2439
2440 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2441         const char *x;
2442
2443         x = format_timestamp(buf, l, t);
2444         if (x)
2445                 return x;
2446         return " --- ";
2447 }
2448
2449 void journal_file_print_header(JournalFile *f) {
2450         char a[33], b[33], c[33], d[33];
2451         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2452         struct stat st;
2453         char bytes[FORMAT_BYTES_MAX];
2454
2455         assert(f);
2456
2457         printf("File Path: %s\n"
2458                "File ID: %s\n"
2459                "Machine ID: %s\n"
2460                "Boot ID: %s\n"
2461                "Sequential Number ID: %s\n"
2462                "State: %s\n"
2463                "Compatible Flags:%s%s\n"
2464                "Incompatible Flags:%s%s%s\n"
2465                "Header size: %"PRIu64"\n"
2466                "Arena size: %"PRIu64"\n"
2467                "Data Hash Table Size: %"PRIu64"\n"
2468                "Field Hash Table Size: %"PRIu64"\n"
2469                "Rotate Suggested: %s\n"
2470                "Head Sequential Number: %"PRIu64"\n"
2471                "Tail Sequential Number: %"PRIu64"\n"
2472                "Head Realtime Timestamp: %s\n"
2473                "Tail Realtime Timestamp: %s\n"
2474                "Tail Monotonic Timestamp: %s\n"
2475                "Objects: %"PRIu64"\n"
2476                "Entry Objects: %"PRIu64"\n",
2477                f->path,
2478                sd_id128_to_string(f->header->file_id, a),
2479                sd_id128_to_string(f->header->machine_id, b),
2480                sd_id128_to_string(f->header->boot_id, c),
2481                sd_id128_to_string(f->header->seqnum_id, d),
2482                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2483                f->header->state == STATE_ONLINE ? "ONLINE" :
2484                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2485                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2486                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2487                JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2488                JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2489                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2490                le64toh(f->header->header_size),
2491                le64toh(f->header->arena_size),
2492                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2493                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2494                yes_no(journal_file_rotate_suggested(f, 0)),
2495                le64toh(f->header->head_entry_seqnum),
2496                le64toh(f->header->tail_entry_seqnum),
2497                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2498                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2499                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2500                le64toh(f->header->n_objects),
2501                le64toh(f->header->n_entries));
2502
2503         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2504                 printf("Data Objects: %"PRIu64"\n"
2505                        "Data Hash Table Fill: %.1f%%\n",
2506                        le64toh(f->header->n_data),
2507                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2508
2509         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2510                 printf("Field Objects: %"PRIu64"\n"
2511                        "Field Hash Table Fill: %.1f%%\n",
2512                        le64toh(f->header->n_fields),
2513                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2514
2515         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2516                 printf("Tag Objects: %"PRIu64"\n",
2517                        le64toh(f->header->n_tags));
2518         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2519                 printf("Entry Array Objects: %"PRIu64"\n",
2520                        le64toh(f->header->n_entry_arrays));
2521
2522         if (fstat(f->fd, &st) >= 0)
2523                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2524 }
2525
2526 int journal_file_open(
2527                 const char *fname,
2528                 int flags,
2529                 mode_t mode,
2530                 bool compress,
2531                 bool seal,
2532                 JournalMetrics *metrics,
2533                 MMapCache *mmap_cache,
2534                 JournalFile *template,
2535                 JournalFile **ret) {
2536
2537         bool newly_created = false;
2538         JournalFile *f;
2539         void *h;
2540         int r;
2541
2542         assert(fname);
2543         assert(ret);
2544
2545         if ((flags & O_ACCMODE) != O_RDONLY &&
2546             (flags & O_ACCMODE) != O_RDWR)
2547                 return -EINVAL;
2548
2549         if (!endswith(fname, ".journal") &&
2550             !endswith(fname, ".journal~"))
2551                 return -EINVAL;
2552
2553         f = new0(JournalFile, 1);
2554         if (!f)
2555                 return -ENOMEM;
2556
2557         f->fd = -1;
2558         f->mode = mode;
2559
2560         f->flags = flags;
2561         f->prot = prot_from_flags(flags);
2562         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2563 #if defined(HAVE_LZ4)
2564         f->compress_lz4 = compress;
2565 #elif defined(HAVE_XZ)
2566         f->compress_xz = compress;
2567 #endif
2568 #ifdef HAVE_GCRYPT
2569         f->seal = seal;
2570 #endif
2571
2572         if (mmap_cache)
2573                 f->mmap = mmap_cache_ref(mmap_cache);
2574         else {
2575                 f->mmap = mmap_cache_new();
2576                 if (!f->mmap) {
2577                         r = -ENOMEM;
2578                         goto fail;
2579                 }
2580         }
2581
2582         f->path = strdup(fname);
2583         if (!f->path) {
2584                 r = -ENOMEM;
2585                 goto fail;
2586         }
2587
2588         f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2589         if (!f->chain_cache) {
2590                 r = -ENOMEM;
2591                 goto fail;
2592         }
2593
2594         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2595         if (f->fd < 0) {
2596                 r = -errno;
2597                 goto fail;
2598         }
2599
2600         r = journal_file_fstat(f);
2601         if (r < 0)
2602                 goto fail;
2603
2604         if (f->last_stat.st_size == 0 && f->writable) {
2605
2606                 /* Before we write anything, turn off COW logic. Given
2607                  * our write pattern that is quite unfriendly to COW
2608                  * file systems this should greatly improve
2609                  * performance on COW file systems, such as btrfs, at
2610                  * the expense of data integrity features (which
2611                  * shouldn't be too bad, given that we do our own
2612                  * checksumming). */
2613                 r = chattr_fd(f->fd, true, FS_NOCOW_FL);
2614                 if (r < 0)
2615                         log_warning_errno(errno, "Failed to set file attributes: %m");
2616
2617                 /* Let's attach the creation time to the journal file,
2618                  * so that the vacuuming code knows the age of this
2619                  * file even if the file might end up corrupted one
2620                  * day... Ideally we'd just use the creation time many
2621                  * file systems maintain for each file, but there is
2622                  * currently no usable API to query this, hence let's
2623                  * emulate this via extended attributes. If extended
2624                  * attributes are not supported we'll just skip this,
2625                  * and rely solely on mtime/atime/ctime of the file. */
2626
2627                 fd_setcrtime(f->fd, 0);
2628
2629 #ifdef HAVE_GCRYPT
2630                 /* Try to load the FSPRG state, and if we can't, then
2631                  * just don't do sealing */
2632                 if (f->seal) {
2633                         r = journal_file_fss_load(f);
2634                         if (r < 0)
2635                                 f->seal = false;
2636                 }
2637 #endif
2638
2639                 r = journal_file_init_header(f, template);
2640                 if (r < 0)
2641                         goto fail;
2642
2643                 r = journal_file_fstat(f);
2644                 if (r < 0)
2645                         goto fail;
2646
2647                 newly_created = true;
2648         }
2649
2650         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2651                 r = -EIO;
2652                 goto fail;
2653         }
2654
2655         r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2656         if (r < 0) {
2657                 r = -errno;
2658                 goto fail;
2659         }
2660
2661         f->header = h;
2662
2663         if (!newly_created) {
2664                 r = journal_file_verify_header(f);
2665                 if (r < 0)
2666                         goto fail;
2667         }
2668
2669 #ifdef HAVE_GCRYPT
2670         if (!newly_created && f->writable) {
2671                 r = journal_file_fss_load(f);
2672                 if (r < 0)
2673                         goto fail;
2674         }
2675 #endif
2676
2677         if (f->writable) {
2678                 if (metrics) {
2679                         journal_default_metrics(metrics, f->fd);
2680                         f->metrics = *metrics;
2681                 } else if (template)
2682                         f->metrics = template->metrics;
2683
2684                 r = journal_file_refresh_header(f);
2685                 if (r < 0)
2686                         goto fail;
2687         }
2688
2689 #ifdef HAVE_GCRYPT
2690         r = journal_file_hmac_setup(f);
2691         if (r < 0)
2692                 goto fail;
2693 #endif
2694
2695         if (newly_created) {
2696                 r = journal_file_setup_field_hash_table(f);
2697                 if (r < 0)
2698                         goto fail;
2699
2700                 r = journal_file_setup_data_hash_table(f);
2701                 if (r < 0)
2702                         goto fail;
2703
2704 #ifdef HAVE_GCRYPT
2705                 r = journal_file_append_first_tag(f);
2706                 if (r < 0)
2707                         goto fail;
2708 #endif
2709         }
2710
2711         r = journal_file_map_field_hash_table(f);
2712         if (r < 0)
2713                 goto fail;
2714
2715         r = journal_file_map_data_hash_table(f);
2716         if (r < 0)
2717                 goto fail;
2718
2719         if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2720                 r = -EIO;
2721                 goto fail;
2722         }
2723
2724         *ret = f;
2725         return 0;
2726
2727 fail:
2728         if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2729                 r = -EIO;
2730
2731         journal_file_close(f);
2732
2733         return r;
2734 }
2735
2736 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2737         _cleanup_free_ char *p = NULL;
2738         size_t l;
2739         JournalFile *old_file, *new_file = NULL;
2740         int r;
2741
2742         assert(f);
2743         assert(*f);
2744
2745         old_file = *f;
2746
2747         if (!old_file->writable)
2748                 return -EINVAL;
2749
2750         if (!endswith(old_file->path, ".journal"))
2751                 return -EINVAL;
2752
2753         l = strlen(old_file->path);
2754         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2755                      (int) l - 8, old_file->path,
2756                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2757                      le64toh((*f)->header->head_entry_seqnum),
2758                      le64toh((*f)->header->head_entry_realtime));
2759         if (r < 0)
2760                 return -ENOMEM;
2761
2762         /* Try to rename the file to the archived version. If the file
2763          * already was deleted, we'll get ENOENT, let's ignore that
2764          * case. */
2765         r = rename(old_file->path, p);
2766         if (r < 0 && errno != ENOENT)
2767                 return -errno;
2768
2769         old_file->header->state = STATE_ARCHIVED;
2770
2771         /* Currently, btrfs is not very good with out write patterns
2772          * and fragments heavily. Let's defrag our journal files when
2773          * we archive them */
2774         old_file->defrag_on_close = true;
2775
2776         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2777         journal_file_close(old_file);
2778
2779         *f = new_file;
2780         return r;
2781 }
2782
2783 int journal_file_open_reliably(
2784                 const char *fname,
2785                 int flags,
2786                 mode_t mode,
2787                 bool compress,
2788                 bool seal,
2789                 JournalMetrics *metrics,
2790                 MMapCache *mmap_cache,
2791                 JournalFile *template,
2792                 JournalFile **ret) {
2793
2794         int r;
2795         size_t l;
2796         _cleanup_free_ char *p = NULL;
2797
2798         r = journal_file_open(fname, flags, mode, compress, seal,
2799                               metrics, mmap_cache, template, ret);
2800         if (r != -EBADMSG && /* corrupted */
2801             r != -ENODATA && /* truncated */
2802             r != -EHOSTDOWN && /* other machine */
2803             r != -EPROTONOSUPPORT && /* incompatible feature */
2804             r != -EBUSY && /* unclean shutdown */
2805             r != -ESHUTDOWN && /* already archived */
2806             r != -EIO && /* IO error, including SIGBUS on mmap */
2807             r != -EIDRM /* File has been deleted */)
2808                 return r;
2809
2810         if ((flags & O_ACCMODE) == O_RDONLY)
2811                 return r;
2812
2813         if (!(flags & O_CREAT))
2814                 return r;
2815
2816         if (!endswith(fname, ".journal"))
2817                 return r;
2818
2819         /* The file is corrupted. Rotate it away and try it again (but only once) */
2820
2821         l = strlen(fname);
2822         if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2823                      (int) l - 8, fname,
2824                      (unsigned long long) now(CLOCK_REALTIME),
2825                      random_u64()) < 0)
2826                 return -ENOMEM;
2827
2828         r = rename(fname, p);
2829         if (r < 0)
2830                 return -errno;
2831
2832         /* btrfs doesn't cope well with our write pattern and
2833          * fragments heavily. Let's defrag all files we rotate */
2834
2835         (void) chattr_path(p, false, FS_NOCOW_FL);
2836         (void) btrfs_defrag(p);
2837
2838         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2839
2840         return journal_file_open(fname, flags, mode, compress, seal,
2841                                  metrics, mmap_cache, template, ret);
2842 }
2843
2844 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2845         uint64_t i, n;
2846         uint64_t q, xor_hash = 0;
2847         int r;
2848         EntryItem *items;
2849         dual_timestamp ts;
2850
2851         assert(from);
2852         assert(to);
2853         assert(o);
2854         assert(p);
2855
2856         if (!to->writable)
2857                 return -EPERM;
2858
2859         ts.monotonic = le64toh(o->entry.monotonic);
2860         ts.realtime = le64toh(o->entry.realtime);
2861
2862         n = journal_file_entry_n_items(o);
2863         /* alloca() can't take 0, hence let's allocate at least one */
2864         items = alloca(sizeof(EntryItem) * MAX(1u, n));
2865
2866         for (i = 0; i < n; i++) {
2867                 uint64_t l, h;
2868                 le64_t le_hash;
2869                 size_t t;
2870                 void *data;
2871                 Object *u;
2872
2873                 q = le64toh(o->entry.items[i].object_offset);
2874                 le_hash = o->entry.items[i].hash;
2875
2876                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2877                 if (r < 0)
2878                         return r;
2879
2880                 if (le_hash != o->data.hash)
2881                         return -EBADMSG;
2882
2883                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2884                 t = (size_t) l;
2885
2886                 /* We hit the limit on 32bit machines */
2887                 if ((uint64_t) t != l)
2888                         return -E2BIG;
2889
2890                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2891 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2892                         size_t rsize;
2893
2894                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2895                                             o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2896                         if (r < 0)
2897                                 return r;
2898
2899                         data = from->compress_buffer;
2900                         l = rsize;
2901 #else
2902                         return -EPROTONOSUPPORT;
2903 #endif
2904                 } else
2905                         data = o->data.payload;
2906
2907                 r = journal_file_append_data(to, data, l, &u, &h);
2908                 if (r < 0)
2909                         return r;
2910
2911                 xor_hash ^= le64toh(u->data.hash);
2912                 items[i].object_offset = htole64(h);
2913                 items[i].hash = u->data.hash;
2914
2915                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2916                 if (r < 0)
2917                         return r;
2918         }
2919
2920         r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2921
2922         if (mmap_cache_got_sigbus(to->mmap, to->fd))
2923                 return -EIO;
2924
2925         return r;
2926 }
2927
2928 void journal_default_metrics(JournalMetrics *m, int fd) {
2929         uint64_t fs_size = 0;
2930         struct statvfs ss;
2931         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2932
2933         assert(m);
2934         assert(fd >= 0);
2935
2936         if (fstatvfs(fd, &ss) >= 0)
2937                 fs_size = ss.f_frsize * ss.f_blocks;
2938
2939         if (m->max_use == (uint64_t) -1) {
2940
2941                 if (fs_size > 0) {
2942                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2943
2944                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2945                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2946
2947                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2948                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2949                 } else
2950                         m->max_use = DEFAULT_MAX_USE_LOWER;
2951         } else {
2952                 m->max_use = PAGE_ALIGN(m->max_use);
2953
2954                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2955                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2956         }
2957
2958         if (m->max_size == (uint64_t) -1) {
2959                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2960
2961                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2962                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2963         } else
2964                 m->max_size = PAGE_ALIGN(m->max_size);
2965
2966         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2967                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2968
2969         if (m->max_size*2 > m->max_use)
2970                 m->max_use = m->max_size*2;
2971
2972         if (m->min_size == (uint64_t) -1)
2973                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2974         else {
2975                 m->min_size = PAGE_ALIGN(m->min_size);
2976
2977                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2978                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2979
2980                 if (m->min_size > m->max_size)
2981                         m->max_size = m->min_size;
2982         }
2983
2984         if (m->keep_free == (uint64_t) -1) {
2985
2986                 if (fs_size > 0) {
2987                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2988
2989                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2990                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2991
2992                 } else
2993                         m->keep_free = DEFAULT_KEEP_FREE;
2994         }
2995
2996         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2997                   format_bytes(a, sizeof(a), m->max_use),
2998                   format_bytes(b, sizeof(b), m->max_size),
2999                   format_bytes(c, sizeof(c), m->min_size),
3000                   format_bytes(d, sizeof(d), m->keep_free));
3001 }
3002
3003 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3004         assert(f);
3005         assert(from || to);
3006
3007         if (from) {
3008                 if (f->header->head_entry_realtime == 0)
3009                         return -ENOENT;
3010
3011                 *from = le64toh(f->header->head_entry_realtime);
3012         }
3013
3014         if (to) {
3015                 if (f->header->tail_entry_realtime == 0)
3016                         return -ENOENT;
3017
3018                 *to = le64toh(f->header->tail_entry_realtime);
3019         }
3020
3021         return 1;
3022 }
3023
3024 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3025         Object *o;
3026         uint64_t p;
3027         int r;
3028
3029         assert(f);
3030         assert(from || to);
3031
3032         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3033         if (r <= 0)
3034                 return r;
3035
3036         if (le64toh(o->data.n_entries) <= 0)
3037                 return 0;
3038
3039         if (from) {
3040                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3041                 if (r < 0)
3042                         return r;
3043
3044                 *from = le64toh(o->entry.monotonic);
3045         }
3046
3047         if (to) {
3048                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3049                 if (r < 0)
3050                         return r;
3051
3052                 r = generic_array_get_plus_one(f,
3053                                                le64toh(o->data.entry_offset),
3054                                                le64toh(o->data.entry_array_offset),
3055                                                le64toh(o->data.n_entries)-1,
3056                                                &o, NULL);
3057                 if (r <= 0)
3058                         return r;
3059
3060                 *to = le64toh(o->entry.monotonic);
3061         }
3062
3063         return 1;
3064 }
3065
3066 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3067         assert(f);
3068
3069         /* If we gained new header fields we gained new features,
3070          * hence suggest a rotation */
3071         if (le64toh(f->header->header_size) < sizeof(Header)) {
3072                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3073                 return true;
3074         }
3075
3076         /* Let's check if the hash tables grew over a certain fill
3077          * level (75%, borrowing this value from Java's hash table
3078          * implementation), and if so suggest a rotation. To calculate
3079          * the fill level we need the n_data field, which only exists
3080          * in newer versions. */
3081
3082         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3083                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3084                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3085                                   f->path,
3086                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3087                                   le64toh(f->header->n_data),
3088                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3089                                   (unsigned long long) f->last_stat.st_size,
3090                                   f->last_stat.st_size / le64toh(f->header->n_data));
3091                         return true;
3092                 }
3093
3094         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3095                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3096                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3097                                   f->path,
3098                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3099                                   le64toh(f->header->n_fields),
3100                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3101                         return true;
3102                 }
3103
3104         /* Are the data objects properly indexed by field objects? */
3105         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3106             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3107             le64toh(f->header->n_data) > 0 &&
3108             le64toh(f->header->n_fields) == 0)
3109                 return true;
3110
3111         if (max_file_usec > 0) {
3112                 usec_t t, h;
3113
3114                 h = le64toh(f->header->head_entry_realtime);
3115                 t = now(CLOCK_REALTIME);
3116
3117                 if (h > 0 && t > h + max_file_usec)
3118                         return true;
3119         }
3120
3121         return false;
3122 }