chiark / gitweb /
journal: fix Inappropriate ioctl for device on ext4
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29 #include <linux/fs.h>
30
31 #include "btrfs-util.h"
32 #include "journal-def.h"
33 #include "journal-file.h"
34 #include "journal-authenticate.h"
35 #include "lookup3.h"
36 #include "compress.h"
37
38 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
39 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
40
41 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
42
43 /* This is the minimum journal file size */
44 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL)           /* 4 MiB */
45
46 /* These are the lower and upper bounds if we deduce the max_use value
47  * from the file system size */
48 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
49 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
50
51 /* This is the upper bound if we deduce max_size from max_use */
52 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
53
54 /* This is the upper bound if we deduce the keep_free value from the
55  * file system size */
56 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
57
58 /* This is the keep_free value when we can't determine the file
59  * system size */
60 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */
61
62 /* n_data was the first entry we added after the initial file format design */
63 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
64
65 /* How many entries to keep in the entry array chain cache at most */
66 #define CHAIN_CACHE_MAX 20
67
68 /* How much to increase the journal file size at once each time we allocate something new. */
69 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8 MiB */
70
71 /* Reread fstat() of the file for detecting deletions at least this often */
72 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
73
74 /* The mmap context to use for the header; we pick one above the last defined object type */
75 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
76
77 static int journal_file_set_online(JournalFile *f) {
78         assert(f);
79
80         if (!f->writable)
81                 return -EPERM;
82
83         if (!(f->fd >= 0 && f->header))
84                 return -EINVAL;
85
86         if (mmap_cache_got_sigbus(f->mmap, f->fd))
87                 return -EIO;
88
89         switch(f->header->state) {
90                 case STATE_ONLINE:
91                         return 0;
92
93                 case STATE_OFFLINE:
94                         f->header->state = STATE_ONLINE;
95                         fsync(f->fd);
96                         return 0;
97
98                 default:
99                         return -EINVAL;
100         }
101 }
102
103 int journal_file_set_offline(JournalFile *f) {
104         assert(f);
105
106         if (!f->writable)
107                 return -EPERM;
108
109         if (!(f->fd >= 0 && f->header))
110                 return -EINVAL;
111
112         if (f->header->state != STATE_ONLINE)
113                 return 0;
114
115         fsync(f->fd);
116
117         if (mmap_cache_got_sigbus(f->mmap, f->fd))
118                 return -EIO;
119
120         f->header->state = STATE_OFFLINE;
121
122         if (mmap_cache_got_sigbus(f->mmap, f->fd))
123                 return -EIO;
124
125         fsync(f->fd);
126
127         return 0;
128 }
129
130 void journal_file_close(JournalFile *f) {
131         assert(f);
132
133 #ifdef HAVE_GCRYPT
134         /* Write the final tag */
135         if (f->seal && f->writable)
136                 journal_file_append_tag(f);
137 #endif
138
139         journal_file_set_offline(f);
140
141         if (f->mmap && f->fd >= 0)
142                 mmap_cache_close_fd(f->mmap, f->fd);
143
144         if (f->fd >= 0 && f->defrag_on_close) {
145
146                 /* Be friendly to btrfs: turn COW back on again now,
147                  * and defragment the file. We won't write to the file
148                  * ever again, hence remove all fragmentation, and
149                  * reenable all the good bits COW usually provides
150                  * (such as data checksumming). */
151
152                 (void) chattr_fd(f->fd, false, FS_NOCOW_FL);
153                 (void) btrfs_defrag_fd(f->fd);
154         }
155
156         safe_close(f->fd);
157         free(f->path);
158
159         if (f->mmap)
160                 mmap_cache_unref(f->mmap);
161
162         ordered_hashmap_free_free(f->chain_cache);
163
164 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
165         free(f->compress_buffer);
166 #endif
167
168 #ifdef HAVE_GCRYPT
169         if (f->fss_file)
170                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
171         else if (f->fsprg_state)
172                 free(f->fsprg_state);
173
174         free(f->fsprg_seed);
175
176         if (f->hmac)
177                 gcry_md_close(f->hmac);
178 #endif
179
180         free(f);
181 }
182
183 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
184         Header h = {};
185         ssize_t k;
186         int r;
187
188         assert(f);
189
190         memcpy(h.signature, HEADER_SIGNATURE, 8);
191         h.header_size = htole64(ALIGN64(sizeof(h)));
192
193         h.incompatible_flags |= htole32(
194                 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
195                 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
196
197         h.compatible_flags = htole32(
198                 f->seal * HEADER_COMPATIBLE_SEALED);
199
200         r = sd_id128_randomize(&h.file_id);
201         if (r < 0)
202                 return r;
203
204         if (template) {
205                 h.seqnum_id = template->header->seqnum_id;
206                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
207         } else
208                 h.seqnum_id = h.file_id;
209
210         k = pwrite(f->fd, &h, sizeof(h), 0);
211         if (k < 0)
212                 return -errno;
213
214         if (k != sizeof(h))
215                 return -EIO;
216
217         return 0;
218 }
219
220 static int journal_file_refresh_header(JournalFile *f) {
221         sd_id128_t boot_id;
222         int r;
223
224         assert(f);
225
226         r = sd_id128_get_machine(&f->header->machine_id);
227         if (r < 0)
228                 return r;
229
230         r = sd_id128_get_boot(&boot_id);
231         if (r < 0)
232                 return r;
233
234         if (sd_id128_equal(boot_id, f->header->boot_id))
235                 f->tail_entry_monotonic_valid = true;
236
237         f->header->boot_id = boot_id;
238
239         r = journal_file_set_online(f);
240
241         /* Sync the online state to disk */
242         fsync(f->fd);
243
244         return r;
245 }
246
247 static int journal_file_verify_header(JournalFile *f) {
248         uint32_t flags;
249
250         assert(f);
251
252         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
253                 return -EBADMSG;
254
255         /* In both read and write mode we refuse to open files with
256          * incompatible flags we don't know */
257         flags = le32toh(f->header->incompatible_flags);
258         if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
259                 if (flags & ~HEADER_INCOMPATIBLE_ANY)
260                         log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
261                                   f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
262                 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
263                 if (flags)
264                         log_debug("Journal file %s uses incompatible flags %"PRIx32
265                                   " disabled at compilation time.", f->path, flags);
266                 return -EPROTONOSUPPORT;
267         }
268
269         /* When open for writing we refuse to open files with
270          * compatible flags, too */
271         flags = le32toh(f->header->compatible_flags);
272         if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
273                 if (flags & ~HEADER_COMPATIBLE_ANY)
274                         log_debug("Journal file %s has unknown compatible flags %"PRIx32,
275                                   f->path, flags & ~HEADER_COMPATIBLE_ANY);
276                 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
277                 if (flags)
278                         log_debug("Journal file %s uses compatible flags %"PRIx32
279                                   " disabled at compilation time.", f->path, flags);
280                 return -EPROTONOSUPPORT;
281         }
282
283         if (f->header->state >= _STATE_MAX)
284                 return -EBADMSG;
285
286         /* The first addition was n_data, so check that we are at least this large */
287         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
288                 return -EBADMSG;
289
290         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
291                 return -EBADMSG;
292
293         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
294                 return -ENODATA;
295
296         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
297                 return -ENODATA;
298
299         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
300             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
301             !VALID64(le64toh(f->header->tail_object_offset)) ||
302             !VALID64(le64toh(f->header->entry_array_offset)))
303                 return -ENODATA;
304
305         if (f->writable) {
306                 uint8_t state;
307                 sd_id128_t machine_id;
308                 int r;
309
310                 r = sd_id128_get_machine(&machine_id);
311                 if (r < 0)
312                         return r;
313
314                 if (!sd_id128_equal(machine_id, f->header->machine_id))
315                         return -EHOSTDOWN;
316
317                 state = f->header->state;
318
319                 if (state == STATE_ONLINE) {
320                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
321                         return -EBUSY;
322                 } else if (state == STATE_ARCHIVED)
323                         return -ESHUTDOWN;
324                 else if (state != STATE_OFFLINE) {
325                         log_debug("Journal file %s has unknown state %i.", f->path, state);
326                         return -EBUSY;
327                 }
328         }
329
330         f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
331         f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
332
333         f->seal = JOURNAL_HEADER_SEALED(f->header);
334
335         return 0;
336 }
337
338 static int journal_file_fstat(JournalFile *f) {
339         assert(f);
340         assert(f->fd >= 0);
341
342         if (fstat(f->fd, &f->last_stat) < 0)
343                 return -errno;
344
345         f->last_stat_usec = now(CLOCK_MONOTONIC);
346
347         /* Refuse appending to files that are already deleted */
348         if (f->last_stat.st_nlink <= 0)
349                 return -EIDRM;
350
351         return 0;
352 }
353
354 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
355         uint64_t old_size, new_size;
356         int r;
357
358         assert(f);
359
360         /* We assume that this file is not sparse, and we know that
361          * for sure, since we always call posix_fallocate()
362          * ourselves */
363
364         if (mmap_cache_got_sigbus(f->mmap, f->fd))
365                 return -EIO;
366
367         old_size =
368                 le64toh(f->header->header_size) +
369                 le64toh(f->header->arena_size);
370
371         new_size = PAGE_ALIGN(offset + size);
372         if (new_size < le64toh(f->header->header_size))
373                 new_size = le64toh(f->header->header_size);
374
375         if (new_size <= old_size) {
376
377                 /* We already pre-allocated enough space, but before
378                  * we write to it, let's check with fstat() if the
379                  * file got deleted, in order make sure we don't throw
380                  * away the data immediately. Don't check fstat() for
381                  * all writes though, but only once ever 10s. */
382
383                 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
384                         return 0;
385
386                 return journal_file_fstat(f);
387         }
388
389         /* Allocate more space. */
390
391         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
392                 return -E2BIG;
393
394         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
395                 struct statvfs svfs;
396
397                 if (fstatvfs(f->fd, &svfs) >= 0) {
398                         uint64_t available;
399
400                         available = svfs.f_bfree * svfs.f_bsize;
401
402                         if (available >= f->metrics.keep_free)
403                                 available -= f->metrics.keep_free;
404                         else
405                                 available = 0;
406
407                         if (new_size - old_size > available)
408                                 return -E2BIG;
409                 }
410         }
411
412         /* Increase by larger blocks at once */
413         new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
414         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
415                 new_size = f->metrics.max_size;
416
417         /* Note that the glibc fallocate() fallback is very
418            inefficient, hence we try to minimize the allocation area
419            as we can. */
420         r = posix_fallocate(f->fd, old_size, new_size - old_size);
421         if (r != 0)
422                 return -r;
423
424         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
425
426         return journal_file_fstat(f);
427 }
428
429 static unsigned type_to_context(ObjectType type) {
430         /* One context for each type, plus one catch-all for the rest */
431         assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
432         assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
433         return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
434 }
435
436 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
437         int r;
438
439         assert(f);
440         assert(ret);
441
442         if (size <= 0)
443                 return -EINVAL;
444
445         /* Avoid SIGBUS on invalid accesses */
446         if (offset + size > (uint64_t) f->last_stat.st_size) {
447                 /* Hmm, out of range? Let's refresh the fstat() data
448                  * first, before we trust that check. */
449
450                 r = journal_file_fstat(f);
451                 if (r < 0)
452                         return r;
453
454                 if (offset + size > (uint64_t) f->last_stat.st_size)
455                         return -EADDRNOTAVAIL;
456         }
457
458         return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
459 }
460
461 static uint64_t minimum_header_size(Object *o) {
462
463         static const uint64_t table[] = {
464                 [OBJECT_DATA] = sizeof(DataObject),
465                 [OBJECT_FIELD] = sizeof(FieldObject),
466                 [OBJECT_ENTRY] = sizeof(EntryObject),
467                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
468                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
469                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
470                 [OBJECT_TAG] = sizeof(TagObject),
471         };
472
473         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
474                 return sizeof(ObjectHeader);
475
476         return table[o->object.type];
477 }
478
479 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
480         int r;
481         void *t;
482         Object *o;
483         uint64_t s;
484
485         assert(f);
486         assert(ret);
487
488         /* Objects may only be located at multiple of 64 bit */
489         if (!VALID64(offset))
490                 return -EFAULT;
491
492         r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
493         if (r < 0)
494                 return r;
495
496         o = (Object*) t;
497         s = le64toh(o->object.size);
498
499         if (s < sizeof(ObjectHeader))
500                 return -EBADMSG;
501
502         if (o->object.type <= OBJECT_UNUSED)
503                 return -EBADMSG;
504
505         if (s < minimum_header_size(o))
506                 return -EBADMSG;
507
508         if (type > OBJECT_UNUSED && o->object.type != type)
509                 return -EBADMSG;
510
511         if (s > sizeof(ObjectHeader)) {
512                 r = journal_file_move_to(f, type, false, offset, s, &t);
513                 if (r < 0)
514                         return r;
515
516                 o = (Object*) t;
517         }
518
519         *ret = o;
520         return 0;
521 }
522
523 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
524         uint64_t r;
525
526         assert(f);
527
528         r = le64toh(f->header->tail_entry_seqnum) + 1;
529
530         if (seqnum) {
531                 /* If an external seqnum counter was passed, we update
532                  * both the local and the external one, and set it to
533                  * the maximum of both */
534
535                 if (*seqnum + 1 > r)
536                         r = *seqnum + 1;
537
538                 *seqnum = r;
539         }
540
541         f->header->tail_entry_seqnum = htole64(r);
542
543         if (f->header->head_entry_seqnum == 0)
544                 f->header->head_entry_seqnum = htole64(r);
545
546         return r;
547 }
548
549 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
550         int r;
551         uint64_t p;
552         Object *tail, *o;
553         void *t;
554
555         assert(f);
556         assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
557         assert(size >= sizeof(ObjectHeader));
558         assert(offset);
559         assert(ret);
560
561         r = journal_file_set_online(f);
562         if (r < 0)
563                 return r;
564
565         p = le64toh(f->header->tail_object_offset);
566         if (p == 0)
567                 p = le64toh(f->header->header_size);
568         else {
569                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
570                 if (r < 0)
571                         return r;
572
573                 p += ALIGN64(le64toh(tail->object.size));
574         }
575
576         r = journal_file_allocate(f, p, size);
577         if (r < 0)
578                 return r;
579
580         r = journal_file_move_to(f, type, false, p, size, &t);
581         if (r < 0)
582                 return r;
583
584         o = (Object*) t;
585
586         zero(o->object);
587         o->object.type = type;
588         o->object.size = htole64(size);
589
590         f->header->tail_object_offset = htole64(p);
591         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
592
593         *ret = o;
594         *offset = p;
595
596         return 0;
597 }
598
599 static int journal_file_setup_data_hash_table(JournalFile *f) {
600         uint64_t s, p;
601         Object *o;
602         int r;
603
604         assert(f);
605
606         /* We estimate that we need 1 hash table entry per 768 of
607            journal file and we want to make sure we never get beyond
608            75% fill level. Calculate the hash table size for the
609            maximum file size based on these metrics. */
610
611         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
612         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
613                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
614
615         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
616
617         r = journal_file_append_object(f,
618                                        OBJECT_DATA_HASH_TABLE,
619                                        offsetof(Object, hash_table.items) + s,
620                                        &o, &p);
621         if (r < 0)
622                 return r;
623
624         memzero(o->hash_table.items, s);
625
626         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
627         f->header->data_hash_table_size = htole64(s);
628
629         return 0;
630 }
631
632 static int journal_file_setup_field_hash_table(JournalFile *f) {
633         uint64_t s, p;
634         Object *o;
635         int r;
636
637         assert(f);
638
639         /* We use a fixed size hash table for the fields as this
640          * number should grow very slowly only */
641
642         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
643         r = journal_file_append_object(f,
644                                        OBJECT_FIELD_HASH_TABLE,
645                                        offsetof(Object, hash_table.items) + s,
646                                        &o, &p);
647         if (r < 0)
648                 return r;
649
650         memzero(o->hash_table.items, s);
651
652         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
653         f->header->field_hash_table_size = htole64(s);
654
655         return 0;
656 }
657
658 static int journal_file_map_data_hash_table(JournalFile *f) {
659         uint64_t s, p;
660         void *t;
661         int r;
662
663         assert(f);
664
665         p = le64toh(f->header->data_hash_table_offset);
666         s = le64toh(f->header->data_hash_table_size);
667
668         r = journal_file_move_to(f,
669                                  OBJECT_DATA_HASH_TABLE,
670                                  true,
671                                  p, s,
672                                  &t);
673         if (r < 0)
674                 return r;
675
676         f->data_hash_table = t;
677         return 0;
678 }
679
680 static int journal_file_map_field_hash_table(JournalFile *f) {
681         uint64_t s, p;
682         void *t;
683         int r;
684
685         assert(f);
686
687         p = le64toh(f->header->field_hash_table_offset);
688         s = le64toh(f->header->field_hash_table_size);
689
690         r = journal_file_move_to(f,
691                                  OBJECT_FIELD_HASH_TABLE,
692                                  true,
693                                  p, s,
694                                  &t);
695         if (r < 0)
696                 return r;
697
698         f->field_hash_table = t;
699         return 0;
700 }
701
702 static int journal_file_link_field(
703                 JournalFile *f,
704                 Object *o,
705                 uint64_t offset,
706                 uint64_t hash) {
707
708         uint64_t p, h, m;
709         int r;
710
711         assert(f);
712         assert(o);
713         assert(offset > 0);
714
715         if (o->object.type != OBJECT_FIELD)
716                 return -EINVAL;
717
718         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
719         if (m <= 0)
720                 return -EBADMSG;
721
722         /* This might alter the window we are looking at */
723         o->field.next_hash_offset = o->field.head_data_offset = 0;
724
725         h = hash % m;
726         p = le64toh(f->field_hash_table[h].tail_hash_offset);
727         if (p == 0)
728                 f->field_hash_table[h].head_hash_offset = htole64(offset);
729         else {
730                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
731                 if (r < 0)
732                         return r;
733
734                 o->field.next_hash_offset = htole64(offset);
735         }
736
737         f->field_hash_table[h].tail_hash_offset = htole64(offset);
738
739         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
740                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
741
742         return 0;
743 }
744
745 static int journal_file_link_data(
746                 JournalFile *f,
747                 Object *o,
748                 uint64_t offset,
749                 uint64_t hash) {
750
751         uint64_t p, h, m;
752         int r;
753
754         assert(f);
755         assert(o);
756         assert(offset > 0);
757
758         if (o->object.type != OBJECT_DATA)
759                 return -EINVAL;
760
761         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
762         if (m <= 0)
763                 return -EBADMSG;
764
765         /* This might alter the window we are looking at */
766         o->data.next_hash_offset = o->data.next_field_offset = 0;
767         o->data.entry_offset = o->data.entry_array_offset = 0;
768         o->data.n_entries = 0;
769
770         h = hash % m;
771         p = le64toh(f->data_hash_table[h].tail_hash_offset);
772         if (p == 0)
773                 /* Only entry in the hash table is easy */
774                 f->data_hash_table[h].head_hash_offset = htole64(offset);
775         else {
776                 /* Move back to the previous data object, to patch in
777                  * pointer */
778
779                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
780                 if (r < 0)
781                         return r;
782
783                 o->data.next_hash_offset = htole64(offset);
784         }
785
786         f->data_hash_table[h].tail_hash_offset = htole64(offset);
787
788         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
789                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
790
791         return 0;
792 }
793
794 int journal_file_find_field_object_with_hash(
795                 JournalFile *f,
796                 const void *field, uint64_t size, uint64_t hash,
797                 Object **ret, uint64_t *offset) {
798
799         uint64_t p, osize, h, m;
800         int r;
801
802         assert(f);
803         assert(field && size > 0);
804
805         osize = offsetof(Object, field.payload) + size;
806
807         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
808
809         if (m <= 0)
810                 return -EBADMSG;
811
812         h = hash % m;
813         p = le64toh(f->field_hash_table[h].head_hash_offset);
814
815         while (p > 0) {
816                 Object *o;
817
818                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
819                 if (r < 0)
820                         return r;
821
822                 if (le64toh(o->field.hash) == hash &&
823                     le64toh(o->object.size) == osize &&
824                     memcmp(o->field.payload, field, size) == 0) {
825
826                         if (ret)
827                                 *ret = o;
828                         if (offset)
829                                 *offset = p;
830
831                         return 1;
832                 }
833
834                 p = le64toh(o->field.next_hash_offset);
835         }
836
837         return 0;
838 }
839
840 int journal_file_find_field_object(
841                 JournalFile *f,
842                 const void *field, uint64_t size,
843                 Object **ret, uint64_t *offset) {
844
845         uint64_t hash;
846
847         assert(f);
848         assert(field && size > 0);
849
850         hash = hash64(field, size);
851
852         return journal_file_find_field_object_with_hash(f,
853                                                         field, size, hash,
854                                                         ret, offset);
855 }
856
857 int journal_file_find_data_object_with_hash(
858                 JournalFile *f,
859                 const void *data, uint64_t size, uint64_t hash,
860                 Object **ret, uint64_t *offset) {
861
862         uint64_t p, osize, h, m;
863         int r;
864
865         assert(f);
866         assert(data || size == 0);
867
868         osize = offsetof(Object, data.payload) + size;
869
870         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
871         if (m <= 0)
872                 return -EBADMSG;
873
874         h = hash % m;
875         p = le64toh(f->data_hash_table[h].head_hash_offset);
876
877         while (p > 0) {
878                 Object *o;
879
880                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
881                 if (r < 0)
882                         return r;
883
884                 if (le64toh(o->data.hash) != hash)
885                         goto next;
886
887                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
888 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
889                         uint64_t l;
890                         size_t rsize;
891
892                         l = le64toh(o->object.size);
893                         if (l <= offsetof(Object, data.payload))
894                                 return -EBADMSG;
895
896                         l -= offsetof(Object, data.payload);
897
898                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
899                                             o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
900                         if (r < 0)
901                                 return r;
902
903                         if (rsize == size &&
904                             memcmp(f->compress_buffer, data, size) == 0) {
905
906                                 if (ret)
907                                         *ret = o;
908
909                                 if (offset)
910                                         *offset = p;
911
912                                 return 1;
913                         }
914 #else
915                         return -EPROTONOSUPPORT;
916 #endif
917                 } else if (le64toh(o->object.size) == osize &&
918                            memcmp(o->data.payload, data, size) == 0) {
919
920                         if (ret)
921                                 *ret = o;
922
923                         if (offset)
924                                 *offset = p;
925
926                         return 1;
927                 }
928
929         next:
930                 p = le64toh(o->data.next_hash_offset);
931         }
932
933         return 0;
934 }
935
936 int journal_file_find_data_object(
937                 JournalFile *f,
938                 const void *data, uint64_t size,
939                 Object **ret, uint64_t *offset) {
940
941         uint64_t hash;
942
943         assert(f);
944         assert(data || size == 0);
945
946         hash = hash64(data, size);
947
948         return journal_file_find_data_object_with_hash(f,
949                                                        data, size, hash,
950                                                        ret, offset);
951 }
952
953 static int journal_file_append_field(
954                 JournalFile *f,
955                 const void *field, uint64_t size,
956                 Object **ret, uint64_t *offset) {
957
958         uint64_t hash, p;
959         uint64_t osize;
960         Object *o;
961         int r;
962
963         assert(f);
964         assert(field && size > 0);
965
966         hash = hash64(field, size);
967
968         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
969         if (r < 0)
970                 return r;
971         else if (r > 0) {
972
973                 if (ret)
974                         *ret = o;
975
976                 if (offset)
977                         *offset = p;
978
979                 return 0;
980         }
981
982         osize = offsetof(Object, field.payload) + size;
983         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
984         if (r < 0)
985                 return r;
986
987         o->field.hash = htole64(hash);
988         memcpy(o->field.payload, field, size);
989
990         r = journal_file_link_field(f, o, p, hash);
991         if (r < 0)
992                 return r;
993
994         /* The linking might have altered the window, so let's
995          * refresh our pointer */
996         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
997         if (r < 0)
998                 return r;
999
1000 #ifdef HAVE_GCRYPT
1001         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1002         if (r < 0)
1003                 return r;
1004 #endif
1005
1006         if (ret)
1007                 *ret = o;
1008
1009         if (offset)
1010                 *offset = p;
1011
1012         return 0;
1013 }
1014
1015 static int journal_file_append_data(
1016                 JournalFile *f,
1017                 const void *data, uint64_t size,
1018                 Object **ret, uint64_t *offset) {
1019
1020         uint64_t hash, p;
1021         uint64_t osize;
1022         Object *o;
1023         int r, compression = 0;
1024         const void *eq;
1025
1026         assert(f);
1027         assert(data || size == 0);
1028
1029         hash = hash64(data, size);
1030
1031         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1032         if (r < 0)
1033                 return r;
1034         else if (r > 0) {
1035
1036                 if (ret)
1037                         *ret = o;
1038
1039                 if (offset)
1040                         *offset = p;
1041
1042                 return 0;
1043         }
1044
1045         osize = offsetof(Object, data.payload) + size;
1046         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1047         if (r < 0)
1048                 return r;
1049
1050         o->data.hash = htole64(hash);
1051
1052 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1053         if (f->compress_xz &&
1054             size >= COMPRESSION_SIZE_THRESHOLD) {
1055                 size_t rsize;
1056
1057                 compression = compress_blob(data, size, o->data.payload, &rsize);
1058
1059                 if (compression) {
1060                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1061                         o->object.flags |= compression;
1062
1063                         log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1064                                   size, rsize, object_compressed_to_string(compression));
1065                 }
1066         }
1067 #endif
1068
1069         if (!compression && size > 0)
1070                 memcpy(o->data.payload, data, size);
1071
1072         r = journal_file_link_data(f, o, p, hash);
1073         if (r < 0)
1074                 return r;
1075
1076         /* The linking might have altered the window, so let's
1077          * refresh our pointer */
1078         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1079         if (r < 0)
1080                 return r;
1081
1082         if (!data)
1083                 eq = NULL;
1084         else
1085                 eq = memchr(data, '=', size);
1086         if (eq && eq > data) {
1087                 Object *fo = NULL;
1088                 uint64_t fp;
1089
1090                 /* Create field object ... */
1091                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1092                 if (r < 0)
1093                         return r;
1094
1095                 /* ... and link it in. */
1096                 o->data.next_field_offset = fo->field.head_data_offset;
1097                 fo->field.head_data_offset = le64toh(p);
1098         }
1099
1100 #ifdef HAVE_GCRYPT
1101         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1102         if (r < 0)
1103                 return r;
1104 #endif
1105
1106         if (ret)
1107                 *ret = o;
1108
1109         if (offset)
1110                 *offset = p;
1111
1112         return 0;
1113 }
1114
1115 uint64_t journal_file_entry_n_items(Object *o) {
1116         assert(o);
1117
1118         if (o->object.type != OBJECT_ENTRY)
1119                 return 0;
1120
1121         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1122 }
1123
1124 uint64_t journal_file_entry_array_n_items(Object *o) {
1125         assert(o);
1126
1127         if (o->object.type != OBJECT_ENTRY_ARRAY)
1128                 return 0;
1129
1130         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1131 }
1132
1133 uint64_t journal_file_hash_table_n_items(Object *o) {
1134         assert(o);
1135
1136         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1137             o->object.type != OBJECT_FIELD_HASH_TABLE)
1138                 return 0;
1139
1140         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1141 }
1142
1143 static int link_entry_into_array(JournalFile *f,
1144                                  le64_t *first,
1145                                  le64_t *idx,
1146                                  uint64_t p) {
1147         int r;
1148         uint64_t n = 0, ap = 0, q, i, a, hidx;
1149         Object *o;
1150
1151         assert(f);
1152         assert(first);
1153         assert(idx);
1154         assert(p > 0);
1155
1156         a = le64toh(*first);
1157         i = hidx = le64toh(*idx);
1158         while (a > 0) {
1159
1160                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1161                 if (r < 0)
1162                         return r;
1163
1164                 n = journal_file_entry_array_n_items(o);
1165                 if (i < n) {
1166                         o->entry_array.items[i] = htole64(p);
1167                         *idx = htole64(hidx + 1);
1168                         return 0;
1169                 }
1170
1171                 i -= n;
1172                 ap = a;
1173                 a = le64toh(o->entry_array.next_entry_array_offset);
1174         }
1175
1176         if (hidx > n)
1177                 n = (hidx+1) * 2;
1178         else
1179                 n = n * 2;
1180
1181         if (n < 4)
1182                 n = 4;
1183
1184         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1185                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1186                                        &o, &q);
1187         if (r < 0)
1188                 return r;
1189
1190 #ifdef HAVE_GCRYPT
1191         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1192         if (r < 0)
1193                 return r;
1194 #endif
1195
1196         o->entry_array.items[i] = htole64(p);
1197
1198         if (ap == 0)
1199                 *first = htole64(q);
1200         else {
1201                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1202                 if (r < 0)
1203                         return r;
1204
1205                 o->entry_array.next_entry_array_offset = htole64(q);
1206         }
1207
1208         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1209                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1210
1211         *idx = htole64(hidx + 1);
1212
1213         return 0;
1214 }
1215
1216 static int link_entry_into_array_plus_one(JournalFile *f,
1217                                           le64_t *extra,
1218                                           le64_t *first,
1219                                           le64_t *idx,
1220                                           uint64_t p) {
1221
1222         int r;
1223
1224         assert(f);
1225         assert(extra);
1226         assert(first);
1227         assert(idx);
1228         assert(p > 0);
1229
1230         if (*idx == 0)
1231                 *extra = htole64(p);
1232         else {
1233                 le64_t i;
1234
1235                 i = htole64(le64toh(*idx) - 1);
1236                 r = link_entry_into_array(f, first, &i, p);
1237                 if (r < 0)
1238                         return r;
1239         }
1240
1241         *idx = htole64(le64toh(*idx) + 1);
1242         return 0;
1243 }
1244
1245 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1246         uint64_t p;
1247         int r;
1248         assert(f);
1249         assert(o);
1250         assert(offset > 0);
1251
1252         p = le64toh(o->entry.items[i].object_offset);
1253         if (p == 0)
1254                 return -EINVAL;
1255
1256         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1257         if (r < 0)
1258                 return r;
1259
1260         return link_entry_into_array_plus_one(f,
1261                                               &o->data.entry_offset,
1262                                               &o->data.entry_array_offset,
1263                                               &o->data.n_entries,
1264                                               offset);
1265 }
1266
1267 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1268         uint64_t n, i;
1269         int r;
1270
1271         assert(f);
1272         assert(o);
1273         assert(offset > 0);
1274
1275         if (o->object.type != OBJECT_ENTRY)
1276                 return -EINVAL;
1277
1278         __sync_synchronize();
1279
1280         /* Link up the entry itself */
1281         r = link_entry_into_array(f,
1282                                   &f->header->entry_array_offset,
1283                                   &f->header->n_entries,
1284                                   offset);
1285         if (r < 0)
1286                 return r;
1287
1288         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1289
1290         if (f->header->head_entry_realtime == 0)
1291                 f->header->head_entry_realtime = o->entry.realtime;
1292
1293         f->header->tail_entry_realtime = o->entry.realtime;
1294         f->header->tail_entry_monotonic = o->entry.monotonic;
1295
1296         f->tail_entry_monotonic_valid = true;
1297
1298         /* Link up the items */
1299         n = journal_file_entry_n_items(o);
1300         for (i = 0; i < n; i++) {
1301                 r = journal_file_link_entry_item(f, o, offset, i);
1302                 if (r < 0)
1303                         return r;
1304         }
1305
1306         return 0;
1307 }
1308
1309 static int journal_file_append_entry_internal(
1310                 JournalFile *f,
1311                 const dual_timestamp *ts,
1312                 uint64_t xor_hash,
1313                 const EntryItem items[], unsigned n_items,
1314                 uint64_t *seqnum,
1315                 Object **ret, uint64_t *offset) {
1316         uint64_t np;
1317         uint64_t osize;
1318         Object *o;
1319         int r;
1320
1321         assert(f);
1322         assert(items || n_items == 0);
1323         assert(ts);
1324
1325         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1326
1327         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1328         if (r < 0)
1329                 return r;
1330
1331         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1332         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1333         o->entry.realtime = htole64(ts->realtime);
1334         o->entry.monotonic = htole64(ts->monotonic);
1335         o->entry.xor_hash = htole64(xor_hash);
1336         o->entry.boot_id = f->header->boot_id;
1337
1338 #ifdef HAVE_GCRYPT
1339         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1340         if (r < 0)
1341                 return r;
1342 #endif
1343
1344         r = journal_file_link_entry(f, o, np);
1345         if (r < 0)
1346                 return r;
1347
1348         if (ret)
1349                 *ret = o;
1350
1351         if (offset)
1352                 *offset = np;
1353
1354         return 0;
1355 }
1356
1357 void journal_file_post_change(JournalFile *f) {
1358         assert(f);
1359
1360         /* inotify() does not receive IN_MODIFY events from file
1361          * accesses done via mmap(). After each access we hence
1362          * trigger IN_MODIFY by truncating the journal file to its
1363          * current size which triggers IN_MODIFY. */
1364
1365         __sync_synchronize();
1366
1367         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1368                 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1369 }
1370
1371 static int entry_item_cmp(const void *_a, const void *_b) {
1372         const EntryItem *a = _a, *b = _b;
1373
1374         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1375                 return -1;
1376         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1377                 return 1;
1378         return 0;
1379 }
1380
1381 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1382         unsigned i;
1383         EntryItem *items;
1384         int r;
1385         uint64_t xor_hash = 0;
1386         struct dual_timestamp _ts;
1387
1388         assert(f);
1389         assert(iovec || n_iovec == 0);
1390
1391         if (!ts) {
1392                 dual_timestamp_get(&_ts);
1393                 ts = &_ts;
1394         }
1395
1396         if (f->tail_entry_monotonic_valid &&
1397             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1398                 return -EINVAL;
1399
1400 #ifdef HAVE_GCRYPT
1401         r = journal_file_maybe_append_tag(f, ts->realtime);
1402         if (r < 0)
1403                 return r;
1404 #endif
1405
1406         /* alloca() can't take 0, hence let's allocate at least one */
1407         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1408
1409         for (i = 0; i < n_iovec; i++) {
1410                 uint64_t p;
1411                 Object *o;
1412
1413                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1414                 if (r < 0)
1415                         return r;
1416
1417                 xor_hash ^= le64toh(o->data.hash);
1418                 items[i].object_offset = htole64(p);
1419                 items[i].hash = o->data.hash;
1420         }
1421
1422         /* Order by the position on disk, in order to improve seek
1423          * times for rotating media. */
1424         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1425
1426         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1427
1428         /* If the memory mapping triggered a SIGBUS then we return an
1429          * IO error and ignore the error code passed down to us, since
1430          * it is very likely just an effect of a nullified replacement
1431          * mapping page */
1432
1433         if (mmap_cache_got_sigbus(f->mmap, f->fd))
1434                 r = -EIO;
1435
1436         journal_file_post_change(f);
1437
1438         return r;
1439 }
1440
/* One cached position within a chain of entry array objects, keyed by
 * the offset of the chain's first array. */
typedef struct ChainCacheItem {
        /* the array at the beginning of the chain */
        uint64_t first;

        /* the cached array */
        uint64_t array;

        /* the first item in the cached array */
        uint64_t begin;

        /* the total number of items in all arrays before this one in the chain */
        uint64_t total;

        /* the last index we looked at, to optimize locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
1448
1449 static void chain_cache_put(
1450                 OrderedHashmap *h,
1451                 ChainCacheItem *ci,
1452                 uint64_t first,
1453                 uint64_t array,
1454                 uint64_t begin,
1455                 uint64_t total,
1456                 uint64_t last_index) {
1457
1458         if (!ci) {
1459                 /* If the chain item to cache for this chain is the
1460                  * first one it's not worth caching anything */
1461                 if (array == first)
1462                         return;
1463
1464                 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1465                         ci = ordered_hashmap_steal_first(h);
1466                         assert(ci);
1467                 } else {
1468                         ci = new(ChainCacheItem, 1);
1469                         if (!ci)
1470                                 return;
1471                 }
1472
1473                 ci->first = first;
1474
1475                 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1476                         free(ci);
1477                         return;
1478                 }
1479         } else
1480                 assert(ci->first == first);
1481
1482         ci->array = array;
1483         ci->begin = begin;
1484         ci->total = total;
1485         ci->last_index = last_index;
1486 }
1487
1488 static int generic_array_get(
1489                 JournalFile *f,
1490                 uint64_t first,
1491                 uint64_t i,
1492                 Object **ret, uint64_t *offset) {
1493
1494         Object *o;
1495         uint64_t p = 0, a, t = 0;
1496         int r;
1497         ChainCacheItem *ci;
1498
1499         assert(f);
1500
1501         a = first;
1502
1503         /* Try the chain cache first */
1504         ci = ordered_hashmap_get(f->chain_cache, &first);
1505         if (ci && i > ci->total) {
1506                 a = ci->array;
1507                 i -= ci->total;
1508                 t = ci->total;
1509         }
1510
1511         while (a > 0) {
1512                 uint64_t k;
1513
1514                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1515                 if (r < 0)
1516                         return r;
1517
1518                 k = journal_file_entry_array_n_items(o);
1519                 if (i < k) {
1520                         p = le64toh(o->entry_array.items[i]);
1521                         goto found;
1522                 }
1523
1524                 i -= k;
1525                 t += k;
1526                 a = le64toh(o->entry_array.next_entry_array_offset);
1527         }
1528
1529         return 0;
1530
1531 found:
1532         /* Let's cache this item for the next invocation */
1533         chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1534
1535         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1536         if (r < 0)
1537                 return r;
1538
1539         if (ret)
1540                 *ret = o;
1541
1542         if (offset)
1543                 *offset = p;
1544
1545         return 1;
1546 }
1547
1548 static int generic_array_get_plus_one(
1549                 JournalFile *f,
1550                 uint64_t extra,
1551                 uint64_t first,
1552                 uint64_t i,
1553                 Object **ret, uint64_t *offset) {
1554
1555         Object *o;
1556
1557         assert(f);
1558
1559         if (i == 0) {
1560                 int r;
1561
1562                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1563                 if (r < 0)
1564                         return r;
1565
1566                 if (ret)
1567                         *ret = o;
1568
1569                 if (offset)
1570                         *offset = extra;
1571
1572                 return 1;
1573         }
1574
1575         return generic_array_get(f, first, i-1, ret, offset);
1576 }
1577
/* Results of the test_object() callbacks used when bisecting */
enum {
        TEST_FOUND = 0,
        TEST_LEFT  = 1,
        TEST_RIGHT = 2
};
1583
1584 static int generic_array_bisect(
1585                 JournalFile *f,
1586                 uint64_t first,
1587                 uint64_t n,
1588                 uint64_t needle,
1589                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1590                 direction_t direction,
1591                 Object **ret,
1592                 uint64_t *offset,
1593                 uint64_t *idx) {
1594
1595         uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1596         bool subtract_one = false;
1597         Object *o, *array = NULL;
1598         int r;
1599         ChainCacheItem *ci;
1600
1601         assert(f);
1602         assert(test_object);
1603
1604         /* Start with the first array in the chain */
1605         a = first;
1606
1607         ci = ordered_hashmap_get(f->chain_cache, &first);
1608         if (ci && n > ci->total) {
1609                 /* Ah, we have iterated this bisection array chain
1610                  * previously! Let's see if we can skip ahead in the
1611                  * chain, as far as the last time. But we can't jump
1612                  * backwards in the chain, so let's check that
1613                  * first. */
1614
1615                 r = test_object(f, ci->begin, needle);
1616                 if (r < 0)
1617                         return r;
1618
1619                 if (r == TEST_LEFT) {
1620                         /* OK, what we are looking for is right of the
1621                          * begin of this EntryArray, so let's jump
1622                          * straight to previously cached array in the
1623                          * chain */
1624
1625                         a = ci->array;
1626                         n -= ci->total;
1627                         t = ci->total;
1628                         last_index = ci->last_index;
1629                 }
1630         }
1631
1632         while (a > 0) {
1633                 uint64_t left, right, k, lp;
1634
1635                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1636                 if (r < 0)
1637                         return r;
1638
1639                 k = journal_file_entry_array_n_items(array);
1640                 right = MIN(k, n);
1641                 if (right <= 0)
1642                         return 0;
1643
1644                 i = right - 1;
1645                 lp = p = le64toh(array->entry_array.items[i]);
1646                 if (p <= 0)
1647                         return -EBADMSG;
1648
1649                 r = test_object(f, p, needle);
1650                 if (r < 0)
1651                         return r;
1652
1653                 if (r == TEST_FOUND)
1654                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1655
1656                 if (r == TEST_RIGHT) {
1657                         left = 0;
1658                         right -= 1;
1659
1660                         if (last_index != (uint64_t) -1) {
1661                                 assert(last_index <= right);
1662
1663                                 /* If we cached the last index we
1664                                  * looked at, let's try to not to jump
1665                                  * too wildly around and see if we can
1666                                  * limit the range to look at early to
1667                                  * the immediate neighbors of the last
1668                                  * index we looked at. */
1669
1670                                 if (last_index > 0) {
1671                                         uint64_t x = last_index - 1;
1672
1673                                         p = le64toh(array->entry_array.items[x]);
1674                                         if (p <= 0)
1675                                                 return -EBADMSG;
1676
1677                                         r = test_object(f, p, needle);
1678                                         if (r < 0)
1679                                                 return r;
1680
1681                                         if (r == TEST_FOUND)
1682                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1683
1684                                         if (r == TEST_RIGHT)
1685                                                 right = x;
1686                                         else
1687                                                 left = x + 1;
1688                                 }
1689
1690                                 if (last_index < right) {
1691                                         uint64_t y = last_index + 1;
1692
1693                                         p = le64toh(array->entry_array.items[y]);
1694                                         if (p <= 0)
1695                                                 return -EBADMSG;
1696
1697                                         r = test_object(f, p, needle);
1698                                         if (r < 0)
1699                                                 return r;
1700
1701                                         if (r == TEST_FOUND)
1702                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1703
1704                                         if (r == TEST_RIGHT)
1705                                                 right = y;
1706                                         else
1707                                                 left = y + 1;
1708                                 }
1709                         }
1710
1711                         for (;;) {
1712                                 if (left == right) {
1713                                         if (direction == DIRECTION_UP)
1714                                                 subtract_one = true;
1715
1716                                         i = left;
1717                                         goto found;
1718                                 }
1719
1720                                 assert(left < right);
1721                                 i = (left + right) / 2;
1722
1723                                 p = le64toh(array->entry_array.items[i]);
1724                                 if (p <= 0)
1725                                         return -EBADMSG;
1726
1727                                 r = test_object(f, p, needle);
1728                                 if (r < 0)
1729                                         return r;
1730
1731                                 if (r == TEST_FOUND)
1732                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1733
1734                                 if (r == TEST_RIGHT)
1735                                         right = i;
1736                                 else
1737                                         left = i + 1;
1738                         }
1739                 }
1740
1741                 if (k >= n) {
1742                         if (direction == DIRECTION_UP) {
1743                                 i = n;
1744                                 subtract_one = true;
1745                                 goto found;
1746                         }
1747
1748                         return 0;
1749                 }
1750
1751                 last_p = lp;
1752
1753                 n -= k;
1754                 t += k;
1755                 last_index = (uint64_t) -1;
1756                 a = le64toh(array->entry_array.next_entry_array_offset);
1757         }
1758
1759         return 0;
1760
1761 found:
1762         if (subtract_one && t == 0 && i == 0)
1763                 return 0;
1764
1765         /* Let's cache this item for the next invocation */
1766         chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1767
1768         if (subtract_one && i == 0)
1769                 p = last_p;
1770         else if (subtract_one)
1771                 p = le64toh(array->entry_array.items[i-1]);
1772         else
1773                 p = le64toh(array->entry_array.items[i]);
1774
1775         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1776         if (r < 0)
1777                 return r;
1778
1779         if (ret)
1780                 *ret = o;
1781
1782         if (offset)
1783                 *offset = p;
1784
1785         if (idx)
1786                 *idx = t + i + (subtract_one ? -1 : 0);
1787
1788         return 1;
1789 }
1790
1791 static int generic_array_bisect_plus_one(
1792                 JournalFile *f,
1793                 uint64_t extra,
1794                 uint64_t first,
1795                 uint64_t n,
1796                 uint64_t needle,
1797                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1798                 direction_t direction,
1799                 Object **ret,
1800                 uint64_t *offset,
1801                 uint64_t *idx) {
1802
1803         int r;
1804         bool step_back = false;
1805         Object *o;
1806
1807         assert(f);
1808         assert(test_object);
1809
1810         if (n <= 0)
1811                 return 0;
1812
1813         /* This bisects the array in object 'first', but first checks
1814          * an extra  */
1815         r = test_object(f, extra, needle);
1816         if (r < 0)
1817                 return r;
1818
1819         if (r == TEST_FOUND)
1820                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1821
1822         /* if we are looking with DIRECTION_UP then we need to first
1823            see if in the actual array there is a matching entry, and
1824            return the last one of that. But if there isn't any we need
1825            to return this one. Hence remember this, and return it
1826            below. */
1827         if (r == TEST_LEFT)
1828                 step_back = direction == DIRECTION_UP;
1829
1830         if (r == TEST_RIGHT) {
1831                 if (direction == DIRECTION_DOWN)
1832                         goto found;
1833                 else
1834                         return 0;
1835         }
1836
1837         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1838
1839         if (r == 0 && step_back)
1840                 goto found;
1841
1842         if (r > 0 && idx)
1843                 (*idx) ++;
1844
1845         return r;
1846
1847 found:
1848         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1849         if (r < 0)
1850                 return r;
1851
1852         if (ret)
1853                 *ret = o;
1854
1855         if (offset)
1856                 *offset = extra;
1857
1858         if (idx)
1859                 *idx = 0;
1860
1861         return 1;
1862 }
1863
1864 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1865         assert(f);
1866         assert(p > 0);
1867
1868         if (p == needle)
1869                 return TEST_FOUND;
1870         else if (p < needle)
1871                 return TEST_LEFT;
1872         else
1873                 return TEST_RIGHT;
1874 }
1875
1876 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1877         Object *o;
1878         int r;
1879
1880         assert(f);
1881         assert(p > 0);
1882
1883         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1884         if (r < 0)
1885                 return r;
1886
1887         if (le64toh(o->entry.seqnum) == needle)
1888                 return TEST_FOUND;
1889         else if (le64toh(o->entry.seqnum) < needle)
1890                 return TEST_LEFT;
1891         else
1892                 return TEST_RIGHT;
1893 }
1894
1895 int journal_file_move_to_entry_by_seqnum(
1896                 JournalFile *f,
1897                 uint64_t seqnum,
1898                 direction_t direction,
1899                 Object **ret,
1900                 uint64_t *offset) {
1901
1902         return generic_array_bisect(f,
1903                                     le64toh(f->header->entry_array_offset),
1904                                     le64toh(f->header->n_entries),
1905                                     seqnum,
1906                                     test_object_seqnum,
1907                                     direction,
1908                                     ret, offset, NULL);
1909 }
1910
1911 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1912         Object *o;
1913         int r;
1914
1915         assert(f);
1916         assert(p > 0);
1917
1918         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1919         if (r < 0)
1920                 return r;
1921
1922         if (le64toh(o->entry.realtime) == needle)
1923                 return TEST_FOUND;
1924         else if (le64toh(o->entry.realtime) < needle)
1925                 return TEST_LEFT;
1926         else
1927                 return TEST_RIGHT;
1928 }
1929
1930 int journal_file_move_to_entry_by_realtime(
1931                 JournalFile *f,
1932                 uint64_t realtime,
1933                 direction_t direction,
1934                 Object **ret,
1935                 uint64_t *offset) {
1936
1937         return generic_array_bisect(f,
1938                                     le64toh(f->header->entry_array_offset),
1939                                     le64toh(f->header->n_entries),
1940                                     realtime,
1941                                     test_object_realtime,
1942                                     direction,
1943                                     ret, offset, NULL);
1944 }
1945
1946 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1947         Object *o;
1948         int r;
1949
1950         assert(f);
1951         assert(p > 0);
1952
1953         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1954         if (r < 0)
1955                 return r;
1956
1957         if (le64toh(o->entry.monotonic) == needle)
1958                 return TEST_FOUND;
1959         else if (le64toh(o->entry.monotonic) < needle)
1960                 return TEST_LEFT;
1961         else
1962                 return TEST_RIGHT;
1963 }
1964
1965 static int find_data_object_by_boot_id(
1966                 JournalFile *f,
1967                 sd_id128_t boot_id,
1968                 Object **o,
1969                 uint64_t *b) {
1970
1971         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1972
1973         sd_id128_to_string(boot_id, t + 9);
1974         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1975 }
1976
1977 int journal_file_move_to_entry_by_monotonic(
1978                 JournalFile *f,
1979                 sd_id128_t boot_id,
1980                 uint64_t monotonic,
1981                 direction_t direction,
1982                 Object **ret,
1983                 uint64_t *offset) {
1984
1985         Object *o;
1986         int r;
1987
1988         assert(f);
1989
1990         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1991         if (r < 0)
1992                 return r;
1993         if (r == 0)
1994                 return -ENOENT;
1995
1996         return generic_array_bisect_plus_one(f,
1997                                              le64toh(o->data.entry_offset),
1998                                              le64toh(o->data.entry_array_offset),
1999                                              le64toh(o->data.n_entries),
2000                                              monotonic,
2001                                              test_object_monotonic,
2002                                              direction,
2003                                              ret, offset, NULL);
2004 }
2005
2006 void journal_file_reset_location(JournalFile *f) {
2007         f->location_type = LOCATION_HEAD;
2008         f->current_offset = 0;
2009         f->current_seqnum = 0;
2010         f->current_realtime = 0;
2011         f->current_monotonic = 0;
2012         zero(f->current_boot_id);
2013         f->current_xor_hash = 0;
2014 }
2015
2016 void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2017         f->location_type = LOCATION_SEEK;
2018         f->current_offset = offset;
2019         f->current_seqnum = le64toh(o->entry.seqnum);
2020         f->current_realtime = le64toh(o->entry.realtime);
2021         f->current_monotonic = le64toh(o->entry.monotonic);
2022         f->current_boot_id = o->entry.boot_id;
2023         f->current_xor_hash = le64toh(o->entry.xor_hash);
2024 }
2025
2026 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2027         assert(af);
2028         assert(bf);
2029         assert(af->location_type == LOCATION_SEEK);
2030         assert(bf->location_type == LOCATION_SEEK);
2031
2032         /* If contents and timestamps match, these entries are
2033          * identical, even if the seqnum does not match */
2034         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2035             af->current_monotonic == bf->current_monotonic &&
2036             af->current_realtime == bf->current_realtime &&
2037             af->current_xor_hash == bf->current_xor_hash)
2038                 return 0;
2039
2040         if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2041
2042                 /* If this is from the same seqnum source, compare
2043                  * seqnums */
2044                 if (af->current_seqnum < bf->current_seqnum)
2045                         return -1;
2046                 if (af->current_seqnum > bf->current_seqnum)
2047                         return 1;
2048
2049                 /* Wow! This is weird, different data but the same
2050                  * seqnums? Something is borked, but let's make the
2051                  * best of it and compare by time. */
2052         }
2053
2054         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2055
2056                 /* If the boot id matches, compare monotonic time */
2057                 if (af->current_monotonic < bf->current_monotonic)
2058                         return -1;
2059                 if (af->current_monotonic > bf->current_monotonic)
2060                         return 1;
2061         }
2062
2063         /* Otherwise, compare UTC time */
2064         if (af->current_realtime < bf->current_realtime)
2065                 return -1;
2066         if (af->current_realtime > bf->current_realtime)
2067                 return 1;
2068
2069         /* Finally, compare by contents */
2070         if (af->current_xor_hash < bf->current_xor_hash)
2071                 return -1;
2072         if (af->current_xor_hash > bf->current_xor_hash)
2073                 return 1;
2074
2075         return 0;
2076 }
2077
2078 int journal_file_next_entry(
2079                 JournalFile *f,
2080                 uint64_t p,
2081                 direction_t direction,
2082                 Object **ret, uint64_t *offset) {
2083
2084         uint64_t i, n, ofs;
2085         int r;
2086
2087         assert(f);
2088
2089         n = le64toh(f->header->n_entries);
2090         if (n <= 0)
2091                 return 0;
2092
2093         if (p == 0)
2094                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2095         else {
2096                 r = generic_array_bisect(f,
2097                                          le64toh(f->header->entry_array_offset),
2098                                          le64toh(f->header->n_entries),
2099                                          p,
2100                                          test_object_offset,
2101                                          DIRECTION_DOWN,
2102                                          NULL, NULL,
2103                                          &i);
2104                 if (r <= 0)
2105                         return r;
2106
2107                 if (direction == DIRECTION_DOWN) {
2108                         if (i >= n - 1)
2109                                 return 0;
2110
2111                         i++;
2112                 } else {
2113                         if (i <= 0)
2114                                 return 0;
2115
2116                         i--;
2117                 }
2118         }
2119
2120         /* And jump to it */
2121         r = generic_array_get(f,
2122                               le64toh(f->header->entry_array_offset),
2123                               i,
2124                               ret, &ofs);
2125         if (r <= 0)
2126                 return r;
2127
2128         if (p > 0 &&
2129             (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2130                 log_debug("%s: entry array corrupted at entry %"PRIu64,
2131                           f->path, i);
2132                 return -EBADMSG;
2133         }
2134
2135         if (offset)
2136                 *offset = ofs;
2137
2138         return 1;
2139 }
2140
2141 int journal_file_next_entry_for_data(
2142                 JournalFile *f,
2143                 Object *o, uint64_t p,
2144                 uint64_t data_offset,
2145                 direction_t direction,
2146                 Object **ret, uint64_t *offset) {
2147
2148         uint64_t n, i;
2149         int r;
2150         Object *d;
2151
2152         assert(f);
2153         assert(p > 0 || !o);
2154
2155         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2156         if (r < 0)
2157                 return r;
2158
2159         n = le64toh(d->data.n_entries);
2160         if (n <= 0)
2161                 return n;
2162
2163         if (!o)
2164                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2165         else {
2166                 if (o->object.type != OBJECT_ENTRY)
2167                         return -EINVAL;
2168
2169                 r = generic_array_bisect_plus_one(f,
2170                                                   le64toh(d->data.entry_offset),
2171                                                   le64toh(d->data.entry_array_offset),
2172                                                   le64toh(d->data.n_entries),
2173                                                   p,
2174                                                   test_object_offset,
2175                                                   DIRECTION_DOWN,
2176                                                   NULL, NULL,
2177                                                   &i);
2178
2179                 if (r <= 0)
2180                         return r;
2181
2182                 if (direction == DIRECTION_DOWN) {
2183                         if (i >= n - 1)
2184                                 return 0;
2185
2186                         i++;
2187                 } else {
2188                         if (i <= 0)
2189                                 return 0;
2190
2191                         i--;
2192                 }
2193
2194         }
2195
2196         return generic_array_get_plus_one(f,
2197                                           le64toh(d->data.entry_offset),
2198                                           le64toh(d->data.entry_array_offset),
2199                                           i,
2200                                           ret, offset);
2201 }
2202
2203 int journal_file_move_to_entry_by_offset_for_data(
2204                 JournalFile *f,
2205                 uint64_t data_offset,
2206                 uint64_t p,
2207                 direction_t direction,
2208                 Object **ret, uint64_t *offset) {
2209
2210         int r;
2211         Object *d;
2212
2213         assert(f);
2214
2215         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2216         if (r < 0)
2217                 return r;
2218
2219         return generic_array_bisect_plus_one(f,
2220                                              le64toh(d->data.entry_offset),
2221                                              le64toh(d->data.entry_array_offset),
2222                                              le64toh(d->data.n_entries),
2223                                              p,
2224                                              test_object_offset,
2225                                              direction,
2226                                              ret, offset, NULL);
2227 }
2228
2229 int journal_file_move_to_entry_by_monotonic_for_data(
2230                 JournalFile *f,
2231                 uint64_t data_offset,
2232                 sd_id128_t boot_id,
2233                 uint64_t monotonic,
2234                 direction_t direction,
2235                 Object **ret, uint64_t *offset) {
2236
2237         Object *o, *d;
2238         int r;
2239         uint64_t b, z;
2240
2241         assert(f);
2242
2243         /* First, seek by time */
2244         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2245         if (r < 0)
2246                 return r;
2247         if (r == 0)
2248                 return -ENOENT;
2249
2250         r = generic_array_bisect_plus_one(f,
2251                                           le64toh(o->data.entry_offset),
2252                                           le64toh(o->data.entry_array_offset),
2253                                           le64toh(o->data.n_entries),
2254                                           monotonic,
2255                                           test_object_monotonic,
2256                                           direction,
2257                                           NULL, &z, NULL);
2258         if (r <= 0)
2259                 return r;
2260
2261         /* And now, continue seeking until we find an entry that
2262          * exists in both bisection arrays */
2263
2264         for (;;) {
2265                 Object *qo;
2266                 uint64_t p, q;
2267
2268                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2269                 if (r < 0)
2270                         return r;
2271
2272                 r = generic_array_bisect_plus_one(f,
2273                                                   le64toh(d->data.entry_offset),
2274                                                   le64toh(d->data.entry_array_offset),
2275                                                   le64toh(d->data.n_entries),
2276                                                   z,
2277                                                   test_object_offset,
2278                                                   direction,
2279                                                   NULL, &p, NULL);
2280                 if (r <= 0)
2281                         return r;
2282
2283                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2284                 if (r < 0)
2285                         return r;
2286
2287                 r = generic_array_bisect_plus_one(f,
2288                                                   le64toh(o->data.entry_offset),
2289                                                   le64toh(o->data.entry_array_offset),
2290                                                   le64toh(o->data.n_entries),
2291                                                   p,
2292                                                   test_object_offset,
2293                                                   direction,
2294                                                   &qo, &q, NULL);
2295
2296                 if (r <= 0)
2297                         return r;
2298
2299                 if (p == q) {
2300                         if (ret)
2301                                 *ret = qo;
2302                         if (offset)
2303                                 *offset = q;
2304
2305                         return 1;
2306                 }
2307
2308                 z = q;
2309         }
2310 }
2311
2312 int journal_file_move_to_entry_by_seqnum_for_data(
2313                 JournalFile *f,
2314                 uint64_t data_offset,
2315                 uint64_t seqnum,
2316                 direction_t direction,
2317                 Object **ret, uint64_t *offset) {
2318
2319         Object *d;
2320         int r;
2321
2322         assert(f);
2323
2324         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2325         if (r < 0)
2326                 return r;
2327
2328         return generic_array_bisect_plus_one(f,
2329                                              le64toh(d->data.entry_offset),
2330                                              le64toh(d->data.entry_array_offset),
2331                                              le64toh(d->data.n_entries),
2332                                              seqnum,
2333                                              test_object_seqnum,
2334                                              direction,
2335                                              ret, offset, NULL);
2336 }
2337
2338 int journal_file_move_to_entry_by_realtime_for_data(
2339                 JournalFile *f,
2340                 uint64_t data_offset,
2341                 uint64_t realtime,
2342                 direction_t direction,
2343                 Object **ret, uint64_t *offset) {
2344
2345         Object *d;
2346         int r;
2347
2348         assert(f);
2349
2350         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2351         if (r < 0)
2352                 return r;
2353
2354         return generic_array_bisect_plus_one(f,
2355                                              le64toh(d->data.entry_offset),
2356                                              le64toh(d->data.entry_array_offset),
2357                                              le64toh(d->data.n_entries),
2358                                              realtime,
2359                                              test_object_realtime,
2360                                              direction,
2361                                              ret, offset, NULL);
2362 }
2363
2364 void journal_file_dump(JournalFile *f) {
2365         Object *o;
2366         int r;
2367         uint64_t p;
2368
2369         assert(f);
2370
2371         journal_file_print_header(f);
2372
2373         p = le64toh(f->header->header_size);
2374         while (p != 0) {
2375                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2376                 if (r < 0)
2377                         goto fail;
2378
2379                 switch (o->object.type) {
2380
2381                 case OBJECT_UNUSED:
2382                         printf("Type: OBJECT_UNUSED\n");
2383                         break;
2384
2385                 case OBJECT_DATA:
2386                         printf("Type: OBJECT_DATA\n");
2387                         break;
2388
2389                 case OBJECT_FIELD:
2390                         printf("Type: OBJECT_FIELD\n");
2391                         break;
2392
2393                 case OBJECT_ENTRY:
2394                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2395                                le64toh(o->entry.seqnum),
2396                                le64toh(o->entry.monotonic),
2397                                le64toh(o->entry.realtime));
2398                         break;
2399
2400                 case OBJECT_FIELD_HASH_TABLE:
2401                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2402                         break;
2403
2404                 case OBJECT_DATA_HASH_TABLE:
2405                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2406                         break;
2407
2408                 case OBJECT_ENTRY_ARRAY:
2409                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2410                         break;
2411
2412                 case OBJECT_TAG:
2413                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2414                                le64toh(o->tag.seqnum),
2415                                le64toh(o->tag.epoch));
2416                         break;
2417
2418                 default:
2419                         printf("Type: unknown (%i)\n", o->object.type);
2420                         break;
2421                 }
2422
2423                 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2424                         printf("Flags: %s\n",
2425                                object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2426
2427                 if (p == le64toh(f->header->tail_object_offset))
2428                         p = 0;
2429                 else
2430                         p = p + ALIGN64(le64toh(o->object.size));
2431         }
2432
2433         return;
2434 fail:
2435         log_error("File corrupt");
2436 }
2437
2438 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2439         const char *x;
2440
2441         x = format_timestamp(buf, l, t);
2442         if (x)
2443                 return x;
2444         return " --- ";
2445 }
2446
2447 void journal_file_print_header(JournalFile *f) {
2448         char a[33], b[33], c[33], d[33];
2449         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2450         struct stat st;
2451         char bytes[FORMAT_BYTES_MAX];
2452
2453         assert(f);
2454
2455         printf("File Path: %s\n"
2456                "File ID: %s\n"
2457                "Machine ID: %s\n"
2458                "Boot ID: %s\n"
2459                "Sequential Number ID: %s\n"
2460                "State: %s\n"
2461                "Compatible Flags:%s%s\n"
2462                "Incompatible Flags:%s%s%s\n"
2463                "Header size: %"PRIu64"\n"
2464                "Arena size: %"PRIu64"\n"
2465                "Data Hash Table Size: %"PRIu64"\n"
2466                "Field Hash Table Size: %"PRIu64"\n"
2467                "Rotate Suggested: %s\n"
2468                "Head Sequential Number: %"PRIu64"\n"
2469                "Tail Sequential Number: %"PRIu64"\n"
2470                "Head Realtime Timestamp: %s\n"
2471                "Tail Realtime Timestamp: %s\n"
2472                "Tail Monotonic Timestamp: %s\n"
2473                "Objects: %"PRIu64"\n"
2474                "Entry Objects: %"PRIu64"\n",
2475                f->path,
2476                sd_id128_to_string(f->header->file_id, a),
2477                sd_id128_to_string(f->header->machine_id, b),
2478                sd_id128_to_string(f->header->boot_id, c),
2479                sd_id128_to_string(f->header->seqnum_id, d),
2480                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2481                f->header->state == STATE_ONLINE ? "ONLINE" :
2482                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2483                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2484                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2485                JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2486                JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2487                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2488                le64toh(f->header->header_size),
2489                le64toh(f->header->arena_size),
2490                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2491                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2492                yes_no(journal_file_rotate_suggested(f, 0)),
2493                le64toh(f->header->head_entry_seqnum),
2494                le64toh(f->header->tail_entry_seqnum),
2495                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2496                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2497                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2498                le64toh(f->header->n_objects),
2499                le64toh(f->header->n_entries));
2500
2501         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2502                 printf("Data Objects: %"PRIu64"\n"
2503                        "Data Hash Table Fill: %.1f%%\n",
2504                        le64toh(f->header->n_data),
2505                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2506
2507         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2508                 printf("Field Objects: %"PRIu64"\n"
2509                        "Field Hash Table Fill: %.1f%%\n",
2510                        le64toh(f->header->n_fields),
2511                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2512
2513         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2514                 printf("Tag Objects: %"PRIu64"\n",
2515                        le64toh(f->header->n_tags));
2516         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2517                 printf("Entry Array Objects: %"PRIu64"\n",
2518                        le64toh(f->header->n_entry_arrays));
2519
2520         if (fstat(f->fd, &st) >= 0)
2521                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2522 }
2523
2524 int journal_file_open(
2525                 const char *fname,
2526                 int flags,
2527                 mode_t mode,
2528                 bool compress,
2529                 bool seal,
2530                 JournalMetrics *metrics,
2531                 MMapCache *mmap_cache,
2532                 JournalFile *template,
2533                 JournalFile **ret) {
2534
2535         bool newly_created = false;
2536         JournalFile *f;
2537         void *h;
2538         int r;
2539
2540         assert(fname);
2541         assert(ret);
2542
2543         if ((flags & O_ACCMODE) != O_RDONLY &&
2544             (flags & O_ACCMODE) != O_RDWR)
2545                 return -EINVAL;
2546
2547         if (!endswith(fname, ".journal") &&
2548             !endswith(fname, ".journal~"))
2549                 return -EINVAL;
2550
2551         f = new0(JournalFile, 1);
2552         if (!f)
2553                 return -ENOMEM;
2554
2555         f->fd = -1;
2556         f->mode = mode;
2557
2558         f->flags = flags;
2559         f->prot = prot_from_flags(flags);
2560         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2561 #if defined(HAVE_LZ4)
2562         f->compress_lz4 = compress;
2563 #elif defined(HAVE_XZ)
2564         f->compress_xz = compress;
2565 #endif
2566 #ifdef HAVE_GCRYPT
2567         f->seal = seal;
2568 #endif
2569
2570         if (mmap_cache)
2571                 f->mmap = mmap_cache_ref(mmap_cache);
2572         else {
2573                 f->mmap = mmap_cache_new();
2574                 if (!f->mmap) {
2575                         r = -ENOMEM;
2576                         goto fail;
2577                 }
2578         }
2579
2580         f->path = strdup(fname);
2581         if (!f->path) {
2582                 r = -ENOMEM;
2583                 goto fail;
2584         }
2585
2586         f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2587         if (!f->chain_cache) {
2588                 r = -ENOMEM;
2589                 goto fail;
2590         }
2591
2592         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2593         if (f->fd < 0) {
2594                 r = -errno;
2595                 goto fail;
2596         }
2597
2598         r = journal_file_fstat(f);
2599         if (r < 0)
2600                 goto fail;
2601
2602         if (f->last_stat.st_size == 0 && f->writable) {
2603
2604                 /* Before we write anything, turn off COW logic. Given
2605                  * our write pattern that is quite unfriendly to COW
2606                  * file systems this should greatly improve
2607                  * performance on COW file systems, such as btrfs, at
2608                  * the expense of data integrity features (which
2609                  * shouldn't be too bad, given that we do our own
2610                  * checksumming). */
2611                 r = chattr_fd(f->fd, true, FS_NOCOW_FL);
2612                 if (r < 0 && r != -ENOTTY)
2613                         log_warning_errno(r, "Failed to set file attributes: %m");
2614
2615                 /* Let's attach the creation time to the journal file,
2616                  * so that the vacuuming code knows the age of this
2617                  * file even if the file might end up corrupted one
2618                  * day... Ideally we'd just use the creation time many
2619                  * file systems maintain for each file, but there is
2620                  * currently no usable API to query this, hence let's
2621                  * emulate this via extended attributes. If extended
2622                  * attributes are not supported we'll just skip this,
2623                  * and rely solely on mtime/atime/ctime of the file. */
2624
2625                 fd_setcrtime(f->fd, 0);
2626
2627 #ifdef HAVE_GCRYPT
2628                 /* Try to load the FSPRG state, and if we can't, then
2629                  * just don't do sealing */
2630                 if (f->seal) {
2631                         r = journal_file_fss_load(f);
2632                         if (r < 0)
2633                                 f->seal = false;
2634                 }
2635 #endif
2636
2637                 r = journal_file_init_header(f, template);
2638                 if (r < 0)
2639                         goto fail;
2640
2641                 r = journal_file_fstat(f);
2642                 if (r < 0)
2643                         goto fail;
2644
2645                 newly_created = true;
2646         }
2647
2648         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2649                 r = -EIO;
2650                 goto fail;
2651         }
2652
2653         r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2654         if (r < 0) {
2655                 r = -errno;
2656                 goto fail;
2657         }
2658
2659         f->header = h;
2660
2661         if (!newly_created) {
2662                 r = journal_file_verify_header(f);
2663                 if (r < 0)
2664                         goto fail;
2665         }
2666
2667 #ifdef HAVE_GCRYPT
2668         if (!newly_created && f->writable) {
2669                 r = journal_file_fss_load(f);
2670                 if (r < 0)
2671                         goto fail;
2672         }
2673 #endif
2674
2675         if (f->writable) {
2676                 if (metrics) {
2677                         journal_default_metrics(metrics, f->fd);
2678                         f->metrics = *metrics;
2679                 } else if (template)
2680                         f->metrics = template->metrics;
2681
2682                 r = journal_file_refresh_header(f);
2683                 if (r < 0)
2684                         goto fail;
2685         }
2686
2687 #ifdef HAVE_GCRYPT
2688         r = journal_file_hmac_setup(f);
2689         if (r < 0)
2690                 goto fail;
2691 #endif
2692
2693         if (newly_created) {
2694                 r = journal_file_setup_field_hash_table(f);
2695                 if (r < 0)
2696                         goto fail;
2697
2698                 r = journal_file_setup_data_hash_table(f);
2699                 if (r < 0)
2700                         goto fail;
2701
2702 #ifdef HAVE_GCRYPT
2703                 r = journal_file_append_first_tag(f);
2704                 if (r < 0)
2705                         goto fail;
2706 #endif
2707         }
2708
2709         r = journal_file_map_field_hash_table(f);
2710         if (r < 0)
2711                 goto fail;
2712
2713         r = journal_file_map_data_hash_table(f);
2714         if (r < 0)
2715                 goto fail;
2716
2717         if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2718                 r = -EIO;
2719                 goto fail;
2720         }
2721
2722         *ret = f;
2723         return 0;
2724
2725 fail:
2726         if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2727                 r = -EIO;
2728
2729         journal_file_close(f);
2730
2731         return r;
2732 }
2733
2734 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2735         _cleanup_free_ char *p = NULL;
2736         size_t l;
2737         JournalFile *old_file, *new_file = NULL;
2738         int r;
2739
2740         assert(f);
2741         assert(*f);
2742
2743         old_file = *f;
2744
2745         if (!old_file->writable)
2746                 return -EINVAL;
2747
2748         if (!endswith(old_file->path, ".journal"))
2749                 return -EINVAL;
2750
2751         l = strlen(old_file->path);
2752         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2753                      (int) l - 8, old_file->path,
2754                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2755                      le64toh((*f)->header->head_entry_seqnum),
2756                      le64toh((*f)->header->head_entry_realtime));
2757         if (r < 0)
2758                 return -ENOMEM;
2759
2760         /* Try to rename the file to the archived version. If the file
2761          * already was deleted, we'll get ENOENT, let's ignore that
2762          * case. */
2763         r = rename(old_file->path, p);
2764         if (r < 0 && errno != ENOENT)
2765                 return -errno;
2766
2767         old_file->header->state = STATE_ARCHIVED;
2768
2769         /* Currently, btrfs is not very good with out write patterns
2770          * and fragments heavily. Let's defrag our journal files when
2771          * we archive them */
2772         old_file->defrag_on_close = true;
2773
2774         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2775         journal_file_close(old_file);
2776
2777         *f = new_file;
2778         return r;
2779 }
2780
2781 int journal_file_open_reliably(
2782                 const char *fname,
2783                 int flags,
2784                 mode_t mode,
2785                 bool compress,
2786                 bool seal,
2787                 JournalMetrics *metrics,
2788                 MMapCache *mmap_cache,
2789                 JournalFile *template,
2790                 JournalFile **ret) {
2791
2792         int r;
2793         size_t l;
2794         _cleanup_free_ char *p = NULL;
2795
2796         r = journal_file_open(fname, flags, mode, compress, seal,
2797                               metrics, mmap_cache, template, ret);
2798         if (r != -EBADMSG && /* corrupted */
2799             r != -ENODATA && /* truncated */
2800             r != -EHOSTDOWN && /* other machine */
2801             r != -EPROTONOSUPPORT && /* incompatible feature */
2802             r != -EBUSY && /* unclean shutdown */
2803             r != -ESHUTDOWN && /* already archived */
2804             r != -EIO && /* IO error, including SIGBUS on mmap */
2805             r != -EIDRM /* File has been deleted */)
2806                 return r;
2807
2808         if ((flags & O_ACCMODE) == O_RDONLY)
2809                 return r;
2810
2811         if (!(flags & O_CREAT))
2812                 return r;
2813
2814         if (!endswith(fname, ".journal"))
2815                 return r;
2816
2817         /* The file is corrupted. Rotate it away and try it again (but only once) */
2818
2819         l = strlen(fname);
2820         if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2821                      (int) l - 8, fname,
2822                      (unsigned long long) now(CLOCK_REALTIME),
2823                      random_u64()) < 0)
2824                 return -ENOMEM;
2825
2826         r = rename(fname, p);
2827         if (r < 0)
2828                 return -errno;
2829
2830         /* btrfs doesn't cope well with our write pattern and
2831          * fragments heavily. Let's defrag all files we rotate */
2832
2833         (void) chattr_path(p, false, FS_NOCOW_FL);
2834         (void) btrfs_defrag(p);
2835
2836         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2837
2838         return journal_file_open(fname, flags, mode, compress, seal,
2839                                  metrics, mmap_cache, template, ret);
2840 }
2841
2842 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2843         uint64_t i, n;
2844         uint64_t q, xor_hash = 0;
2845         int r;
2846         EntryItem *items;
2847         dual_timestamp ts;
2848
2849         assert(from);
2850         assert(to);
2851         assert(o);
2852         assert(p);
2853
2854         if (!to->writable)
2855                 return -EPERM;
2856
2857         ts.monotonic = le64toh(o->entry.monotonic);
2858         ts.realtime = le64toh(o->entry.realtime);
2859
2860         n = journal_file_entry_n_items(o);
2861         /* alloca() can't take 0, hence let's allocate at least one */
2862         items = alloca(sizeof(EntryItem) * MAX(1u, n));
2863
2864         for (i = 0; i < n; i++) {
2865                 uint64_t l, h;
2866                 le64_t le_hash;
2867                 size_t t;
2868                 void *data;
2869                 Object *u;
2870
2871                 q = le64toh(o->entry.items[i].object_offset);
2872                 le_hash = o->entry.items[i].hash;
2873
2874                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2875                 if (r < 0)
2876                         return r;
2877
2878                 if (le_hash != o->data.hash)
2879                         return -EBADMSG;
2880
2881                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2882                 t = (size_t) l;
2883
2884                 /* We hit the limit on 32bit machines */
2885                 if ((uint64_t) t != l)
2886                         return -E2BIG;
2887
2888                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2889 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2890                         size_t rsize;
2891
2892                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2893                                             o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2894                         if (r < 0)
2895                                 return r;
2896
2897                         data = from->compress_buffer;
2898                         l = rsize;
2899 #else
2900                         return -EPROTONOSUPPORT;
2901 #endif
2902                 } else
2903                         data = o->data.payload;
2904
2905                 r = journal_file_append_data(to, data, l, &u, &h);
2906                 if (r < 0)
2907                         return r;
2908
2909                 xor_hash ^= le64toh(u->data.hash);
2910                 items[i].object_offset = htole64(h);
2911                 items[i].hash = u->data.hash;
2912
2913                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2914                 if (r < 0)
2915                         return r;
2916         }
2917
2918         r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2919
2920         if (mmap_cache_got_sigbus(to->mmap, to->fd))
2921                 return -EIO;
2922
2923         return r;
2924 }
2925
2926 void journal_default_metrics(JournalMetrics *m, int fd) {
2927         uint64_t fs_size = 0;
2928         struct statvfs ss;
2929         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2930
2931         assert(m);
2932         assert(fd >= 0);
2933
2934         if (fstatvfs(fd, &ss) >= 0)
2935                 fs_size = ss.f_frsize * ss.f_blocks;
2936
2937         if (m->max_use == (uint64_t) -1) {
2938
2939                 if (fs_size > 0) {
2940                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2941
2942                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2943                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2944
2945                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2946                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2947                 } else
2948                         m->max_use = DEFAULT_MAX_USE_LOWER;
2949         } else {
2950                 m->max_use = PAGE_ALIGN(m->max_use);
2951
2952                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2953                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2954         }
2955
2956         if (m->max_size == (uint64_t) -1) {
2957                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2958
2959                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2960                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2961         } else
2962                 m->max_size = PAGE_ALIGN(m->max_size);
2963
2964         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2965                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2966
2967         if (m->max_size*2 > m->max_use)
2968                 m->max_use = m->max_size*2;
2969
2970         if (m->min_size == (uint64_t) -1)
2971                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2972         else {
2973                 m->min_size = PAGE_ALIGN(m->min_size);
2974
2975                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2976                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2977
2978                 if (m->min_size > m->max_size)
2979                         m->max_size = m->min_size;
2980         }
2981
2982         if (m->keep_free == (uint64_t) -1) {
2983
2984                 if (fs_size > 0) {
2985                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2986
2987                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2988                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2989
2990                 } else
2991                         m->keep_free = DEFAULT_KEEP_FREE;
2992         }
2993
2994         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2995                   format_bytes(a, sizeof(a), m->max_use),
2996                   format_bytes(b, sizeof(b), m->max_size),
2997                   format_bytes(c, sizeof(c), m->min_size),
2998                   format_bytes(d, sizeof(d), m->keep_free));
2999 }
3000
3001 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3002         assert(f);
3003         assert(from || to);
3004
3005         if (from) {
3006                 if (f->header->head_entry_realtime == 0)
3007                         return -ENOENT;
3008
3009                 *from = le64toh(f->header->head_entry_realtime);
3010         }
3011
3012         if (to) {
3013                 if (f->header->tail_entry_realtime == 0)
3014                         return -ENOENT;
3015
3016                 *to = le64toh(f->header->tail_entry_realtime);
3017         }
3018
3019         return 1;
3020 }
3021
3022 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3023         Object *o;
3024         uint64_t p;
3025         int r;
3026
3027         assert(f);
3028         assert(from || to);
3029
3030         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3031         if (r <= 0)
3032                 return r;
3033
3034         if (le64toh(o->data.n_entries) <= 0)
3035                 return 0;
3036
3037         if (from) {
3038                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3039                 if (r < 0)
3040                         return r;
3041
3042                 *from = le64toh(o->entry.monotonic);
3043         }
3044
3045         if (to) {
3046                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3047                 if (r < 0)
3048                         return r;
3049
3050                 r = generic_array_get_plus_one(f,
3051                                                le64toh(o->data.entry_offset),
3052                                                le64toh(o->data.entry_array_offset),
3053                                                le64toh(o->data.n_entries)-1,
3054                                                &o, NULL);
3055                 if (r <= 0)
3056                         return r;
3057
3058                 *to = le64toh(o->entry.monotonic);
3059         }
3060
3061         return 1;
3062 }
3063
3064 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3065         assert(f);
3066
3067         /* If we gained new header fields we gained new features,
3068          * hence suggest a rotation */
3069         if (le64toh(f->header->header_size) < sizeof(Header)) {
3070                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3071                 return true;
3072         }
3073
3074         /* Let's check if the hash tables grew over a certain fill
3075          * level (75%, borrowing this value from Java's hash table
3076          * implementation), and if so suggest a rotation. To calculate
3077          * the fill level we need the n_data field, which only exists
3078          * in newer versions. */
3079
3080         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3081                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3082                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3083                                   f->path,
3084                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3085                                   le64toh(f->header->n_data),
3086                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3087                                   (unsigned long long) f->last_stat.st_size,
3088                                   f->last_stat.st_size / le64toh(f->header->n_data));
3089                         return true;
3090                 }
3091
3092         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3093                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3094                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3095                                   f->path,
3096                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3097                                   le64toh(f->header->n_fields),
3098                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3099                         return true;
3100                 }
3101
3102         /* Are the data objects properly indexed by field objects? */
3103         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3104             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3105             le64toh(f->header->n_data) > 0 &&
3106             le64toh(f->header->n_fields) == 0)
3107                 return true;
3108
3109         if (max_file_usec > 0) {
3110                 usec_t t, h;
3111
3112                 h = le64toh(f->header->head_entry_realtime);
3113                 t = now(CLOCK_REALTIME);
3114
3115                 if (h > 0 && t > h + max_file_usec)
3116                         return true;
3117         }
3118
3119         return false;
3120 }