chiark / gitweb /
remove unused includes
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29 #include <linux/fs.h>
30
31 #include "btrfs-util.h"
32 #include "journal-def.h"
33 #include "journal-file.h"
34 #include "journal-authenticate.h"
35 #include "lookup3.h"
36 #include "compress.h"
37
/* Default sizes, in bytes, of the data and field hash tables */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* Smallest size a journal file may have */
#define JOURNAL_FILE_SIZE_MIN (4ULL << 20)                     /* 4 MiB */

/* Lower and upper bounds applied when the max_use value is deduced
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL << 20)                     /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL << 30)                     /* 4 GiB */

/* Upper bound applied when max_size is deduced from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL << 20)                  /* 128 MiB */

/* Upper bound applied when the keep_free value is deduced from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL << 30)                   /* 4 GiB */

/* keep_free value used when the file system size cannot be
 * determined */
#define DEFAULT_KEEP_FREE (1ULL << 20)                         /* 1 MiB */

/* n_data was the first field added to the header after the initial
 * file format design, hence everything before it is mandatory */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* Maximum number of entries kept in the entry array chain cache */
#define CHAIN_CACHE_MAX 20

/* Granularity by which the journal file is grown whenever new space
 * has to be allocated */
#define FILE_SIZE_INCREASE (8ULL << 20)                        /* 8 MiB */

/* Re-run fstat() on the file at least this often, to detect deletions */
#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)

/* The mmap context used for the header: one above the last defined
 * object type */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX
76
77 static int journal_file_set_online(JournalFile *f) {
78         assert(f);
79
80         if (!f->writable)
81                 return -EPERM;
82
83         if (!(f->fd >= 0 && f->header))
84                 return -EINVAL;
85
86         if (mmap_cache_got_sigbus(f->mmap, f->fd))
87                 return -EIO;
88
89         switch(f->header->state) {
90                 case STATE_ONLINE:
91                         return 0;
92
93                 case STATE_OFFLINE:
94                         f->header->state = STATE_ONLINE;
95                         fsync(f->fd);
96                         return 0;
97
98                 default:
99                         return -EINVAL;
100         }
101 }
102
103 int journal_file_set_offline(JournalFile *f) {
104         assert(f);
105
106         if (!f->writable)
107                 return -EPERM;
108
109         if (!(f->fd >= 0 && f->header))
110                 return -EINVAL;
111
112         if (f->header->state != STATE_ONLINE)
113                 return 0;
114
115         fsync(f->fd);
116
117         if (mmap_cache_got_sigbus(f->mmap, f->fd))
118                 return -EIO;
119
120         f->header->state = STATE_OFFLINE;
121
122         if (mmap_cache_got_sigbus(f->mmap, f->fd))
123                 return -EIO;
124
125         fsync(f->fd);
126
127         return 0;
128 }
129
130 void journal_file_close(JournalFile *f) {
131         assert(f);
132
133 #ifdef HAVE_GCRYPT
134         /* Write the final tag */
135         if (f->seal && f->writable)
136                 journal_file_append_tag(f);
137 #endif
138
139         journal_file_set_offline(f);
140
141         if (f->mmap && f->fd >= 0)
142                 mmap_cache_close_fd(f->mmap, f->fd);
143
144         if (f->fd >= 0 && f->defrag_on_close) {
145
146                 /* Be friendly to btrfs: turn COW back on again now,
147                  * and defragment the file. We won't write to the file
148                  * ever again, hence remove all fragmentation, and
149                  * reenable all the good bits COW usually provides
150                  * (such as data checksumming). */
151
152                 (void) chattr_fd(f->fd, false, FS_NOCOW_FL);
153                 (void) btrfs_defrag_fd(f->fd);
154         }
155
156         safe_close(f->fd);
157         free(f->path);
158
159         if (f->mmap)
160                 mmap_cache_unref(f->mmap);
161
162         ordered_hashmap_free_free(f->chain_cache);
163
164 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
165         free(f->compress_buffer);
166 #endif
167
168 #ifdef HAVE_GCRYPT
169         if (f->fss_file)
170                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
171         else if (f->fsprg_state)
172                 free(f->fsprg_state);
173
174         free(f->fsprg_seed);
175
176         if (f->hmac)
177                 gcry_md_close(f->hmac);
178 #endif
179
180         free(f);
181 }
182
183 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
184         Header h = {};
185         ssize_t k;
186         int r;
187
188         assert(f);
189
190         memcpy(h.signature, HEADER_SIGNATURE, 8);
191         h.header_size = htole64(ALIGN64(sizeof(h)));
192
193         h.incompatible_flags |= htole32(
194                 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
195                 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
196
197         h.compatible_flags = htole32(
198                 f->seal * HEADER_COMPATIBLE_SEALED);
199
200         r = sd_id128_randomize(&h.file_id);
201         if (r < 0)
202                 return r;
203
204         if (template) {
205                 h.seqnum_id = template->header->seqnum_id;
206                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
207         } else
208                 h.seqnum_id = h.file_id;
209
210         k = pwrite(f->fd, &h, sizeof(h), 0);
211         if (k < 0)
212                 return -errno;
213
214         if (k != sizeof(h))
215                 return -EIO;
216
217         return 0;
218 }
219
220 static int journal_file_refresh_header(JournalFile *f) {
221         sd_id128_t boot_id;
222         int r;
223
224         assert(f);
225
226         r = sd_id128_get_machine(&f->header->machine_id);
227         if (r < 0)
228                 return r;
229
230         r = sd_id128_get_boot(&boot_id);
231         if (r < 0)
232                 return r;
233
234         if (sd_id128_equal(boot_id, f->header->boot_id))
235                 f->tail_entry_monotonic_valid = true;
236
237         f->header->boot_id = boot_id;
238
239         r = journal_file_set_online(f);
240
241         /* Sync the online state to disk */
242         fsync(f->fd);
243
244         return r;
245 }
246
247 static int journal_file_verify_header(JournalFile *f) {
248         uint32_t flags;
249
250         assert(f);
251
252         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
253                 return -EBADMSG;
254
255         /* In both read and write mode we refuse to open files with
256          * incompatible flags we don't know */
257         flags = le32toh(f->header->incompatible_flags);
258         if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
259                 if (flags & ~HEADER_INCOMPATIBLE_ANY)
260                         log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
261                                   f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
262                 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
263                 if (flags)
264                         log_debug("Journal file %s uses incompatible flags %"PRIx32
265                                   " disabled at compilation time.", f->path, flags);
266                 return -EPROTONOSUPPORT;
267         }
268
269         /* When open for writing we refuse to open files with
270          * compatible flags, too */
271         flags = le32toh(f->header->compatible_flags);
272         if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
273                 if (flags & ~HEADER_COMPATIBLE_ANY)
274                         log_debug("Journal file %s has unknown compatible flags %"PRIx32,
275                                   f->path, flags & ~HEADER_COMPATIBLE_ANY);
276                 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
277                 if (flags)
278                         log_debug("Journal file %s uses compatible flags %"PRIx32
279                                   " disabled at compilation time.", f->path, flags);
280                 return -EPROTONOSUPPORT;
281         }
282
283         if (f->header->state >= _STATE_MAX)
284                 return -EBADMSG;
285
286         /* The first addition was n_data, so check that we are at least this large */
287         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
288                 return -EBADMSG;
289
290         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
291                 return -EBADMSG;
292
293         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
294                 return -ENODATA;
295
296         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
297                 return -ENODATA;
298
299         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
300             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
301             !VALID64(le64toh(f->header->tail_object_offset)) ||
302             !VALID64(le64toh(f->header->entry_array_offset)))
303                 return -ENODATA;
304
305         if (f->writable) {
306                 uint8_t state;
307                 sd_id128_t machine_id;
308                 int r;
309
310                 r = sd_id128_get_machine(&machine_id);
311                 if (r < 0)
312                         return r;
313
314                 if (!sd_id128_equal(machine_id, f->header->machine_id))
315                         return -EHOSTDOWN;
316
317                 state = f->header->state;
318
319                 if (state == STATE_ONLINE) {
320                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
321                         return -EBUSY;
322                 } else if (state == STATE_ARCHIVED)
323                         return -ESHUTDOWN;
324                 else if (state != STATE_OFFLINE) {
325                         log_debug("Journal file %s has unknown state %i.", f->path, state);
326                         return -EBUSY;
327                 }
328         }
329
330         f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
331         f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
332
333         f->seal = JOURNAL_HEADER_SEALED(f->header);
334
335         return 0;
336 }
337
338 static int journal_file_fstat(JournalFile *f) {
339         assert(f);
340         assert(f->fd >= 0);
341
342         if (fstat(f->fd, &f->last_stat) < 0)
343                 return -errno;
344
345         f->last_stat_usec = now(CLOCK_MONOTONIC);
346
347         /* Refuse appending to files that are already deleted */
348         if (f->last_stat.st_nlink <= 0)
349                 return -EIDRM;
350
351         return 0;
352 }
353
354 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
355         uint64_t old_size, new_size;
356         int r;
357
358         assert(f);
359
360         /* We assume that this file is not sparse, and we know that
361          * for sure, since we always call posix_fallocate()
362          * ourselves */
363
364         if (mmap_cache_got_sigbus(f->mmap, f->fd))
365                 return -EIO;
366
367         old_size =
368                 le64toh(f->header->header_size) +
369                 le64toh(f->header->arena_size);
370
371         new_size = PAGE_ALIGN(offset + size);
372         if (new_size < le64toh(f->header->header_size))
373                 new_size = le64toh(f->header->header_size);
374
375         if (new_size <= old_size) {
376
377                 /* We already pre-allocated enough space, but before
378                  * we write to it, let's check with fstat() if the
379                  * file got deleted, in order make sure we don't throw
380                  * away the data immediately. Don't check fstat() for
381                  * all writes though, but only once ever 10s. */
382
383                 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
384                         return 0;
385
386                 return journal_file_fstat(f);
387         }
388
389         /* Allocate more space. */
390
391         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
392                 return -E2BIG;
393
394         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
395                 struct statvfs svfs;
396
397                 if (fstatvfs(f->fd, &svfs) >= 0) {
398                         uint64_t available;
399
400                         available = svfs.f_bfree * svfs.f_bsize;
401
402                         if (available >= f->metrics.keep_free)
403                                 available -= f->metrics.keep_free;
404                         else
405                                 available = 0;
406
407                         if (new_size - old_size > available)
408                                 return -E2BIG;
409                 }
410         }
411
412         /* Increase by larger blocks at once */
413         new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
414         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
415                 new_size = f->metrics.max_size;
416
417         /* Note that the glibc fallocate() fallback is very
418            inefficient, hence we try to minimize the allocation area
419            as we can. */
420         r = posix_fallocate(f->fd, old_size, new_size - old_size);
421         if (r != 0)
422                 return -r;
423
424         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
425
426         return journal_file_fstat(f);
427 }
428
429 static unsigned type_to_context(ObjectType type) {
430         /* One context for each type, plus one catch-all for the rest */
431         assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
432         assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
433         return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
434 }
435
436 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
437         int r;
438
439         assert(f);
440         assert(ret);
441
442         if (size <= 0)
443                 return -EINVAL;
444
445         /* Avoid SIGBUS on invalid accesses */
446         if (offset + size > (uint64_t) f->last_stat.st_size) {
447                 /* Hmm, out of range? Let's refresh the fstat() data
448                  * first, before we trust that check. */
449
450                 r = journal_file_fstat(f);
451                 if (r < 0)
452                         return r;
453
454                 if (offset + size > (uint64_t) f->last_stat.st_size)
455                         return -EADDRNOTAVAIL;
456         }
457
458         return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
459 }
460
461 static uint64_t minimum_header_size(Object *o) {
462
463         static const uint64_t table[] = {
464                 [OBJECT_DATA] = sizeof(DataObject),
465                 [OBJECT_FIELD] = sizeof(FieldObject),
466                 [OBJECT_ENTRY] = sizeof(EntryObject),
467                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
468                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
469                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
470                 [OBJECT_TAG] = sizeof(TagObject),
471         };
472
473         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
474                 return sizeof(ObjectHeader);
475
476         return table[o->object.type];
477 }
478
479 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
480         int r;
481         void *t;
482         Object *o;
483         uint64_t s;
484
485         assert(f);
486         assert(ret);
487
488         /* Objects may only be located at multiple of 64 bit */
489         if (!VALID64(offset))
490                 return -EFAULT;
491
492         r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
493         if (r < 0)
494                 return r;
495
496         o = (Object*) t;
497         s = le64toh(o->object.size);
498
499         if (s < sizeof(ObjectHeader))
500                 return -EBADMSG;
501
502         if (o->object.type <= OBJECT_UNUSED)
503                 return -EBADMSG;
504
505         if (s < minimum_header_size(o))
506                 return -EBADMSG;
507
508         if (type > OBJECT_UNUSED && o->object.type != type)
509                 return -EBADMSG;
510
511         if (s > sizeof(ObjectHeader)) {
512                 r = journal_file_move_to(f, type, false, offset, s, &t);
513                 if (r < 0)
514                         return r;
515
516                 o = (Object*) t;
517         }
518
519         *ret = o;
520         return 0;
521 }
522
523 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
524         uint64_t r;
525
526         assert(f);
527
528         r = le64toh(f->header->tail_entry_seqnum) + 1;
529
530         if (seqnum) {
531                 /* If an external seqnum counter was passed, we update
532                  * both the local and the external one, and set it to
533                  * the maximum of both */
534
535                 if (*seqnum + 1 > r)
536                         r = *seqnum + 1;
537
538                 *seqnum = r;
539         }
540
541         f->header->tail_entry_seqnum = htole64(r);
542
543         if (f->header->head_entry_seqnum == 0)
544                 f->header->head_entry_seqnum = htole64(r);
545
546         return r;
547 }
548
549 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
550         int r;
551         uint64_t p;
552         Object *tail, *o;
553         void *t;
554
555         assert(f);
556         assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
557         assert(size >= sizeof(ObjectHeader));
558         assert(offset);
559         assert(ret);
560
561         r = journal_file_set_online(f);
562         if (r < 0)
563                 return r;
564
565         p = le64toh(f->header->tail_object_offset);
566         if (p == 0)
567                 p = le64toh(f->header->header_size);
568         else {
569                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
570                 if (r < 0)
571                         return r;
572
573                 p += ALIGN64(le64toh(tail->object.size));
574         }
575
576         r = journal_file_allocate(f, p, size);
577         if (r < 0)
578                 return r;
579
580         r = journal_file_move_to(f, type, false, p, size, &t);
581         if (r < 0)
582                 return r;
583
584         o = (Object*) t;
585
586         zero(o->object);
587         o->object.type = type;
588         o->object.size = htole64(size);
589
590         f->header->tail_object_offset = htole64(p);
591         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
592
593         *ret = o;
594         *offset = p;
595
596         return 0;
597 }
598
599 static int journal_file_setup_data_hash_table(JournalFile *f) {
600         uint64_t s, p;
601         Object *o;
602         int r;
603
604         assert(f);
605
606         /* We estimate that we need 1 hash table entry per 768 of
607            journal file and we want to make sure we never get beyond
608            75% fill level. Calculate the hash table size for the
609            maximum file size based on these metrics. */
610
611         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
612         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
613                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
614
615         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
616
617         r = journal_file_append_object(f,
618                                        OBJECT_DATA_HASH_TABLE,
619                                        offsetof(Object, hash_table.items) + s,
620                                        &o, &p);
621         if (r < 0)
622                 return r;
623
624         memzero(o->hash_table.items, s);
625
626         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
627         f->header->data_hash_table_size = htole64(s);
628
629         return 0;
630 }
631
632 static int journal_file_setup_field_hash_table(JournalFile *f) {
633         uint64_t s, p;
634         Object *o;
635         int r;
636
637         assert(f);
638
639         /* We use a fixed size hash table for the fields as this
640          * number should grow very slowly only */
641
642         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
643         r = journal_file_append_object(f,
644                                        OBJECT_FIELD_HASH_TABLE,
645                                        offsetof(Object, hash_table.items) + s,
646                                        &o, &p);
647         if (r < 0)
648                 return r;
649
650         memzero(o->hash_table.items, s);
651
652         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
653         f->header->field_hash_table_size = htole64(s);
654
655         return 0;
656 }
657
658 static int journal_file_map_data_hash_table(JournalFile *f) {
659         uint64_t s, p;
660         void *t;
661         int r;
662
663         assert(f);
664
665         p = le64toh(f->header->data_hash_table_offset);
666         s = le64toh(f->header->data_hash_table_size);
667
668         r = journal_file_move_to(f,
669                                  OBJECT_DATA_HASH_TABLE,
670                                  true,
671                                  p, s,
672                                  &t);
673         if (r < 0)
674                 return r;
675
676         f->data_hash_table = t;
677         return 0;
678 }
679
680 static int journal_file_map_field_hash_table(JournalFile *f) {
681         uint64_t s, p;
682         void *t;
683         int r;
684
685         assert(f);
686
687         p = le64toh(f->header->field_hash_table_offset);
688         s = le64toh(f->header->field_hash_table_size);
689
690         r = journal_file_move_to(f,
691                                  OBJECT_FIELD_HASH_TABLE,
692                                  true,
693                                  p, s,
694                                  &t);
695         if (r < 0)
696                 return r;
697
698         f->field_hash_table = t;
699         return 0;
700 }
701
702 static int journal_file_link_field(
703                 JournalFile *f,
704                 Object *o,
705                 uint64_t offset,
706                 uint64_t hash) {
707
708         uint64_t p, h, m;
709         int r;
710
711         assert(f);
712         assert(o);
713         assert(offset > 0);
714
715         if (o->object.type != OBJECT_FIELD)
716                 return -EINVAL;
717
718         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
719         if (m <= 0)
720                 return -EBADMSG;
721
722         /* This might alter the window we are looking at */
723         o->field.next_hash_offset = o->field.head_data_offset = 0;
724
725         h = hash % m;
726         p = le64toh(f->field_hash_table[h].tail_hash_offset);
727         if (p == 0)
728                 f->field_hash_table[h].head_hash_offset = htole64(offset);
729         else {
730                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
731                 if (r < 0)
732                         return r;
733
734                 o->field.next_hash_offset = htole64(offset);
735         }
736
737         f->field_hash_table[h].tail_hash_offset = htole64(offset);
738
739         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
740                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
741
742         return 0;
743 }
744
745 static int journal_file_link_data(
746                 JournalFile *f,
747                 Object *o,
748                 uint64_t offset,
749                 uint64_t hash) {
750
751         uint64_t p, h, m;
752         int r;
753
754         assert(f);
755         assert(o);
756         assert(offset > 0);
757
758         if (o->object.type != OBJECT_DATA)
759                 return -EINVAL;
760
761         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
762         if (m <= 0)
763                 return -EBADMSG;
764
765         /* This might alter the window we are looking at */
766         o->data.next_hash_offset = o->data.next_field_offset = 0;
767         o->data.entry_offset = o->data.entry_array_offset = 0;
768         o->data.n_entries = 0;
769
770         h = hash % m;
771         p = le64toh(f->data_hash_table[h].tail_hash_offset);
772         if (p == 0)
773                 /* Only entry in the hash table is easy */
774                 f->data_hash_table[h].head_hash_offset = htole64(offset);
775         else {
776                 /* Move back to the previous data object, to patch in
777                  * pointer */
778
779                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
780                 if (r < 0)
781                         return r;
782
783                 o->data.next_hash_offset = htole64(offset);
784         }
785
786         f->data_hash_table[h].tail_hash_offset = htole64(offset);
787
788         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
789                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
790
791         return 0;
792 }
793
794 int journal_file_find_field_object_with_hash(
795                 JournalFile *f,
796                 const void *field, uint64_t size, uint64_t hash,
797                 Object **ret, uint64_t *offset) {
798
799         uint64_t p, osize, h, m;
800         int r;
801
802         assert(f);
803         assert(field && size > 0);
804
805         osize = offsetof(Object, field.payload) + size;
806
807         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
808
809         if (m <= 0)
810                 return -EBADMSG;
811
812         h = hash % m;
813         p = le64toh(f->field_hash_table[h].head_hash_offset);
814
815         while (p > 0) {
816                 Object *o;
817
818                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
819                 if (r < 0)
820                         return r;
821
822                 if (le64toh(o->field.hash) == hash &&
823                     le64toh(o->object.size) == osize &&
824                     memcmp(o->field.payload, field, size) == 0) {
825
826                         if (ret)
827                                 *ret = o;
828                         if (offset)
829                                 *offset = p;
830
831                         return 1;
832                 }
833
834                 p = le64toh(o->field.next_hash_offset);
835         }
836
837         return 0;
838 }
839
840 int journal_file_find_field_object(
841                 JournalFile *f,
842                 const void *field, uint64_t size,
843                 Object **ret, uint64_t *offset) {
844
845         uint64_t hash;
846
847         assert(f);
848         assert(field && size > 0);
849
850         hash = hash64(field, size);
851
852         return journal_file_find_field_object_with_hash(f,
853                                                         field, size, hash,
854                                                         ret, offset);
855 }
856
857 int journal_file_find_data_object_with_hash(
858                 JournalFile *f,
859                 const void *data, uint64_t size, uint64_t hash,
860                 Object **ret, uint64_t *offset) {
861
862         uint64_t p, osize, h, m;
863         int r;
864
865         assert(f);
866         assert(data || size == 0);
867
868         osize = offsetof(Object, data.payload) + size;
869
870         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
871         if (m <= 0)
872                 return -EBADMSG;
873
874         h = hash % m;
875         p = le64toh(f->data_hash_table[h].head_hash_offset);
876
877         while (p > 0) {
878                 Object *o;
879
880                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
881                 if (r < 0)
882                         return r;
883
884                 if (le64toh(o->data.hash) != hash)
885                         goto next;
886
887                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
888 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
889                         uint64_t l;
890                         size_t rsize;
891
892                         l = le64toh(o->object.size);
893                         if (l <= offsetof(Object, data.payload))
894                                 return -EBADMSG;
895
896                         l -= offsetof(Object, data.payload);
897
898                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
899                                             o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
900                         if (r < 0)
901                                 return r;
902
903                         if (rsize == size &&
904                             memcmp(f->compress_buffer, data, size) == 0) {
905
906                                 if (ret)
907                                         *ret = o;
908
909                                 if (offset)
910                                         *offset = p;
911
912                                 return 1;
913                         }
914 #else
915                         return -EPROTONOSUPPORT;
916 #endif
917                 } else if (le64toh(o->object.size) == osize &&
918                            memcmp(o->data.payload, data, size) == 0) {
919
920                         if (ret)
921                                 *ret = o;
922
923                         if (offset)
924                                 *offset = p;
925
926                         return 1;
927                 }
928
929         next:
930                 p = le64toh(o->data.next_hash_offset);
931         }
932
933         return 0;
934 }
935
936 int journal_file_find_data_object(
937                 JournalFile *f,
938                 const void *data, uint64_t size,
939                 Object **ret, uint64_t *offset) {
940
941         uint64_t hash;
942
943         assert(f);
944         assert(data || size == 0);
945
946         hash = hash64(data, size);
947
948         return journal_file_find_data_object_with_hash(f,
949                                                        data, size, hash,
950                                                        ret, offset);
951 }
952
953 static int journal_file_append_field(
954                 JournalFile *f,
955                 const void *field, uint64_t size,
956                 Object **ret, uint64_t *offset) {
957
958         uint64_t hash, p;
959         uint64_t osize;
960         Object *o;
961         int r;
962
963         assert(f);
964         assert(field && size > 0);
965
966         hash = hash64(field, size);
967
968         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
969         if (r < 0)
970                 return r;
971         else if (r > 0) {
972
973                 if (ret)
974                         *ret = o;
975
976                 if (offset)
977                         *offset = p;
978
979                 return 0;
980         }
981
982         osize = offsetof(Object, field.payload) + size;
983         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
984         if (r < 0)
985                 return r;
986
987         o->field.hash = htole64(hash);
988         memcpy(o->field.payload, field, size);
989
990         r = journal_file_link_field(f, o, p, hash);
991         if (r < 0)
992                 return r;
993
994         /* The linking might have altered the window, so let's
995          * refresh our pointer */
996         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
997         if (r < 0)
998                 return r;
999
1000 #ifdef HAVE_GCRYPT
1001         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1002         if (r < 0)
1003                 return r;
1004 #endif
1005
1006         if (ret)
1007                 *ret = o;
1008
1009         if (offset)
1010                 *offset = p;
1011
1012         return 0;
1013 }
1014
1015 static int journal_file_append_data(
1016                 JournalFile *f,
1017                 const void *data, uint64_t size,
1018                 Object **ret, uint64_t *offset) {
1019
1020         uint64_t hash, p;
1021         uint64_t osize;
1022         Object *o;
1023         int r, compression = 0;
1024         const void *eq;
1025
1026         assert(f);
1027         assert(data || size == 0);
1028
1029         hash = hash64(data, size);
1030
1031         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1032         if (r < 0)
1033                 return r;
1034         else if (r > 0) {
1035
1036                 if (ret)
1037                         *ret = o;
1038
1039                 if (offset)
1040                         *offset = p;
1041
1042                 return 0;
1043         }
1044
1045         osize = offsetof(Object, data.payload) + size;
1046         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1047         if (r < 0)
1048                 return r;
1049
1050         o->data.hash = htole64(hash);
1051
1052 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1053         if (f->compress_xz &&
1054             size >= COMPRESSION_SIZE_THRESHOLD) {
1055                 size_t rsize;
1056
1057                 compression = compress_blob(data, size, o->data.payload, &rsize);
1058
1059                 if (compression) {
1060                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1061                         o->object.flags |= compression;
1062
1063                         log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1064                                   size, rsize, object_compressed_to_string(compression));
1065                 }
1066         }
1067 #endif
1068
1069         if (!compression && size > 0)
1070                 memcpy(o->data.payload, data, size);
1071
1072         r = journal_file_link_data(f, o, p, hash);
1073         if (r < 0)
1074                 return r;
1075
1076         /* The linking might have altered the window, so let's
1077          * refresh our pointer */
1078         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1079         if (r < 0)
1080                 return r;
1081
1082         if (!data)
1083                 eq = NULL;
1084         else
1085                 eq = memchr(data, '=', size);
1086         if (eq && eq > data) {
1087                 Object *fo = NULL;
1088                 uint64_t fp;
1089
1090                 /* Create field object ... */
1091                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1092                 if (r < 0)
1093                         return r;
1094
1095                 /* ... and link it in. */
1096                 o->data.next_field_offset = fo->field.head_data_offset;
1097                 fo->field.head_data_offset = le64toh(p);
1098         }
1099
1100 #ifdef HAVE_GCRYPT
1101         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1102         if (r < 0)
1103                 return r;
1104 #endif
1105
1106         if (ret)
1107                 *ret = o;
1108
1109         if (offset)
1110                 *offset = p;
1111
1112         return 0;
1113 }
1114
1115 uint64_t journal_file_entry_n_items(Object *o) {
1116         assert(o);
1117
1118         if (o->object.type != OBJECT_ENTRY)
1119                 return 0;
1120
1121         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1122 }
1123
1124 uint64_t journal_file_entry_array_n_items(Object *o) {
1125         assert(o);
1126
1127         if (o->object.type != OBJECT_ENTRY_ARRAY)
1128                 return 0;
1129
1130         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1131 }
1132
1133 uint64_t journal_file_hash_table_n_items(Object *o) {
1134         assert(o);
1135
1136         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1137             o->object.type != OBJECT_FIELD_HASH_TABLE)
1138                 return 0;
1139
1140         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1141 }
1142
1143 static int link_entry_into_array(JournalFile *f,
1144                                  le64_t *first,
1145                                  le64_t *idx,
1146                                  uint64_t p) {
1147         int r;
1148         uint64_t n = 0, ap = 0, q, i, a, hidx;
1149         Object *o;
1150
1151         assert(f);
1152         assert(first);
1153         assert(idx);
1154         assert(p > 0);
1155
1156         a = le64toh(*first);
1157         i = hidx = le64toh(*idx);
1158         while (a > 0) {
1159
1160                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1161                 if (r < 0)
1162                         return r;
1163
1164                 n = journal_file_entry_array_n_items(o);
1165                 if (i < n) {
1166                         o->entry_array.items[i] = htole64(p);
1167                         *idx = htole64(hidx + 1);
1168                         return 0;
1169                 }
1170
1171                 i -= n;
1172                 ap = a;
1173                 a = le64toh(o->entry_array.next_entry_array_offset);
1174         }
1175
1176         if (hidx > n)
1177                 n = (hidx+1) * 2;
1178         else
1179                 n = n * 2;
1180
1181         if (n < 4)
1182                 n = 4;
1183
1184         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1185                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1186                                        &o, &q);
1187         if (r < 0)
1188                 return r;
1189
1190 #ifdef HAVE_GCRYPT
1191         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1192         if (r < 0)
1193                 return r;
1194 #endif
1195
1196         o->entry_array.items[i] = htole64(p);
1197
1198         if (ap == 0)
1199                 *first = htole64(q);
1200         else {
1201                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1202                 if (r < 0)
1203                         return r;
1204
1205                 o->entry_array.next_entry_array_offset = htole64(q);
1206         }
1207
1208         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1209                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1210
1211         *idx = htole64(hidx + 1);
1212
1213         return 0;
1214 }
1215
1216 static int link_entry_into_array_plus_one(JournalFile *f,
1217                                           le64_t *extra,
1218                                           le64_t *first,
1219                                           le64_t *idx,
1220                                           uint64_t p) {
1221
1222         int r;
1223
1224         assert(f);
1225         assert(extra);
1226         assert(first);
1227         assert(idx);
1228         assert(p > 0);
1229
1230         if (*idx == 0)
1231                 *extra = htole64(p);
1232         else {
1233                 le64_t i;
1234
1235                 i = htole64(le64toh(*idx) - 1);
1236                 r = link_entry_into_array(f, first, &i, p);
1237                 if (r < 0)
1238                         return r;
1239         }
1240
1241         *idx = htole64(le64toh(*idx) + 1);
1242         return 0;
1243 }
1244
1245 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1246         uint64_t p;
1247         int r;
1248         assert(f);
1249         assert(o);
1250         assert(offset > 0);
1251
1252         p = le64toh(o->entry.items[i].object_offset);
1253         if (p == 0)
1254                 return -EINVAL;
1255
1256         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1257         if (r < 0)
1258                 return r;
1259
1260         return link_entry_into_array_plus_one(f,
1261                                               &o->data.entry_offset,
1262                                               &o->data.entry_array_offset,
1263                                               &o->data.n_entries,
1264                                               offset);
1265 }
1266
1267 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1268         uint64_t n, i;
1269         int r;
1270
1271         assert(f);
1272         assert(o);
1273         assert(offset > 0);
1274
1275         if (o->object.type != OBJECT_ENTRY)
1276                 return -EINVAL;
1277
1278         __sync_synchronize();
1279
1280         /* Link up the entry itself */
1281         r = link_entry_into_array(f,
1282                                   &f->header->entry_array_offset,
1283                                   &f->header->n_entries,
1284                                   offset);
1285         if (r < 0)
1286                 return r;
1287
1288         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1289
1290         if (f->header->head_entry_realtime == 0)
1291                 f->header->head_entry_realtime = o->entry.realtime;
1292
1293         f->header->tail_entry_realtime = o->entry.realtime;
1294         f->header->tail_entry_monotonic = o->entry.monotonic;
1295
1296         f->tail_entry_monotonic_valid = true;
1297
1298         /* Link up the items */
1299         n = journal_file_entry_n_items(o);
1300         for (i = 0; i < n; i++) {
1301                 r = journal_file_link_entry_item(f, o, offset, i);
1302                 if (r < 0)
1303                         return r;
1304         }
1305
1306         return 0;
1307 }
1308
1309 static int journal_file_append_entry_internal(
1310                 JournalFile *f,
1311                 const dual_timestamp *ts,
1312                 uint64_t xor_hash,
1313                 const EntryItem items[], unsigned n_items,
1314                 uint64_t *seqnum,
1315                 Object **ret, uint64_t *offset) {
1316         uint64_t np;
1317         uint64_t osize;
1318         Object *o;
1319         int r;
1320
1321         assert(f);
1322         assert(items || n_items == 0);
1323         assert(ts);
1324
1325         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1326
1327         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1328         if (r < 0)
1329                 return r;
1330
1331         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1332         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1333         o->entry.realtime = htole64(ts->realtime);
1334         o->entry.monotonic = htole64(ts->monotonic);
1335         o->entry.xor_hash = htole64(xor_hash);
1336         o->entry.boot_id = f->header->boot_id;
1337
1338 #ifdef HAVE_GCRYPT
1339         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1340         if (r < 0)
1341                 return r;
1342 #endif
1343
1344         r = journal_file_link_entry(f, o, np);
1345         if (r < 0)
1346                 return r;
1347
1348         if (ret)
1349                 *ret = o;
1350
1351         if (offset)
1352                 *offset = np;
1353
1354         return 0;
1355 }
1356
1357 void journal_file_post_change(JournalFile *f) {
1358         assert(f);
1359
1360         /* inotify() does not receive IN_MODIFY events from file
1361          * accesses done via mmap(). After each access we hence
1362          * trigger IN_MODIFY by truncating the journal file to its
1363          * current size which triggers IN_MODIFY. */
1364
1365         __sync_synchronize();
1366
1367         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1368                 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1369 }
1370
1371 static int entry_item_cmp(const void *_a, const void *_b) {
1372         const EntryItem *a = _a, *b = _b;
1373
1374         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1375                 return -1;
1376         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1377                 return 1;
1378         return 0;
1379 }
1380
1381 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1382         unsigned i;
1383         EntryItem *items;
1384         int r;
1385         uint64_t xor_hash = 0;
1386         struct dual_timestamp _ts;
1387
1388         assert(f);
1389         assert(iovec || n_iovec == 0);
1390
1391         if (!ts) {
1392                 dual_timestamp_get(&_ts);
1393                 ts = &_ts;
1394         }
1395
1396         if (f->tail_entry_monotonic_valid &&
1397             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1398                 return -EINVAL;
1399
1400 #ifdef HAVE_GCRYPT
1401         r = journal_file_maybe_append_tag(f, ts->realtime);
1402         if (r < 0)
1403                 return r;
1404 #endif
1405
1406         /* alloca() can't take 0, hence let's allocate at least one */
1407         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1408
1409         for (i = 0; i < n_iovec; i++) {
1410                 uint64_t p;
1411                 Object *o;
1412
1413                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1414                 if (r < 0)
1415                         return r;
1416
1417                 xor_hash ^= le64toh(o->data.hash);
1418                 items[i].object_offset = htole64(p);
1419                 items[i].hash = o->data.hash;
1420         }
1421
1422         /* Order by the position on disk, in order to improve seek
1423          * times for rotating media. */
1424         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1425
1426         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1427
1428         /* If the memory mapping triggered a SIGBUS then we return an
1429          * IO error and ignore the error code passed down to us, since
1430          * it is very likely just an effect of a nullified replacement
1431          * mapping page */
1432
1433         if (mmap_cache_got_sigbus(f->mmap, f->fd))
1434                 r = -EIO;
1435
1436         journal_file_post_change(f);
1437
1438         return r;
1439 }
1440
/* One cached position inside a chain of entry array objects. Items are
 * stored in an OrderedHashmap keyed by the 'first' member (see
 * chain_cache_put()), and allow lookups in a chain to resume at the array
 * that was used last instead of walking the chain from its start. All
 * values are kept in host byte order. */
typedef struct ChainCacheItem {
        uint64_t first; /* the array at the beginning of the chain; also the hashmap key */
        uint64_t array; /* the cached array */
        uint64_t begin; /* the first item in the cached array */
        uint64_t total; /* the total number of items in all arrays before this one in the chain */
        uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
} ChainCacheItem;
1448
1449 static void chain_cache_put(
1450                 OrderedHashmap *h,
1451                 ChainCacheItem *ci,
1452                 uint64_t first,
1453                 uint64_t array,
1454                 uint64_t begin,
1455                 uint64_t total,
1456                 uint64_t last_index) {
1457
1458         if (!ci) {
1459                 /* If the chain item to cache for this chain is the
1460                  * first one it's not worth caching anything */
1461                 if (array == first)
1462                         return;
1463
1464                 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1465                         ci = ordered_hashmap_steal_first(h);
1466                         assert(ci);
1467                 } else {
1468                         ci = new(ChainCacheItem, 1);
1469                         if (!ci)
1470                                 return;
1471                 }
1472
1473                 ci->first = first;
1474
1475                 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1476                         free(ci);
1477                         return;
1478                 }
1479         } else
1480                 assert(ci->first == first);
1481
1482         ci->array = array;
1483         ci->begin = begin;
1484         ci->total = total;
1485         ci->last_index = last_index;
1486 }
1487
1488 static int generic_array_get(
1489                 JournalFile *f,
1490                 uint64_t first,
1491                 uint64_t i,
1492                 Object **ret, uint64_t *offset) {
1493
1494         Object *o;
1495         uint64_t p = 0, a, t = 0;
1496         int r;
1497         ChainCacheItem *ci;
1498
1499         assert(f);
1500
1501         a = first;
1502
1503         /* Try the chain cache first */
1504         ci = ordered_hashmap_get(f->chain_cache, &first);
1505         if (ci && i > ci->total) {
1506                 a = ci->array;
1507                 i -= ci->total;
1508                 t = ci->total;
1509         }
1510
1511         while (a > 0) {
1512                 uint64_t k;
1513
1514                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1515                 if (r < 0)
1516                         return r;
1517
1518                 k = journal_file_entry_array_n_items(o);
1519                 if (i < k) {
1520                         p = le64toh(o->entry_array.items[i]);
1521                         goto found;
1522                 }
1523
1524                 i -= k;
1525                 t += k;
1526                 a = le64toh(o->entry_array.next_entry_array_offset);
1527         }
1528
1529         return 0;
1530
1531 found:
1532         /* Let's cache this item for the next invocation */
1533         chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1534
1535         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1536         if (r < 0)
1537                 return r;
1538
1539         if (ret)
1540                 *ret = o;
1541
1542         if (offset)
1543                 *offset = p;
1544
1545         return 1;
1546 }
1547
1548 static int generic_array_get_plus_one(
1549                 JournalFile *f,
1550                 uint64_t extra,
1551                 uint64_t first,
1552                 uint64_t i,
1553                 Object **ret, uint64_t *offset) {
1554
1555         Object *o;
1556
1557         assert(f);
1558
1559         if (i == 0) {
1560                 int r;
1561
1562                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1563                 if (r < 0)
1564                         return r;
1565
1566                 if (ret)
1567                         *ret = o;
1568
1569                 if (offset)
1570                         *offset = extra;
1571
1572                 return 1;
1573         }
1574
1575         return generic_array_get(f, first, i-1, ret, offset);
1576 }
1577
/* Results of the test_object() callbacks used when bisecting entry
 * arrays. They tell how the probed entry relates to the needle; negative
 * return values of the callbacks are errors instead. */
enum {
        /* the probed entry matches the needle */
        TEST_FOUND,
        /* the probed entry's value is smaller than the needle */
        TEST_LEFT,
        /* the probed entry's value is larger than the needle */
        TEST_RIGHT
};
1583
/* Bisects the first 'n' items of the chain of entry array objects that
 * starts at offset 'first', looking for 'needle'.
 *
 * 'test_object' is invoked with the offset stored in an array item and
 * the needle, and returns TEST_FOUND, TEST_LEFT or TEST_RIGHT, or a
 * negative value on error. A TEST_FOUND result is handled like TEST_RIGHT
 * for DIRECTION_DOWN and like TEST_LEFT for every other direction. The
 * search locates the first item whose (mapped) result is TEST_RIGHT; for
 * DIRECTION_UP the item right before that one is reported instead.
 *
 * Returns 1 if an entry was located, in which case the entry object is
 * stored in *ret, its offset in *offset and its index within the chain in
 * *idx (each only if non-NULL). Returns 0 if there is no suitable entry,
 * -EBADMSG if a probed array item is zero, and other negative values on
 * other errors. */
static int generic_array_bisect(
                JournalFile *f,
                uint64_t first,
                uint64_t n,
                uint64_t needle,
                int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
                direction_t direction,
                Object **ret,
                uint64_t *offset,
                uint64_t *idx) {

        uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
        bool subtract_one = false;
        Object *o, *array = NULL;
        int r;
        ChainCacheItem *ci;

        assert(f);
        assert(test_object);

        /* Start with the first array in the chain */
        a = first;

        ci = ordered_hashmap_get(f->chain_cache, &first);
        if (ci && n > ci->total) {
                /* Ah, we have iterated this bisection array chain
                 * previously! Let's see if we can skip ahead in the
                 * chain, as far as the last time. But we can't jump
                 * backwards in the chain, so let's check that
                 * first. */

                r = test_object(f, ci->begin, needle);
                if (r < 0)
                        return r;

                if (r == TEST_LEFT) {
                        /* OK, what we are looking for is right of the
                         * begin of this EntryArray, so let's jump
                         * straight to previously cached array in the
                         * chain */

                        a = ci->array;
                        n -= ci->total;
                        t = ci->total;
                        last_index = ci->last_index;
                }
        }

        /* Walk the chain array by array. Within the loop 'n' is the
         * number of items still to consider and 't' the number of items
         * in the arrays already skipped. */
        while (a > 0) {
                uint64_t left, right, k, lp;

                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
                if (r < 0)
                        return r;

                k = journal_file_entry_array_n_items(array);
                right = MIN(k, n);
                if (right <= 0)
                        return 0;

                /* Probe the last usable item of this array first: only
                 * if it tests as TEST_RIGHT the position we look for is
                 * within this array, otherwise we move on to the next
                 * array in the chain. */
                i = right - 1;
                lp = p = le64toh(array->entry_array.items[i]);
                if (p <= 0)
                        return -EBADMSG;

                r = test_object(f, p, needle);
                if (r < 0)
                        return r;

                if (r == TEST_FOUND)
                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                if (r == TEST_RIGHT) {
                        left = 0;
                        right -= 1;

                        if (last_index != (uint64_t) -1) {
                                assert(last_index <= right);

                                /* If we cached the last index we
                                 * looked at, let's try to not to jump
                                 * too wildly around and see if we can
                                 * limit the range to look at early to
                                 * the immediate neighbors of the last
                                 * index we looked at. */

                                if (last_index > 0) {
                                        uint64_t x = last_index - 1;

                                        p = le64toh(array->entry_array.items[x]);
                                        if (p <= 0)
                                                return -EBADMSG;

                                        r = test_object(f, p, needle);
                                        if (r < 0)
                                                return r;

                                        if (r == TEST_FOUND)
                                                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                        if (r == TEST_RIGHT)
                                                right = x;
                                        else
                                                left = x + 1;
                                }

                                if (last_index < right) {
                                        uint64_t y = last_index + 1;

                                        p = le64toh(array->entry_array.items[y]);
                                        if (p <= 0)
                                                return -EBADMSG;

                                        r = test_object(f, p, needle);
                                        if (r < 0)
                                                return r;

                                        if (r == TEST_FOUND)
                                                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                        if (r == TEST_RIGHT)
                                                right = y;
                                        else
                                                left = y + 1;
                                }
                        }

                        /* Plain bisection: narrow [left, right] until
                         * both meet at the first item that tests as
                         * TEST_RIGHT. */
                        for (;;) {
                                if (left == right) {
                                        if (direction == DIRECTION_UP)
                                                subtract_one = true;

                                        i = left;
                                        goto found;
                                }

                                assert(left < right);
                                i = (left + right) / 2;

                                p = le64toh(array->entry_array.items[i]);
                                if (p <= 0)
                                        return -EBADMSG;

                                r = test_object(f, p, needle);
                                if (r < 0)
                                        return r;

                                if (r == TEST_FOUND)
                                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                if (r == TEST_RIGHT)
                                        right = i;
                                else
                                        left = i + 1;
                        }
                }

                /* This array covered all remaining items and none of
                 * them tested as TEST_RIGHT: for DIRECTION_UP the last
                 * item considered is the result (i = n plus
                 * subtract_one), otherwise there is no result. */
                if (k >= n) {
                        if (direction == DIRECTION_UP) {
                                i = n;
                                subtract_one = true;
                                goto found;
                        }

                        return 0;
                }

                /* Remember the last item of this array, in case we
                 * need to step back into it from the next one */
                last_p = lp;

                n -= k;
                t += k;
                last_index = (uint64_t) -1;
                a = le64toh(array->entry_array.next_entry_array_offset);
        }

        return 0;

found:
        /* Stepping back from the very first item of the chain: there is
         * nothing before it */
        if (subtract_one && t == 0 && i == 0)
                return 0;

        /* Let's cache this item for the next invocation */
        chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);

        /* When stepping back from index 0 the result lives in the
         * previous array, whose last item was saved in last_p.
         * NOTE(review): if the walk started at a cached array (t > 0)
         * and ends up here with i == 0, last_p still has its initial
         * value 0 -- confirm this case is handled correctly. */
        if (subtract_one && i == 0)
                p = last_p;
        else if (subtract_one)
                p = le64toh(array->entry_array.items[i-1]);
        else
                p = le64toh(array->entry_array.items[i]);

        r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
        if (r < 0)
                return r;

        if (ret)
                *ret = o;

        if (offset)
                *offset = p;

        if (idx)
                *idx = t + i + (subtract_one ? -1 : 0);

        return 1;
}
1790
1791 static int generic_array_bisect_plus_one(
1792                 JournalFile *f,
1793                 uint64_t extra,
1794                 uint64_t first,
1795                 uint64_t n,
1796                 uint64_t needle,
1797                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1798                 direction_t direction,
1799                 Object **ret,
1800                 uint64_t *offset,
1801                 uint64_t *idx) {
1802
1803         int r;
1804         bool step_back = false;
1805         Object *o;
1806
1807         assert(f);
1808         assert(test_object);
1809
1810         if (n <= 0)
1811                 return 0;
1812
1813         /* This bisects the array in object 'first', but first checks
1814          * an extra  */
1815         r = test_object(f, extra, needle);
1816         if (r < 0)
1817                 return r;
1818
1819         if (r == TEST_FOUND)
1820                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1821
1822         /* if we are looking with DIRECTION_UP then we need to first
1823            see if in the actual array there is a matching entry, and
1824            return the last one of that. But if there isn't any we need
1825            to return this one. Hence remember this, and return it
1826            below. */
1827         if (r == TEST_LEFT)
1828                 step_back = direction == DIRECTION_UP;
1829
1830         if (r == TEST_RIGHT) {
1831                 if (direction == DIRECTION_DOWN)
1832                         goto found;
1833                 else
1834                         return 0;
1835         }
1836
1837         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1838
1839         if (r == 0 && step_back)
1840                 goto found;
1841
1842         if (r > 0 && idx)
1843                 (*idx) ++;
1844
1845         return r;
1846
1847 found:
1848         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1849         if (r < 0)
1850                 return r;
1851
1852         if (ret)
1853                 *ret = o;
1854
1855         if (offset)
1856                 *offset = extra;
1857
1858         if (idx)
1859                 *idx = 0;
1860
1861         return 1;
1862 }
1863
1864 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1865         assert(f);
1866         assert(p > 0);
1867
1868         if (p == needle)
1869                 return TEST_FOUND;
1870         else if (p < needle)
1871                 return TEST_LEFT;
1872         else
1873                 return TEST_RIGHT;
1874 }
1875
1876 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1877         Object *o;
1878         int r;
1879
1880         assert(f);
1881         assert(p > 0);
1882
1883         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1884         if (r < 0)
1885                 return r;
1886
1887         if (le64toh(o->entry.seqnum) == needle)
1888                 return TEST_FOUND;
1889         else if (le64toh(o->entry.seqnum) < needle)
1890                 return TEST_LEFT;
1891         else
1892                 return TEST_RIGHT;
1893 }
1894
1895 int journal_file_move_to_entry_by_seqnum(
1896                 JournalFile *f,
1897                 uint64_t seqnum,
1898                 direction_t direction,
1899                 Object **ret,
1900                 uint64_t *offset) {
1901
1902         return generic_array_bisect(f,
1903                                     le64toh(f->header->entry_array_offset),
1904                                     le64toh(f->header->n_entries),
1905                                     seqnum,
1906                                     test_object_seqnum,
1907                                     direction,
1908                                     ret, offset, NULL);
1909 }
1910
1911 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1912         Object *o;
1913         int r;
1914
1915         assert(f);
1916         assert(p > 0);
1917
1918         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1919         if (r < 0)
1920                 return r;
1921
1922         if (le64toh(o->entry.realtime) == needle)
1923                 return TEST_FOUND;
1924         else if (le64toh(o->entry.realtime) < needle)
1925                 return TEST_LEFT;
1926         else
1927                 return TEST_RIGHT;
1928 }
1929
1930 int journal_file_move_to_entry_by_realtime(
1931                 JournalFile *f,
1932                 uint64_t realtime,
1933                 direction_t direction,
1934                 Object **ret,
1935                 uint64_t *offset) {
1936
1937         return generic_array_bisect(f,
1938                                     le64toh(f->header->entry_array_offset),
1939                                     le64toh(f->header->n_entries),
1940                                     realtime,
1941                                     test_object_realtime,
1942                                     direction,
1943                                     ret, offset, NULL);
1944 }
1945
1946 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1947         Object *o;
1948         int r;
1949
1950         assert(f);
1951         assert(p > 0);
1952
1953         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1954         if (r < 0)
1955                 return r;
1956
1957         if (le64toh(o->entry.monotonic) == needle)
1958                 return TEST_FOUND;
1959         else if (le64toh(o->entry.monotonic) < needle)
1960                 return TEST_LEFT;
1961         else
1962                 return TEST_RIGHT;
1963 }
1964
1965 static int find_data_object_by_boot_id(
1966                 JournalFile *f,
1967                 sd_id128_t boot_id,
1968                 Object **o,
1969                 uint64_t *b) {
1970
1971         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1972
1973         sd_id128_to_string(boot_id, t + 9);
1974         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1975 }
1976
1977 int journal_file_move_to_entry_by_monotonic(
1978                 JournalFile *f,
1979                 sd_id128_t boot_id,
1980                 uint64_t monotonic,
1981                 direction_t direction,
1982                 Object **ret,
1983                 uint64_t *offset) {
1984
1985         Object *o;
1986         int r;
1987
1988         assert(f);
1989
1990         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1991         if (r < 0)
1992                 return r;
1993         if (r == 0)
1994                 return -ENOENT;
1995
1996         return generic_array_bisect_plus_one(f,
1997                                              le64toh(o->data.entry_offset),
1998                                              le64toh(o->data.entry_array_offset),
1999                                              le64toh(o->data.n_entries),
2000                                              monotonic,
2001                                              test_object_monotonic,
2002                                              direction,
2003                                              ret, offset, NULL);
2004 }
2005
2006 void journal_file_reset_location(JournalFile *f) {
2007         f->location_type = LOCATION_HEAD;
2008         f->current_offset = 0;
2009         f->current_seqnum = 0;
2010         f->current_realtime = 0;
2011         f->current_monotonic = 0;
2012         zero(f->current_boot_id);
2013         f->current_xor_hash = 0;
2014 }
2015
2016 void journal_file_save_location(JournalFile *f, direction_t direction, Object *o, uint64_t offset) {
2017         f->last_direction = direction;
2018         f->location_type = LOCATION_SEEK;
2019         f->current_offset = offset;
2020         f->current_seqnum = le64toh(o->entry.seqnum);
2021         f->current_realtime = le64toh(o->entry.realtime);
2022         f->current_monotonic = le64toh(o->entry.monotonic);
2023         f->current_boot_id = o->entry.boot_id;
2024         f->current_xor_hash = le64toh(o->entry.xor_hash);
2025 }
2026
2027 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2028         assert(af);
2029         assert(bf);
2030         assert(af->location_type == LOCATION_SEEK);
2031         assert(bf->location_type == LOCATION_SEEK);
2032
2033         /* If contents and timestamps match, these entries are
2034          * identical, even if the seqnum does not match */
2035         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2036             af->current_monotonic == bf->current_monotonic &&
2037             af->current_realtime == bf->current_realtime &&
2038             af->current_xor_hash == bf->current_xor_hash)
2039                 return 0;
2040
2041         if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2042
2043                 /* If this is from the same seqnum source, compare
2044                  * seqnums */
2045                 if (af->current_seqnum < bf->current_seqnum)
2046                         return -1;
2047                 if (af->current_seqnum > bf->current_seqnum)
2048                         return 1;
2049
2050                 /* Wow! This is weird, different data but the same
2051                  * seqnums? Something is borked, but let's make the
2052                  * best of it and compare by time. */
2053         }
2054
2055         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2056
2057                 /* If the boot id matches, compare monotonic time */
2058                 if (af->current_monotonic < bf->current_monotonic)
2059                         return -1;
2060                 if (af->current_monotonic > bf->current_monotonic)
2061                         return 1;
2062         }
2063
2064         /* Otherwise, compare UTC time */
2065         if (af->current_realtime < bf->current_realtime)
2066                 return -1;
2067         if (af->current_realtime > bf->current_realtime)
2068                 return 1;
2069
2070         /* Finally, compare by contents */
2071         if (af->current_xor_hash < bf->current_xor_hash)
2072                 return -1;
2073         if (af->current_xor_hash > bf->current_xor_hash)
2074                 return 1;
2075
2076         return 0;
2077 }
2078
2079 int journal_file_next_entry(
2080                 JournalFile *f,
2081                 uint64_t p,
2082                 direction_t direction,
2083                 Object **ret, uint64_t *offset) {
2084
2085         uint64_t i, n, ofs;
2086         int r;
2087
2088         assert(f);
2089
2090         n = le64toh(f->header->n_entries);
2091         if (n <= 0)
2092                 return 0;
2093
2094         if (p == 0)
2095                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2096         else {
2097                 r = generic_array_bisect(f,
2098                                          le64toh(f->header->entry_array_offset),
2099                                          le64toh(f->header->n_entries),
2100                                          p,
2101                                          test_object_offset,
2102                                          DIRECTION_DOWN,
2103                                          NULL, NULL,
2104                                          &i);
2105                 if (r <= 0)
2106                         return r;
2107
2108                 if (direction == DIRECTION_DOWN) {
2109                         if (i >= n - 1)
2110                                 return 0;
2111
2112                         i++;
2113                 } else {
2114                         if (i <= 0)
2115                                 return 0;
2116
2117                         i--;
2118                 }
2119         }
2120
2121         /* And jump to it */
2122         r = generic_array_get(f,
2123                               le64toh(f->header->entry_array_offset),
2124                               i,
2125                               ret, &ofs);
2126         if (r <= 0)
2127                 return r;
2128
2129         if (p > 0 &&
2130             (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2131                 log_debug("%s: entry array corrupted at entry %"PRIu64,
2132                           f->path, i);
2133                 return -EBADMSG;
2134         }
2135
2136         if (offset)
2137                 *offset = ofs;
2138
2139         return 1;
2140 }
2141
2142 int journal_file_next_entry_for_data(
2143                 JournalFile *f,
2144                 Object *o, uint64_t p,
2145                 uint64_t data_offset,
2146                 direction_t direction,
2147                 Object **ret, uint64_t *offset) {
2148
2149         uint64_t n, i;
2150         int r;
2151         Object *d;
2152
2153         assert(f);
2154         assert(p > 0 || !o);
2155
2156         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2157         if (r < 0)
2158                 return r;
2159
2160         n = le64toh(d->data.n_entries);
2161         if (n <= 0)
2162                 return n;
2163
2164         if (!o)
2165                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2166         else {
2167                 if (o->object.type != OBJECT_ENTRY)
2168                         return -EINVAL;
2169
2170                 r = generic_array_bisect_plus_one(f,
2171                                                   le64toh(d->data.entry_offset),
2172                                                   le64toh(d->data.entry_array_offset),
2173                                                   le64toh(d->data.n_entries),
2174                                                   p,
2175                                                   test_object_offset,
2176                                                   DIRECTION_DOWN,
2177                                                   NULL, NULL,
2178                                                   &i);
2179
2180                 if (r <= 0)
2181                         return r;
2182
2183                 if (direction == DIRECTION_DOWN) {
2184                         if (i >= n - 1)
2185                                 return 0;
2186
2187                         i++;
2188                 } else {
2189                         if (i <= 0)
2190                                 return 0;
2191
2192                         i--;
2193                 }
2194
2195         }
2196
2197         return generic_array_get_plus_one(f,
2198                                           le64toh(d->data.entry_offset),
2199                                           le64toh(d->data.entry_array_offset),
2200                                           i,
2201                                           ret, offset);
2202 }
2203
2204 int journal_file_move_to_entry_by_offset_for_data(
2205                 JournalFile *f,
2206                 uint64_t data_offset,
2207                 uint64_t p,
2208                 direction_t direction,
2209                 Object **ret, uint64_t *offset) {
2210
2211         int r;
2212         Object *d;
2213
2214         assert(f);
2215
2216         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2217         if (r < 0)
2218                 return r;
2219
2220         return generic_array_bisect_plus_one(f,
2221                                              le64toh(d->data.entry_offset),
2222                                              le64toh(d->data.entry_array_offset),
2223                                              le64toh(d->data.n_entries),
2224                                              p,
2225                                              test_object_offset,
2226                                              direction,
2227                                              ret, offset, NULL);
2228 }
2229
2230 int journal_file_move_to_entry_by_monotonic_for_data(
2231                 JournalFile *f,
2232                 uint64_t data_offset,
2233                 sd_id128_t boot_id,
2234                 uint64_t monotonic,
2235                 direction_t direction,
2236                 Object **ret, uint64_t *offset) {
2237
2238         Object *o, *d;
2239         int r;
2240         uint64_t b, z;
2241
2242         assert(f);
2243
2244         /* First, seek by time */
2245         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2246         if (r < 0)
2247                 return r;
2248         if (r == 0)
2249                 return -ENOENT;
2250
2251         r = generic_array_bisect_plus_one(f,
2252                                           le64toh(o->data.entry_offset),
2253                                           le64toh(o->data.entry_array_offset),
2254                                           le64toh(o->data.n_entries),
2255                                           monotonic,
2256                                           test_object_monotonic,
2257                                           direction,
2258                                           NULL, &z, NULL);
2259         if (r <= 0)
2260                 return r;
2261
2262         /* And now, continue seeking until we find an entry that
2263          * exists in both bisection arrays */
2264
2265         for (;;) {
2266                 Object *qo;
2267                 uint64_t p, q;
2268
2269                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2270                 if (r < 0)
2271                         return r;
2272
2273                 r = generic_array_bisect_plus_one(f,
2274                                                   le64toh(d->data.entry_offset),
2275                                                   le64toh(d->data.entry_array_offset),
2276                                                   le64toh(d->data.n_entries),
2277                                                   z,
2278                                                   test_object_offset,
2279                                                   direction,
2280                                                   NULL, &p, NULL);
2281                 if (r <= 0)
2282                         return r;
2283
2284                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2285                 if (r < 0)
2286                         return r;
2287
2288                 r = generic_array_bisect_plus_one(f,
2289                                                   le64toh(o->data.entry_offset),
2290                                                   le64toh(o->data.entry_array_offset),
2291                                                   le64toh(o->data.n_entries),
2292                                                   p,
2293                                                   test_object_offset,
2294                                                   direction,
2295                                                   &qo, &q, NULL);
2296
2297                 if (r <= 0)
2298                         return r;
2299
2300                 if (p == q) {
2301                         if (ret)
2302                                 *ret = qo;
2303                         if (offset)
2304                                 *offset = q;
2305
2306                         return 1;
2307                 }
2308
2309                 z = q;
2310         }
2311 }
2312
2313 int journal_file_move_to_entry_by_seqnum_for_data(
2314                 JournalFile *f,
2315                 uint64_t data_offset,
2316                 uint64_t seqnum,
2317                 direction_t direction,
2318                 Object **ret, uint64_t *offset) {
2319
2320         Object *d;
2321         int r;
2322
2323         assert(f);
2324
2325         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2326         if (r < 0)
2327                 return r;
2328
2329         return generic_array_bisect_plus_one(f,
2330                                              le64toh(d->data.entry_offset),
2331                                              le64toh(d->data.entry_array_offset),
2332                                              le64toh(d->data.n_entries),
2333                                              seqnum,
2334                                              test_object_seqnum,
2335                                              direction,
2336                                              ret, offset, NULL);
2337 }
2338
2339 int journal_file_move_to_entry_by_realtime_for_data(
2340                 JournalFile *f,
2341                 uint64_t data_offset,
2342                 uint64_t realtime,
2343                 direction_t direction,
2344                 Object **ret, uint64_t *offset) {
2345
2346         Object *d;
2347         int r;
2348
2349         assert(f);
2350
2351         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2352         if (r < 0)
2353                 return r;
2354
2355         return generic_array_bisect_plus_one(f,
2356                                              le64toh(d->data.entry_offset),
2357                                              le64toh(d->data.entry_array_offset),
2358                                              le64toh(d->data.n_entries),
2359                                              realtime,
2360                                              test_object_realtime,
2361                                              direction,
2362                                              ret, offset, NULL);
2363 }
2364
2365 void journal_file_dump(JournalFile *f) {
2366         Object *o;
2367         int r;
2368         uint64_t p;
2369
2370         assert(f);
2371
2372         journal_file_print_header(f);
2373
2374         p = le64toh(f->header->header_size);
2375         while (p != 0) {
2376                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2377                 if (r < 0)
2378                         goto fail;
2379
2380                 switch (o->object.type) {
2381
2382                 case OBJECT_UNUSED:
2383                         printf("Type: OBJECT_UNUSED\n");
2384                         break;
2385
2386                 case OBJECT_DATA:
2387                         printf("Type: OBJECT_DATA\n");
2388                         break;
2389
2390                 case OBJECT_FIELD:
2391                         printf("Type: OBJECT_FIELD\n");
2392                         break;
2393
2394                 case OBJECT_ENTRY:
2395                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2396                                le64toh(o->entry.seqnum),
2397                                le64toh(o->entry.monotonic),
2398                                le64toh(o->entry.realtime));
2399                         break;
2400
2401                 case OBJECT_FIELD_HASH_TABLE:
2402                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2403                         break;
2404
2405                 case OBJECT_DATA_HASH_TABLE:
2406                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2407                         break;
2408
2409                 case OBJECT_ENTRY_ARRAY:
2410                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2411                         break;
2412
2413                 case OBJECT_TAG:
2414                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2415                                le64toh(o->tag.seqnum),
2416                                le64toh(o->tag.epoch));
2417                         break;
2418
2419                 default:
2420                         printf("Type: unknown (%i)\n", o->object.type);
2421                         break;
2422                 }
2423
2424                 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2425                         printf("Flags: %s\n",
2426                                object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2427
2428                 if (p == le64toh(f->header->tail_object_offset))
2429                         p = 0;
2430                 else
2431                         p = p + ALIGN64(le64toh(o->object.size));
2432         }
2433
2434         return;
2435 fail:
2436         log_error("File corrupt");
2437 }
2438
2439 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2440         const char *x;
2441
2442         x = format_timestamp(buf, l, t);
2443         if (x)
2444                 return x;
2445         return " --- ";
2446 }
2447
2448 void journal_file_print_header(JournalFile *f) {
2449         char a[33], b[33], c[33], d[33];
2450         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2451         struct stat st;
2452         char bytes[FORMAT_BYTES_MAX];
2453
2454         assert(f);
2455
2456         printf("File Path: %s\n"
2457                "File ID: %s\n"
2458                "Machine ID: %s\n"
2459                "Boot ID: %s\n"
2460                "Sequential Number ID: %s\n"
2461                "State: %s\n"
2462                "Compatible Flags:%s%s\n"
2463                "Incompatible Flags:%s%s%s\n"
2464                "Header size: %"PRIu64"\n"
2465                "Arena size: %"PRIu64"\n"
2466                "Data Hash Table Size: %"PRIu64"\n"
2467                "Field Hash Table Size: %"PRIu64"\n"
2468                "Rotate Suggested: %s\n"
2469                "Head Sequential Number: %"PRIu64"\n"
2470                "Tail Sequential Number: %"PRIu64"\n"
2471                "Head Realtime Timestamp: %s\n"
2472                "Tail Realtime Timestamp: %s\n"
2473                "Tail Monotonic Timestamp: %s\n"
2474                "Objects: %"PRIu64"\n"
2475                "Entry Objects: %"PRIu64"\n",
2476                f->path,
2477                sd_id128_to_string(f->header->file_id, a),
2478                sd_id128_to_string(f->header->machine_id, b),
2479                sd_id128_to_string(f->header->boot_id, c),
2480                sd_id128_to_string(f->header->seqnum_id, d),
2481                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2482                f->header->state == STATE_ONLINE ? "ONLINE" :
2483                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2484                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2485                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2486                JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2487                JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2488                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2489                le64toh(f->header->header_size),
2490                le64toh(f->header->arena_size),
2491                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2492                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2493                yes_no(journal_file_rotate_suggested(f, 0)),
2494                le64toh(f->header->head_entry_seqnum),
2495                le64toh(f->header->tail_entry_seqnum),
2496                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2497                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2498                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2499                le64toh(f->header->n_objects),
2500                le64toh(f->header->n_entries));
2501
2502         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2503                 printf("Data Objects: %"PRIu64"\n"
2504                        "Data Hash Table Fill: %.1f%%\n",
2505                        le64toh(f->header->n_data),
2506                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2507
2508         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2509                 printf("Field Objects: %"PRIu64"\n"
2510                        "Field Hash Table Fill: %.1f%%\n",
2511                        le64toh(f->header->n_fields),
2512                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2513
2514         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2515                 printf("Tag Objects: %"PRIu64"\n",
2516                        le64toh(f->header->n_tags));
2517         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2518                 printf("Entry Array Objects: %"PRIu64"\n",
2519                        le64toh(f->header->n_entry_arrays));
2520
2521         if (fstat(f->fd, &st) >= 0)
2522                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2523 }
2524
2525 int journal_file_open(
2526                 const char *fname,
2527                 int flags,
2528                 mode_t mode,
2529                 bool compress,
2530                 bool seal,
2531                 JournalMetrics *metrics,
2532                 MMapCache *mmap_cache,
2533                 JournalFile *template,
2534                 JournalFile **ret) {
2535
2536         bool newly_created = false;
2537         JournalFile *f;
2538         void *h;
2539         int r;
2540
2541         assert(fname);
2542         assert(ret);
2543
2544         if ((flags & O_ACCMODE) != O_RDONLY &&
2545             (flags & O_ACCMODE) != O_RDWR)
2546                 return -EINVAL;
2547
2548         if (!endswith(fname, ".journal") &&
2549             !endswith(fname, ".journal~"))
2550                 return -EINVAL;
2551
2552         f = new0(JournalFile, 1);
2553         if (!f)
2554                 return -ENOMEM;
2555
2556         f->fd = -1;
2557         f->mode = mode;
2558
2559         f->flags = flags;
2560         f->prot = prot_from_flags(flags);
2561         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2562 #if defined(HAVE_LZ4)
2563         f->compress_lz4 = compress;
2564 #elif defined(HAVE_XZ)
2565         f->compress_xz = compress;
2566 #endif
2567 #ifdef HAVE_GCRYPT
2568         f->seal = seal;
2569 #endif
2570
2571         if (mmap_cache)
2572                 f->mmap = mmap_cache_ref(mmap_cache);
2573         else {
2574                 f->mmap = mmap_cache_new();
2575                 if (!f->mmap) {
2576                         r = -ENOMEM;
2577                         goto fail;
2578                 }
2579         }
2580
2581         f->path = strdup(fname);
2582         if (!f->path) {
2583                 r = -ENOMEM;
2584                 goto fail;
2585         }
2586
2587         f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2588         if (!f->chain_cache) {
2589                 r = -ENOMEM;
2590                 goto fail;
2591         }
2592
2593         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2594         if (f->fd < 0) {
2595                 r = -errno;
2596                 goto fail;
2597         }
2598
2599         r = journal_file_fstat(f);
2600         if (r < 0)
2601                 goto fail;
2602
2603         if (f->last_stat.st_size == 0 && f->writable) {
2604
2605                 /* Before we write anything, turn off COW logic. Given
2606                  * our write pattern that is quite unfriendly to COW
2607                  * file systems this should greatly improve
2608                  * performance on COW file systems, such as btrfs, at
2609                  * the expense of data integrity features (which
2610                  * shouldn't be too bad, given that we do our own
2611                  * checksumming). */
2612                 r = chattr_fd(f->fd, true, FS_NOCOW_FL);
2613                 if (r < 0)
2614                         log_warning_errno(errno, "Failed to set file attributes: %m");
2615
2616                 /* Let's attach the creation time to the journal file,
2617                  * so that the vacuuming code knows the age of this
2618                  * file even if the file might end up corrupted one
2619                  * day... Ideally we'd just use the creation time many
2620                  * file systems maintain for each file, but there is
2621                  * currently no usable API to query this, hence let's
2622                  * emulate this via extended attributes. If extended
2623                  * attributes are not supported we'll just skip this,
2624                  * and rely solely on mtime/atime/ctime of the file. */
2625
2626                 fd_setcrtime(f->fd, 0);
2627
2628 #ifdef HAVE_GCRYPT
2629                 /* Try to load the FSPRG state, and if we can't, then
2630                  * just don't do sealing */
2631                 if (f->seal) {
2632                         r = journal_file_fss_load(f);
2633                         if (r < 0)
2634                                 f->seal = false;
2635                 }
2636 #endif
2637
2638                 r = journal_file_init_header(f, template);
2639                 if (r < 0)
2640                         goto fail;
2641
2642                 r = journal_file_fstat(f);
2643                 if (r < 0)
2644                         goto fail;
2645
2646                 newly_created = true;
2647         }
2648
2649         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2650                 r = -EIO;
2651                 goto fail;
2652         }
2653
2654         r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2655         if (r < 0) {
2656                 r = -errno;
2657                 goto fail;
2658         }
2659
2660         f->header = h;
2661
2662         if (!newly_created) {
2663                 r = journal_file_verify_header(f);
2664                 if (r < 0)
2665                         goto fail;
2666         }
2667
2668 #ifdef HAVE_GCRYPT
2669         if (!newly_created && f->writable) {
2670                 r = journal_file_fss_load(f);
2671                 if (r < 0)
2672                         goto fail;
2673         }
2674 #endif
2675
2676         if (f->writable) {
2677                 if (metrics) {
2678                         journal_default_metrics(metrics, f->fd);
2679                         f->metrics = *metrics;
2680                 } else if (template)
2681                         f->metrics = template->metrics;
2682
2683                 r = journal_file_refresh_header(f);
2684                 if (r < 0)
2685                         goto fail;
2686         }
2687
2688 #ifdef HAVE_GCRYPT
2689         r = journal_file_hmac_setup(f);
2690         if (r < 0)
2691                 goto fail;
2692 #endif
2693
2694         if (newly_created) {
2695                 r = journal_file_setup_field_hash_table(f);
2696                 if (r < 0)
2697                         goto fail;
2698
2699                 r = journal_file_setup_data_hash_table(f);
2700                 if (r < 0)
2701                         goto fail;
2702
2703 #ifdef HAVE_GCRYPT
2704                 r = journal_file_append_first_tag(f);
2705                 if (r < 0)
2706                         goto fail;
2707 #endif
2708         }
2709
2710         r = journal_file_map_field_hash_table(f);
2711         if (r < 0)
2712                 goto fail;
2713
2714         r = journal_file_map_data_hash_table(f);
2715         if (r < 0)
2716                 goto fail;
2717
2718         if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2719                 r = -EIO;
2720                 goto fail;
2721         }
2722
2723         *ret = f;
2724         return 0;
2725
2726 fail:
2727         if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2728                 r = -EIO;
2729
2730         journal_file_close(f);
2731
2732         return r;
2733 }
2734
2735 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2736         _cleanup_free_ char *p = NULL;
2737         size_t l;
2738         JournalFile *old_file, *new_file = NULL;
2739         int r;
2740
2741         assert(f);
2742         assert(*f);
2743
2744         old_file = *f;
2745
2746         if (!old_file->writable)
2747                 return -EINVAL;
2748
2749         if (!endswith(old_file->path, ".journal"))
2750                 return -EINVAL;
2751
2752         l = strlen(old_file->path);
2753         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2754                      (int) l - 8, old_file->path,
2755                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2756                      le64toh((*f)->header->head_entry_seqnum),
2757                      le64toh((*f)->header->head_entry_realtime));
2758         if (r < 0)
2759                 return -ENOMEM;
2760
2761         /* Try to rename the file to the archived version. If the file
2762          * already was deleted, we'll get ENOENT, let's ignore that
2763          * case. */
2764         r = rename(old_file->path, p);
2765         if (r < 0 && errno != ENOENT)
2766                 return -errno;
2767
2768         old_file->header->state = STATE_ARCHIVED;
2769
2770         /* Currently, btrfs is not very good with out write patterns
2771          * and fragments heavily. Let's defrag our journal files when
2772          * we archive them */
2773         old_file->defrag_on_close = true;
2774
2775         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2776         journal_file_close(old_file);
2777
2778         *f = new_file;
2779         return r;
2780 }
2781
2782 int journal_file_open_reliably(
2783                 const char *fname,
2784                 int flags,
2785                 mode_t mode,
2786                 bool compress,
2787                 bool seal,
2788                 JournalMetrics *metrics,
2789                 MMapCache *mmap_cache,
2790                 JournalFile *template,
2791                 JournalFile **ret) {
2792
2793         int r;
2794         size_t l;
2795         _cleanup_free_ char *p = NULL;
2796
2797         r = journal_file_open(fname, flags, mode, compress, seal,
2798                               metrics, mmap_cache, template, ret);
2799         if (r != -EBADMSG && /* corrupted */
2800             r != -ENODATA && /* truncated */
2801             r != -EHOSTDOWN && /* other machine */
2802             r != -EPROTONOSUPPORT && /* incompatible feature */
2803             r != -EBUSY && /* unclean shutdown */
2804             r != -ESHUTDOWN && /* already archived */
2805             r != -EIO && /* IO error, including SIGBUS on mmap */
2806             r != -EIDRM /* File has been deleted */)
2807                 return r;
2808
2809         if ((flags & O_ACCMODE) == O_RDONLY)
2810                 return r;
2811
2812         if (!(flags & O_CREAT))
2813                 return r;
2814
2815         if (!endswith(fname, ".journal"))
2816                 return r;
2817
2818         /* The file is corrupted. Rotate it away and try it again (but only once) */
2819
2820         l = strlen(fname);
2821         if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2822                      (int) l - 8, fname,
2823                      (unsigned long long) now(CLOCK_REALTIME),
2824                      random_u64()) < 0)
2825                 return -ENOMEM;
2826
2827         r = rename(fname, p);
2828         if (r < 0)
2829                 return -errno;
2830
2831         /* btrfs doesn't cope well with our write pattern and
2832          * fragments heavily. Let's defrag all files we rotate */
2833
2834         (void) chattr_path(p, false, FS_NOCOW_FL);
2835         (void) btrfs_defrag(p);
2836
2837         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2838
2839         return journal_file_open(fname, flags, mode, compress, seal,
2840                                  metrics, mmap_cache, template, ret);
2841 }
2842
2843 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2844         uint64_t i, n;
2845         uint64_t q, xor_hash = 0;
2846         int r;
2847         EntryItem *items;
2848         dual_timestamp ts;
2849
2850         assert(from);
2851         assert(to);
2852         assert(o);
2853         assert(p);
2854
2855         if (!to->writable)
2856                 return -EPERM;
2857
2858         ts.monotonic = le64toh(o->entry.monotonic);
2859         ts.realtime = le64toh(o->entry.realtime);
2860
2861         n = journal_file_entry_n_items(o);
2862         /* alloca() can't take 0, hence let's allocate at least one */
2863         items = alloca(sizeof(EntryItem) * MAX(1u, n));
2864
2865         for (i = 0; i < n; i++) {
2866                 uint64_t l, h;
2867                 le64_t le_hash;
2868                 size_t t;
2869                 void *data;
2870                 Object *u;
2871
2872                 q = le64toh(o->entry.items[i].object_offset);
2873                 le_hash = o->entry.items[i].hash;
2874
2875                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2876                 if (r < 0)
2877                         return r;
2878
2879                 if (le_hash != o->data.hash)
2880                         return -EBADMSG;
2881
2882                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2883                 t = (size_t) l;
2884
2885                 /* We hit the limit on 32bit machines */
2886                 if ((uint64_t) t != l)
2887                         return -E2BIG;
2888
2889                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2890 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2891                         size_t rsize;
2892
2893                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2894                                             o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2895                         if (r < 0)
2896                                 return r;
2897
2898                         data = from->compress_buffer;
2899                         l = rsize;
2900 #else
2901                         return -EPROTONOSUPPORT;
2902 #endif
2903                 } else
2904                         data = o->data.payload;
2905
2906                 r = journal_file_append_data(to, data, l, &u, &h);
2907                 if (r < 0)
2908                         return r;
2909
2910                 xor_hash ^= le64toh(u->data.hash);
2911                 items[i].object_offset = htole64(h);
2912                 items[i].hash = u->data.hash;
2913
2914                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2915                 if (r < 0)
2916                         return r;
2917         }
2918
2919         r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2920
2921         if (mmap_cache_got_sigbus(to->mmap, to->fd))
2922                 return -EIO;
2923
2924         return r;
2925 }
2926
2927 void journal_default_metrics(JournalMetrics *m, int fd) {
2928         uint64_t fs_size = 0;
2929         struct statvfs ss;
2930         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2931
2932         assert(m);
2933         assert(fd >= 0);
2934
2935         if (fstatvfs(fd, &ss) >= 0)
2936                 fs_size = ss.f_frsize * ss.f_blocks;
2937
2938         if (m->max_use == (uint64_t) -1) {
2939
2940                 if (fs_size > 0) {
2941                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2942
2943                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2944                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2945
2946                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2947                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2948                 } else
2949                         m->max_use = DEFAULT_MAX_USE_LOWER;
2950         } else {
2951                 m->max_use = PAGE_ALIGN(m->max_use);
2952
2953                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2954                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2955         }
2956
2957         if (m->max_size == (uint64_t) -1) {
2958                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2959
2960                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2961                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2962         } else
2963                 m->max_size = PAGE_ALIGN(m->max_size);
2964
2965         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2966                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2967
2968         if (m->max_size*2 > m->max_use)
2969                 m->max_use = m->max_size*2;
2970
2971         if (m->min_size == (uint64_t) -1)
2972                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2973         else {
2974                 m->min_size = PAGE_ALIGN(m->min_size);
2975
2976                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2977                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2978
2979                 if (m->min_size > m->max_size)
2980                         m->max_size = m->min_size;
2981         }
2982
2983         if (m->keep_free == (uint64_t) -1) {
2984
2985                 if (fs_size > 0) {
2986                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2987
2988                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2989                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2990
2991                 } else
2992                         m->keep_free = DEFAULT_KEEP_FREE;
2993         }
2994
2995         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2996                   format_bytes(a, sizeof(a), m->max_use),
2997                   format_bytes(b, sizeof(b), m->max_size),
2998                   format_bytes(c, sizeof(c), m->min_size),
2999                   format_bytes(d, sizeof(d), m->keep_free));
3000 }
3001
3002 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3003         assert(f);
3004         assert(from || to);
3005
3006         if (from) {
3007                 if (f->header->head_entry_realtime == 0)
3008                         return -ENOENT;
3009
3010                 *from = le64toh(f->header->head_entry_realtime);
3011         }
3012
3013         if (to) {
3014                 if (f->header->tail_entry_realtime == 0)
3015                         return -ENOENT;
3016
3017                 *to = le64toh(f->header->tail_entry_realtime);
3018         }
3019
3020         return 1;
3021 }
3022
3023 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3024         Object *o;
3025         uint64_t p;
3026         int r;
3027
3028         assert(f);
3029         assert(from || to);
3030
3031         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3032         if (r <= 0)
3033                 return r;
3034
3035         if (le64toh(o->data.n_entries) <= 0)
3036                 return 0;
3037
3038         if (from) {
3039                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3040                 if (r < 0)
3041                         return r;
3042
3043                 *from = le64toh(o->entry.monotonic);
3044         }
3045
3046         if (to) {
3047                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3048                 if (r < 0)
3049                         return r;
3050
3051                 r = generic_array_get_plus_one(f,
3052                                                le64toh(o->data.entry_offset),
3053                                                le64toh(o->data.entry_array_offset),
3054                                                le64toh(o->data.n_entries)-1,
3055                                                &o, NULL);
3056                 if (r <= 0)
3057                         return r;
3058
3059                 *to = le64toh(o->entry.monotonic);
3060         }
3061
3062         return 1;
3063 }
3064
3065 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3066         assert(f);
3067
3068         /* If we gained new header fields we gained new features,
3069          * hence suggest a rotation */
3070         if (le64toh(f->header->header_size) < sizeof(Header)) {
3071                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3072                 return true;
3073         }
3074
3075         /* Let's check if the hash tables grew over a certain fill
3076          * level (75%, borrowing this value from Java's hash table
3077          * implementation), and if so suggest a rotation. To calculate
3078          * the fill level we need the n_data field, which only exists
3079          * in newer versions. */
3080
3081         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3082                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3083                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3084                                   f->path,
3085                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3086                                   le64toh(f->header->n_data),
3087                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3088                                   (unsigned long long) f->last_stat.st_size,
3089                                   f->last_stat.st_size / le64toh(f->header->n_data));
3090                         return true;
3091                 }
3092
3093         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3094                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3095                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3096                                   f->path,
3097                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3098                                   le64toh(f->header->n_fields),
3099                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3100                         return true;
3101                 }
3102
3103         /* Are the data objects properly indexed by field objects? */
3104         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3105             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3106             le64toh(f->header->n_data) > 0 &&
3107             le64toh(f->header->n_fields) == 0)
3108                 return true;
3109
3110         if (max_file_usec > 0) {
3111                 usec_t t, h;
3112
3113                 h = le64toh(f->header->head_entry_realtime);
3114                 t = now(CLOCK_REALTIME);
3115
3116                 if (h > 0 && t > h + max_file_usec)
3117                         return true;
3118         }
3119
3120         return false;
3121 }