chiark / gitweb /
journal: consider file deletion errors a reason for rotation
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "btrfs-util.h"
31 #include "journal-def.h"
32 #include "journal-file.h"
33 #include "journal-authenticate.h"
34 #include "lookup3.h"
35 #include "compress.h"
36 #include "fsprg.h"
37
38 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
39 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
40
41 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
42
43 /* This is the minimum journal file size */
44 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL)           /* 4 MiB */
45
46 /* These are the lower and upper bounds if we deduce the max_use value
47  * from the file system size */
48 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
49 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
50
51 /* This is the upper bound if we deduce max_size from max_use */
52 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
53
54 /* This is the upper bound if we deduce the keep_free value from the
55  * file system size */
56 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
57
58 /* This is the keep_free value when we can't determine the system
59  * size */
60 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MB */
61
62 /* n_data was the first entry we added after the initial file format design */
63 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
64
65 /* How many entries to keep in the entry array chain cache at max */
66 #define CHAIN_CACHE_MAX 20
67
68 /* How much to increase the journal file size at once each time we allocate something new. */
69 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8MB */
70
71 /* Reread fstat() of the file for detecting deletions at least this often */
72 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
73
74 /* The mmap context to use for the header we pick as one above the last defined typed */
75 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
76
77 static int journal_file_set_online(JournalFile *f) {
78         assert(f);
79
80         if (!f->writable)
81                 return -EPERM;
82
83         if (!(f->fd >= 0 && f->header))
84                 return -EINVAL;
85
86         if (mmap_cache_got_sigbus(f->mmap, f->fd))
87                 return -EIO;
88
89         switch(f->header->state) {
90                 case STATE_ONLINE:
91                         return 0;
92
93                 case STATE_OFFLINE:
94                         f->header->state = STATE_ONLINE;
95                         fsync(f->fd);
96                         return 0;
97
98                 default:
99                         return -EINVAL;
100         }
101 }
102
103 int journal_file_set_offline(JournalFile *f) {
104         assert(f);
105
106         if (!f->writable)
107                 return -EPERM;
108
109         if (!(f->fd >= 0 && f->header))
110                 return -EINVAL;
111
112         if (f->header->state != STATE_ONLINE)
113                 return 0;
114
115         fsync(f->fd);
116
117         if (mmap_cache_got_sigbus(f->mmap, f->fd))
118                 return -EIO;
119
120         f->header->state = STATE_OFFLINE;
121
122         if (mmap_cache_got_sigbus(f->mmap, f->fd))
123                 return -EIO;
124
125         fsync(f->fd);
126
127         return 0;
128 }
129
130 void journal_file_close(JournalFile *f) {
131         assert(f);
132
133 #ifdef HAVE_GCRYPT
134         /* Write the final tag */
135         if (f->seal && f->writable)
136                 journal_file_append_tag(f);
137 #endif
138
139         journal_file_set_offline(f);
140
141         if (f->mmap && f->fd >= 0)
142                 mmap_cache_close_fd(f->mmap, f->fd);
143
144         if (f->fd >= 0 && f->defrag_on_close)
145                 btrfs_defrag_fd(f->fd);
146
147         safe_close(f->fd);
148         free(f->path);
149
150         if (f->mmap)
151                 mmap_cache_unref(f->mmap);
152
153         ordered_hashmap_free_free(f->chain_cache);
154
155 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
156         free(f->compress_buffer);
157 #endif
158
159 #ifdef HAVE_GCRYPT
160         if (f->fss_file)
161                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
162         else if (f->fsprg_state)
163                 free(f->fsprg_state);
164
165         free(f->fsprg_seed);
166
167         if (f->hmac)
168                 gcry_md_close(f->hmac);
169 #endif
170
171         free(f);
172 }
173
174 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
175         Header h = {};
176         ssize_t k;
177         int r;
178
179         assert(f);
180
181         memcpy(h.signature, HEADER_SIGNATURE, 8);
182         h.header_size = htole64(ALIGN64(sizeof(h)));
183
184         h.incompatible_flags |= htole32(
185                 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
186                 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
187
188         h.compatible_flags = htole32(
189                 f->seal * HEADER_COMPATIBLE_SEALED);
190
191         r = sd_id128_randomize(&h.file_id);
192         if (r < 0)
193                 return r;
194
195         if (template) {
196                 h.seqnum_id = template->header->seqnum_id;
197                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
198         } else
199                 h.seqnum_id = h.file_id;
200
201         k = pwrite(f->fd, &h, sizeof(h), 0);
202         if (k < 0)
203                 return -errno;
204
205         if (k != sizeof(h))
206                 return -EIO;
207
208         return 0;
209 }
210
211 static int journal_file_refresh_header(JournalFile *f) {
212         sd_id128_t boot_id;
213         int r;
214
215         assert(f);
216
217         r = sd_id128_get_machine(&f->header->machine_id);
218         if (r < 0)
219                 return r;
220
221         r = sd_id128_get_boot(&boot_id);
222         if (r < 0)
223                 return r;
224
225         if (sd_id128_equal(boot_id, f->header->boot_id))
226                 f->tail_entry_monotonic_valid = true;
227
228         f->header->boot_id = boot_id;
229
230         r = journal_file_set_online(f);
231
232         /* Sync the online state to disk */
233         fsync(f->fd);
234
235         return r;
236 }
237
238 static int journal_file_verify_header(JournalFile *f) {
239         uint32_t flags;
240
241         assert(f);
242
243         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
244                 return -EBADMSG;
245
246         /* In both read and write mode we refuse to open files with
247          * incompatible flags we don't know */
248         flags = le32toh(f->header->incompatible_flags);
249         if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
250                 if (flags & ~HEADER_INCOMPATIBLE_ANY)
251                         log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
252                                   f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
253                 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
254                 if (flags)
255                         log_debug("Journal file %s uses incompatible flags %"PRIx32
256                                   " disabled at compilation time.", f->path, flags);
257                 return -EPROTONOSUPPORT;
258         }
259
260         /* When open for writing we refuse to open files with
261          * compatible flags, too */
262         flags = le32toh(f->header->compatible_flags);
263         if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
264                 if (flags & ~HEADER_COMPATIBLE_ANY)
265                         log_debug("Journal file %s has unknown compatible flags %"PRIx32,
266                                   f->path, flags & ~HEADER_COMPATIBLE_ANY);
267                 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
268                 if (flags)
269                         log_debug("Journal file %s uses compatible flags %"PRIx32
270                                   " disabled at compilation time.", f->path, flags);
271                 return -EPROTONOSUPPORT;
272         }
273
274         if (f->header->state >= _STATE_MAX)
275                 return -EBADMSG;
276
277         /* The first addition was n_data, so check that we are at least this large */
278         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
279                 return -EBADMSG;
280
281         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
282                 return -EBADMSG;
283
284         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
285                 return -ENODATA;
286
287         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
288                 return -ENODATA;
289
290         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
291             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
292             !VALID64(le64toh(f->header->tail_object_offset)) ||
293             !VALID64(le64toh(f->header->entry_array_offset)))
294                 return -ENODATA;
295
296         if (f->writable) {
297                 uint8_t state;
298                 sd_id128_t machine_id;
299                 int r;
300
301                 r = sd_id128_get_machine(&machine_id);
302                 if (r < 0)
303                         return r;
304
305                 if (!sd_id128_equal(machine_id, f->header->machine_id))
306                         return -EHOSTDOWN;
307
308                 state = f->header->state;
309
310                 if (state == STATE_ONLINE) {
311                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
312                         return -EBUSY;
313                 } else if (state == STATE_ARCHIVED)
314                         return -ESHUTDOWN;
315                 else if (state != STATE_OFFLINE) {
316                         log_debug("Journal file %s has unknown state %u.", f->path, state);
317                         return -EBUSY;
318                 }
319         }
320
321         f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
322         f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
323
324         f->seal = JOURNAL_HEADER_SEALED(f->header);
325
326         return 0;
327 }
328
329 static int journal_file_fstat(JournalFile *f) {
330         assert(f);
331         assert(f->fd >= 0);
332
333         if (fstat(f->fd, &f->last_stat) < 0)
334                 return -errno;
335
336         f->last_stat_usec = now(CLOCK_MONOTONIC);
337
338         /* Refuse appending to files that are already deleted */
339         if (f->last_stat.st_nlink <= 0)
340                 return -EIDRM;
341
342         return 0;
343 }
344
345 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
346         uint64_t old_size, new_size;
347         int r;
348
349         assert(f);
350
351         /* We assume that this file is not sparse, and we know that
352          * for sure, since we always call posix_fallocate()
353          * ourselves */
354
355         if (mmap_cache_got_sigbus(f->mmap, f->fd))
356                 return -EIO;
357
358         old_size =
359                 le64toh(f->header->header_size) +
360                 le64toh(f->header->arena_size);
361
362         new_size = PAGE_ALIGN(offset + size);
363         if (new_size < le64toh(f->header->header_size))
364                 new_size = le64toh(f->header->header_size);
365
366         if (new_size <= old_size) {
367
368                 /* We already pre-allocated enough space, but before
369                  * we write to it, let's check with fstat() if the
370                  * file got deleted, in order make sure we don't throw
371                  * away the data immediately. Don't check fstat() for
372                  * all writes though, but only once ever 10s. */
373
374                 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
375                         return 0;
376
377                 return journal_file_fstat(f);
378         }
379
380         /* Allocate more space. */
381
382         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
383                 return -E2BIG;
384
385         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
386                 struct statvfs svfs;
387
388                 if (fstatvfs(f->fd, &svfs) >= 0) {
389                         uint64_t available;
390
391                         available = svfs.f_bfree * svfs.f_bsize;
392
393                         if (available >= f->metrics.keep_free)
394                                 available -= f->metrics.keep_free;
395                         else
396                                 available = 0;
397
398                         if (new_size - old_size > available)
399                                 return -E2BIG;
400                 }
401         }
402
403         /* Increase by larger blocks at once */
404         new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
405         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
406                 new_size = f->metrics.max_size;
407
408         /* Note that the glibc fallocate() fallback is very
409            inefficient, hence we try to minimize the allocation area
410            as we can. */
411         r = posix_fallocate(f->fd, old_size, new_size - old_size);
412         if (r != 0)
413                 return -r;
414
415         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
416
417         return journal_file_fstat(f);
418 }
419
420 static unsigned type_to_context(ObjectType type) {
421         /* One context for each type, plus one catch-all for the rest */
422         assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
423         assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
424         return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
425 }
426
427 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
428         int r;
429
430         assert(f);
431         assert(ret);
432
433         if (size <= 0)
434                 return -EINVAL;
435
436         /* Avoid SIGBUS on invalid accesses */
437         if (offset + size > (uint64_t) f->last_stat.st_size) {
438                 /* Hmm, out of range? Let's refresh the fstat() data
439                  * first, before we trust that check. */
440
441                 r = journal_file_fstat(f);
442                 if (r < 0)
443                         return r;
444
445                 if (offset + size > (uint64_t) f->last_stat.st_size)
446                         return -EADDRNOTAVAIL;
447         }
448
449         return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
450 }
451
452 static uint64_t minimum_header_size(Object *o) {
453
454         static const uint64_t table[] = {
455                 [OBJECT_DATA] = sizeof(DataObject),
456                 [OBJECT_FIELD] = sizeof(FieldObject),
457                 [OBJECT_ENTRY] = sizeof(EntryObject),
458                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
459                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
460                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
461                 [OBJECT_TAG] = sizeof(TagObject),
462         };
463
464         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
465                 return sizeof(ObjectHeader);
466
467         return table[o->object.type];
468 }
469
470 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
471         int r;
472         void *t;
473         Object *o;
474         uint64_t s;
475
476         assert(f);
477         assert(ret);
478
479         /* Objects may only be located at multiple of 64 bit */
480         if (!VALID64(offset))
481                 return -EFAULT;
482
483         r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
484         if (r < 0)
485                 return r;
486
487         o = (Object*) t;
488         s = le64toh(o->object.size);
489
490         if (s < sizeof(ObjectHeader))
491                 return -EBADMSG;
492
493         if (o->object.type <= OBJECT_UNUSED)
494                 return -EBADMSG;
495
496         if (s < minimum_header_size(o))
497                 return -EBADMSG;
498
499         if (type > OBJECT_UNUSED && o->object.type != type)
500                 return -EBADMSG;
501
502         if (s > sizeof(ObjectHeader)) {
503                 r = journal_file_move_to(f, type, false, offset, s, &t);
504                 if (r < 0)
505                         return r;
506
507                 o = (Object*) t;
508         }
509
510         *ret = o;
511         return 0;
512 }
513
514 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
515         uint64_t r;
516
517         assert(f);
518
519         r = le64toh(f->header->tail_entry_seqnum) + 1;
520
521         if (seqnum) {
522                 /* If an external seqnum counter was passed, we update
523                  * both the local and the external one, and set it to
524                  * the maximum of both */
525
526                 if (*seqnum + 1 > r)
527                         r = *seqnum + 1;
528
529                 *seqnum = r;
530         }
531
532         f->header->tail_entry_seqnum = htole64(r);
533
534         if (f->header->head_entry_seqnum == 0)
535                 f->header->head_entry_seqnum = htole64(r);
536
537         return r;
538 }
539
540 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
541         int r;
542         uint64_t p;
543         Object *tail, *o;
544         void *t;
545
546         assert(f);
547         assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
548         assert(size >= sizeof(ObjectHeader));
549         assert(offset);
550         assert(ret);
551
552         r = journal_file_set_online(f);
553         if (r < 0)
554                 return r;
555
556         p = le64toh(f->header->tail_object_offset);
557         if (p == 0)
558                 p = le64toh(f->header->header_size);
559         else {
560                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
561                 if (r < 0)
562                         return r;
563
564                 p += ALIGN64(le64toh(tail->object.size));
565         }
566
567         r = journal_file_allocate(f, p, size);
568         if (r < 0)
569                 return r;
570
571         r = journal_file_move_to(f, type, false, p, size, &t);
572         if (r < 0)
573                 return r;
574
575         o = (Object*) t;
576
577         zero(o->object);
578         o->object.type = type;
579         o->object.size = htole64(size);
580
581         f->header->tail_object_offset = htole64(p);
582         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
583
584         *ret = o;
585         *offset = p;
586
587         return 0;
588 }
589
590 static int journal_file_setup_data_hash_table(JournalFile *f) {
591         uint64_t s, p;
592         Object *o;
593         int r;
594
595         assert(f);
596
597         /* We estimate that we need 1 hash table entry per 768 of
598            journal file and we want to make sure we never get beyond
599            75% fill level. Calculate the hash table size for the
600            maximum file size based on these metrics. */
601
602         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
603         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
604                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
605
606         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
607
608         r = journal_file_append_object(f,
609                                        OBJECT_DATA_HASH_TABLE,
610                                        offsetof(Object, hash_table.items) + s,
611                                        &o, &p);
612         if (r < 0)
613                 return r;
614
615         memzero(o->hash_table.items, s);
616
617         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
618         f->header->data_hash_table_size = htole64(s);
619
620         return 0;
621 }
622
623 static int journal_file_setup_field_hash_table(JournalFile *f) {
624         uint64_t s, p;
625         Object *o;
626         int r;
627
628         assert(f);
629
630         /* We use a fixed size hash table for the fields as this
631          * number should grow very slowly only */
632
633         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
634         r = journal_file_append_object(f,
635                                        OBJECT_FIELD_HASH_TABLE,
636                                        offsetof(Object, hash_table.items) + s,
637                                        &o, &p);
638         if (r < 0)
639                 return r;
640
641         memzero(o->hash_table.items, s);
642
643         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
644         f->header->field_hash_table_size = htole64(s);
645
646         return 0;
647 }
648
649 static int journal_file_map_data_hash_table(JournalFile *f) {
650         uint64_t s, p;
651         void *t;
652         int r;
653
654         assert(f);
655
656         p = le64toh(f->header->data_hash_table_offset);
657         s = le64toh(f->header->data_hash_table_size);
658
659         r = journal_file_move_to(f,
660                                  OBJECT_DATA_HASH_TABLE,
661                                  true,
662                                  p, s,
663                                  &t);
664         if (r < 0)
665                 return r;
666
667         f->data_hash_table = t;
668         return 0;
669 }
670
671 static int journal_file_map_field_hash_table(JournalFile *f) {
672         uint64_t s, p;
673         void *t;
674         int r;
675
676         assert(f);
677
678         p = le64toh(f->header->field_hash_table_offset);
679         s = le64toh(f->header->field_hash_table_size);
680
681         r = journal_file_move_to(f,
682                                  OBJECT_FIELD_HASH_TABLE,
683                                  true,
684                                  p, s,
685                                  &t);
686         if (r < 0)
687                 return r;
688
689         f->field_hash_table = t;
690         return 0;
691 }
692
693 static int journal_file_link_field(
694                 JournalFile *f,
695                 Object *o,
696                 uint64_t offset,
697                 uint64_t hash) {
698
699         uint64_t p, h, m;
700         int r;
701
702         assert(f);
703         assert(o);
704         assert(offset > 0);
705
706         if (o->object.type != OBJECT_FIELD)
707                 return -EINVAL;
708
709         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
710         if (m <= 0)
711                 return -EBADMSG;
712
713         /* This might alter the window we are looking at */
714         o->field.next_hash_offset = o->field.head_data_offset = 0;
715
716         h = hash % m;
717         p = le64toh(f->field_hash_table[h].tail_hash_offset);
718         if (p == 0)
719                 f->field_hash_table[h].head_hash_offset = htole64(offset);
720         else {
721                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
722                 if (r < 0)
723                         return r;
724
725                 o->field.next_hash_offset = htole64(offset);
726         }
727
728         f->field_hash_table[h].tail_hash_offset = htole64(offset);
729
730         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
731                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
732
733         return 0;
734 }
735
736 static int journal_file_link_data(
737                 JournalFile *f,
738                 Object *o,
739                 uint64_t offset,
740                 uint64_t hash) {
741
742         uint64_t p, h, m;
743         int r;
744
745         assert(f);
746         assert(o);
747         assert(offset > 0);
748
749         if (o->object.type != OBJECT_DATA)
750                 return -EINVAL;
751
752         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
753         if (m <= 0)
754                 return -EBADMSG;
755
756         /* This might alter the window we are looking at */
757         o->data.next_hash_offset = o->data.next_field_offset = 0;
758         o->data.entry_offset = o->data.entry_array_offset = 0;
759         o->data.n_entries = 0;
760
761         h = hash % m;
762         p = le64toh(f->data_hash_table[h].tail_hash_offset);
763         if (p == 0)
764                 /* Only entry in the hash table is easy */
765                 f->data_hash_table[h].head_hash_offset = htole64(offset);
766         else {
767                 /* Move back to the previous data object, to patch in
768                  * pointer */
769
770                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
771                 if (r < 0)
772                         return r;
773
774                 o->data.next_hash_offset = htole64(offset);
775         }
776
777         f->data_hash_table[h].tail_hash_offset = htole64(offset);
778
779         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
780                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
781
782         return 0;
783 }
784
785 int journal_file_find_field_object_with_hash(
786                 JournalFile *f,
787                 const void *field, uint64_t size, uint64_t hash,
788                 Object **ret, uint64_t *offset) {
789
790         uint64_t p, osize, h, m;
791         int r;
792
793         assert(f);
794         assert(field && size > 0);
795
796         osize = offsetof(Object, field.payload) + size;
797
798         m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
799
800         if (m <= 0)
801                 return -EBADMSG;
802
803         h = hash % m;
804         p = le64toh(f->field_hash_table[h].head_hash_offset);
805
806         while (p > 0) {
807                 Object *o;
808
809                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
810                 if (r < 0)
811                         return r;
812
813                 if (le64toh(o->field.hash) == hash &&
814                     le64toh(o->object.size) == osize &&
815                     memcmp(o->field.payload, field, size) == 0) {
816
817                         if (ret)
818                                 *ret = o;
819                         if (offset)
820                                 *offset = p;
821
822                         return 1;
823                 }
824
825                 p = le64toh(o->field.next_hash_offset);
826         }
827
828         return 0;
829 }
830
831 int journal_file_find_field_object(
832                 JournalFile *f,
833                 const void *field, uint64_t size,
834                 Object **ret, uint64_t *offset) {
835
836         uint64_t hash;
837
838         assert(f);
839         assert(field && size > 0);
840
841         hash = hash64(field, size);
842
843         return journal_file_find_field_object_with_hash(f,
844                                                         field, size, hash,
845                                                         ret, offset);
846 }
847
848 int journal_file_find_data_object_with_hash(
849                 JournalFile *f,
850                 const void *data, uint64_t size, uint64_t hash,
851                 Object **ret, uint64_t *offset) {
852
853         uint64_t p, osize, h, m;
854         int r;
855
856         assert(f);
857         assert(data || size == 0);
858
859         osize = offsetof(Object, data.payload) + size;
860
861         m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
862         if (m <= 0)
863                 return -EBADMSG;
864
865         h = hash % m;
866         p = le64toh(f->data_hash_table[h].head_hash_offset);
867
868         while (p > 0) {
869                 Object *o;
870
871                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
872                 if (r < 0)
873                         return r;
874
875                 if (le64toh(o->data.hash) != hash)
876                         goto next;
877
878                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
879 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
880                         uint64_t l;
881                         size_t rsize;
882
883                         l = le64toh(o->object.size);
884                         if (l <= offsetof(Object, data.payload))
885                                 return -EBADMSG;
886
887                         l -= offsetof(Object, data.payload);
888
889                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
890                                             o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
891                         if (r < 0)
892                                 return r;
893
894                         if (rsize == size &&
895                             memcmp(f->compress_buffer, data, size) == 0) {
896
897                                 if (ret)
898                                         *ret = o;
899
900                                 if (offset)
901                                         *offset = p;
902
903                                 return 1;
904                         }
905 #else
906                         return -EPROTONOSUPPORT;
907 #endif
908                 } else if (le64toh(o->object.size) == osize &&
909                            memcmp(o->data.payload, data, size) == 0) {
910
911                         if (ret)
912                                 *ret = o;
913
914                         if (offset)
915                                 *offset = p;
916
917                         return 1;
918                 }
919
920         next:
921                 p = le64toh(o->data.next_hash_offset);
922         }
923
924         return 0;
925 }
926
927 int journal_file_find_data_object(
928                 JournalFile *f,
929                 const void *data, uint64_t size,
930                 Object **ret, uint64_t *offset) {
931
932         uint64_t hash;
933
934         assert(f);
935         assert(data || size == 0);
936
937         hash = hash64(data, size);
938
939         return journal_file_find_data_object_with_hash(f,
940                                                        data, size, hash,
941                                                        ret, offset);
942 }
943
944 static int journal_file_append_field(
945                 JournalFile *f,
946                 const void *field, uint64_t size,
947                 Object **ret, uint64_t *offset) {
948
949         uint64_t hash, p;
950         uint64_t osize;
951         Object *o;
952         int r;
953
954         assert(f);
955         assert(field && size > 0);
956
957         hash = hash64(field, size);
958
959         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
960         if (r < 0)
961                 return r;
962         else if (r > 0) {
963
964                 if (ret)
965                         *ret = o;
966
967                 if (offset)
968                         *offset = p;
969
970                 return 0;
971         }
972
973         osize = offsetof(Object, field.payload) + size;
974         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
975         if (r < 0)
976                 return r;
977
978         o->field.hash = htole64(hash);
979         memcpy(o->field.payload, field, size);
980
981         r = journal_file_link_field(f, o, p, hash);
982         if (r < 0)
983                 return r;
984
985         /* The linking might have altered the window, so let's
986          * refresh our pointer */
987         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
988         if (r < 0)
989                 return r;
990
991 #ifdef HAVE_GCRYPT
992         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
993         if (r < 0)
994                 return r;
995 #endif
996
997         if (ret)
998                 *ret = o;
999
1000         if (offset)
1001                 *offset = p;
1002
1003         return 0;
1004 }
1005
1006 static int journal_file_append_data(
1007                 JournalFile *f,
1008                 const void *data, uint64_t size,
1009                 Object **ret, uint64_t *offset) {
1010
1011         uint64_t hash, p;
1012         uint64_t osize;
1013         Object *o;
1014         int r, compression = 0;
1015         const void *eq;
1016
1017         assert(f);
1018         assert(data || size == 0);
1019
1020         hash = hash64(data, size);
1021
1022         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1023         if (r < 0)
1024                 return r;
1025         else if (r > 0) {
1026
1027                 if (ret)
1028                         *ret = o;
1029
1030                 if (offset)
1031                         *offset = p;
1032
1033                 return 0;
1034         }
1035
1036         osize = offsetof(Object, data.payload) + size;
1037         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1038         if (r < 0)
1039                 return r;
1040
1041         o->data.hash = htole64(hash);
1042
1043 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1044         if (f->compress_xz &&
1045             size >= COMPRESSION_SIZE_THRESHOLD) {
1046                 size_t rsize;
1047
1048                 compression = compress_blob(data, size, o->data.payload, &rsize);
1049
1050                 if (compression) {
1051                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1052                         o->object.flags |= compression;
1053
1054                         log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1055                                   size, rsize, object_compressed_to_string(compression));
1056                 }
1057         }
1058 #endif
1059
1060         if (!compression && size > 0)
1061                 memcpy(o->data.payload, data, size);
1062
1063         r = journal_file_link_data(f, o, p, hash);
1064         if (r < 0)
1065                 return r;
1066
1067         /* The linking might have altered the window, so let's
1068          * refresh our pointer */
1069         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1070         if (r < 0)
1071                 return r;
1072
1073         if (!data)
1074                 eq = NULL;
1075         else
1076                 eq = memchr(data, '=', size);
1077         if (eq && eq > data) {
1078                 Object *fo = NULL;
1079                 uint64_t fp;
1080
1081                 /* Create field object ... */
1082                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1083                 if (r < 0)
1084                         return r;
1085
1086                 /* ... and link it in. */
1087                 o->data.next_field_offset = fo->field.head_data_offset;
1088                 fo->field.head_data_offset = le64toh(p);
1089         }
1090
1091 #ifdef HAVE_GCRYPT
1092         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1093         if (r < 0)
1094                 return r;
1095 #endif
1096
1097         if (ret)
1098                 *ret = o;
1099
1100         if (offset)
1101                 *offset = p;
1102
1103         return 0;
1104 }
1105
1106 uint64_t journal_file_entry_n_items(Object *o) {
1107         assert(o);
1108
1109         if (o->object.type != OBJECT_ENTRY)
1110                 return 0;
1111
1112         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1113 }
1114
1115 uint64_t journal_file_entry_array_n_items(Object *o) {
1116         assert(o);
1117
1118         if (o->object.type != OBJECT_ENTRY_ARRAY)
1119                 return 0;
1120
1121         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1122 }
1123
1124 uint64_t journal_file_hash_table_n_items(Object *o) {
1125         assert(o);
1126
1127         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1128             o->object.type != OBJECT_FIELD_HASH_TABLE)
1129                 return 0;
1130
1131         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1132 }
1133
1134 static int link_entry_into_array(JournalFile *f,
1135                                  le64_t *first,
1136                                  le64_t *idx,
1137                                  uint64_t p) {
1138         int r;
1139         uint64_t n = 0, ap = 0, q, i, a, hidx;
1140         Object *o;
1141
1142         assert(f);
1143         assert(first);
1144         assert(idx);
1145         assert(p > 0);
1146
1147         a = le64toh(*first);
1148         i = hidx = le64toh(*idx);
1149         while (a > 0) {
1150
1151                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1152                 if (r < 0)
1153                         return r;
1154
1155                 n = journal_file_entry_array_n_items(o);
1156                 if (i < n) {
1157                         o->entry_array.items[i] = htole64(p);
1158                         *idx = htole64(hidx + 1);
1159                         return 0;
1160                 }
1161
1162                 i -= n;
1163                 ap = a;
1164                 a = le64toh(o->entry_array.next_entry_array_offset);
1165         }
1166
1167         if (hidx > n)
1168                 n = (hidx+1) * 2;
1169         else
1170                 n = n * 2;
1171
1172         if (n < 4)
1173                 n = 4;
1174
1175         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1176                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1177                                        &o, &q);
1178         if (r < 0)
1179                 return r;
1180
1181 #ifdef HAVE_GCRYPT
1182         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1183         if (r < 0)
1184                 return r;
1185 #endif
1186
1187         o->entry_array.items[i] = htole64(p);
1188
1189         if (ap == 0)
1190                 *first = htole64(q);
1191         else {
1192                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1193                 if (r < 0)
1194                         return r;
1195
1196                 o->entry_array.next_entry_array_offset = htole64(q);
1197         }
1198
1199         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1200                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1201
1202         *idx = htole64(hidx + 1);
1203
1204         return 0;
1205 }
1206
1207 static int link_entry_into_array_plus_one(JournalFile *f,
1208                                           le64_t *extra,
1209                                           le64_t *first,
1210                                           le64_t *idx,
1211                                           uint64_t p) {
1212
1213         int r;
1214
1215         assert(f);
1216         assert(extra);
1217         assert(first);
1218         assert(idx);
1219         assert(p > 0);
1220
1221         if (*idx == 0)
1222                 *extra = htole64(p);
1223         else {
1224                 le64_t i;
1225
1226                 i = htole64(le64toh(*idx) - 1);
1227                 r = link_entry_into_array(f, first, &i, p);
1228                 if (r < 0)
1229                         return r;
1230         }
1231
1232         *idx = htole64(le64toh(*idx) + 1);
1233         return 0;
1234 }
1235
1236 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1237         uint64_t p;
1238         int r;
1239         assert(f);
1240         assert(o);
1241         assert(offset > 0);
1242
1243         p = le64toh(o->entry.items[i].object_offset);
1244         if (p == 0)
1245                 return -EINVAL;
1246
1247         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1248         if (r < 0)
1249                 return r;
1250
1251         return link_entry_into_array_plus_one(f,
1252                                               &o->data.entry_offset,
1253                                               &o->data.entry_array_offset,
1254                                               &o->data.n_entries,
1255                                               offset);
1256 }
1257
1258 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1259         uint64_t n, i;
1260         int r;
1261
1262         assert(f);
1263         assert(o);
1264         assert(offset > 0);
1265
1266         if (o->object.type != OBJECT_ENTRY)
1267                 return -EINVAL;
1268
1269         __sync_synchronize();
1270
1271         /* Link up the entry itself */
1272         r = link_entry_into_array(f,
1273                                   &f->header->entry_array_offset,
1274                                   &f->header->n_entries,
1275                                   offset);
1276         if (r < 0)
1277                 return r;
1278
1279         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1280
1281         if (f->header->head_entry_realtime == 0)
1282                 f->header->head_entry_realtime = o->entry.realtime;
1283
1284         f->header->tail_entry_realtime = o->entry.realtime;
1285         f->header->tail_entry_monotonic = o->entry.monotonic;
1286
1287         f->tail_entry_monotonic_valid = true;
1288
1289         /* Link up the items */
1290         n = journal_file_entry_n_items(o);
1291         for (i = 0; i < n; i++) {
1292                 r = journal_file_link_entry_item(f, o, offset, i);
1293                 if (r < 0)
1294                         return r;
1295         }
1296
1297         return 0;
1298 }
1299
1300 static int journal_file_append_entry_internal(
1301                 JournalFile *f,
1302                 const dual_timestamp *ts,
1303                 uint64_t xor_hash,
1304                 const EntryItem items[], unsigned n_items,
1305                 uint64_t *seqnum,
1306                 Object **ret, uint64_t *offset) {
1307         uint64_t np;
1308         uint64_t osize;
1309         Object *o;
1310         int r;
1311
1312         assert(f);
1313         assert(items || n_items == 0);
1314         assert(ts);
1315
1316         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1317
1318         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1319         if (r < 0)
1320                 return r;
1321
1322         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1323         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1324         o->entry.realtime = htole64(ts->realtime);
1325         o->entry.monotonic = htole64(ts->monotonic);
1326         o->entry.xor_hash = htole64(xor_hash);
1327         o->entry.boot_id = f->header->boot_id;
1328
1329 #ifdef HAVE_GCRYPT
1330         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1331         if (r < 0)
1332                 return r;
1333 #endif
1334
1335         r = journal_file_link_entry(f, o, np);
1336         if (r < 0)
1337                 return r;
1338
1339         if (ret)
1340                 *ret = o;
1341
1342         if (offset)
1343                 *offset = np;
1344
1345         return 0;
1346 }
1347
1348 void journal_file_post_change(JournalFile *f) {
1349         assert(f);
1350
1351         /* inotify() does not receive IN_MODIFY events from file
1352          * accesses done via mmap(). After each access we hence
1353          * trigger IN_MODIFY by truncating the journal file to its
1354          * current size which triggers IN_MODIFY. */
1355
1356         __sync_synchronize();
1357
1358         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1359                 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1360 }
1361
1362 static int entry_item_cmp(const void *_a, const void *_b) {
1363         const EntryItem *a = _a, *b = _b;
1364
1365         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1366                 return -1;
1367         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1368                 return 1;
1369         return 0;
1370 }
1371
1372 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1373         unsigned i;
1374         EntryItem *items;
1375         int r;
1376         uint64_t xor_hash = 0;
1377         struct dual_timestamp _ts;
1378
1379         assert(f);
1380         assert(iovec || n_iovec == 0);
1381
1382         if (!ts) {
1383                 dual_timestamp_get(&_ts);
1384                 ts = &_ts;
1385         }
1386
1387         if (f->tail_entry_monotonic_valid &&
1388             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1389                 return -EINVAL;
1390
1391 #ifdef HAVE_GCRYPT
1392         r = journal_file_maybe_append_tag(f, ts->realtime);
1393         if (r < 0)
1394                 return r;
1395 #endif
1396
1397         /* alloca() can't take 0, hence let's allocate at least one */
1398         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1399
1400         for (i = 0; i < n_iovec; i++) {
1401                 uint64_t p;
1402                 Object *o;
1403
1404                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1405                 if (r < 0)
1406                         return r;
1407
1408                 xor_hash ^= le64toh(o->data.hash);
1409                 items[i].object_offset = htole64(p);
1410                 items[i].hash = o->data.hash;
1411         }
1412
1413         /* Order by the position on disk, in order to improve seek
1414          * times for rotating media. */
1415         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1416
1417         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1418
1419         /* If the memory mapping triggered a SIGBUS then we return an
1420          * IO error and ignore the error code passed down to us, since
1421          * it is very likely just an effect of a nullified replacement
1422          * mapping page */
1423
1424         if (mmap_cache_got_sigbus(f->mmap, f->fd))
1425                 r = -EIO;
1426
1427         journal_file_post_change(f);
1428
1429         return r;
1430 }
1431
1432 typedef struct ChainCacheItem {
1433         uint64_t first; /* the array at the beginning of the chain */
1434         uint64_t array; /* the cached array */
1435         uint64_t begin; /* the first item in the cached array */
1436         uint64_t total; /* the total number of items in all arrays before this one in the chain */
1437         uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
1438 } ChainCacheItem;
1439
1440 static void chain_cache_put(
1441                 OrderedHashmap *h,
1442                 ChainCacheItem *ci,
1443                 uint64_t first,
1444                 uint64_t array,
1445                 uint64_t begin,
1446                 uint64_t total,
1447                 uint64_t last_index) {
1448
1449         if (!ci) {
1450                 /* If the chain item to cache for this chain is the
1451                  * first one it's not worth caching anything */
1452                 if (array == first)
1453                         return;
1454
1455                 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1456                         ci = ordered_hashmap_steal_first(h);
1457                         assert(ci);
1458                 } else {
1459                         ci = new(ChainCacheItem, 1);
1460                         if (!ci)
1461                                 return;
1462                 }
1463
1464                 ci->first = first;
1465
1466                 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1467                         free(ci);
1468                         return;
1469                 }
1470         } else
1471                 assert(ci->first == first);
1472
1473         ci->array = array;
1474         ci->begin = begin;
1475         ci->total = total;
1476         ci->last_index = last_index;
1477 }
1478
1479 static int generic_array_get(
1480                 JournalFile *f,
1481                 uint64_t first,
1482                 uint64_t i,
1483                 Object **ret, uint64_t *offset) {
1484
1485         Object *o;
1486         uint64_t p = 0, a, t = 0;
1487         int r;
1488         ChainCacheItem *ci;
1489
1490         assert(f);
1491
1492         a = first;
1493
1494         /* Try the chain cache first */
1495         ci = ordered_hashmap_get(f->chain_cache, &first);
1496         if (ci && i > ci->total) {
1497                 a = ci->array;
1498                 i -= ci->total;
1499                 t = ci->total;
1500         }
1501
1502         while (a > 0) {
1503                 uint64_t k;
1504
1505                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1506                 if (r < 0)
1507                         return r;
1508
1509                 k = journal_file_entry_array_n_items(o);
1510                 if (i < k) {
1511                         p = le64toh(o->entry_array.items[i]);
1512                         goto found;
1513                 }
1514
1515                 i -= k;
1516                 t += k;
1517                 a = le64toh(o->entry_array.next_entry_array_offset);
1518         }
1519
1520         return 0;
1521
1522 found:
1523         /* Let's cache this item for the next invocation */
1524         chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1525
1526         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1527         if (r < 0)
1528                 return r;
1529
1530         if (ret)
1531                 *ret = o;
1532
1533         if (offset)
1534                 *offset = p;
1535
1536         return 1;
1537 }
1538
1539 static int generic_array_get_plus_one(
1540                 JournalFile *f,
1541                 uint64_t extra,
1542                 uint64_t first,
1543                 uint64_t i,
1544                 Object **ret, uint64_t *offset) {
1545
1546         Object *o;
1547
1548         assert(f);
1549
1550         if (i == 0) {
1551                 int r;
1552
1553                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1554                 if (r < 0)
1555                         return r;
1556
1557                 if (ret)
1558                         *ret = o;
1559
1560                 if (offset)
1561                         *offset = extra;
1562
1563                 return 1;
1564         }
1565
1566         return generic_array_get(f, first, i-1, ret, offset);
1567 }
1568
1569 enum {
1570         TEST_FOUND,
1571         TEST_LEFT,
1572         TEST_RIGHT
1573 };
1574
1575 static int generic_array_bisect(
1576                 JournalFile *f,
1577                 uint64_t first,
1578                 uint64_t n,
1579                 uint64_t needle,
1580                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1581                 direction_t direction,
1582                 Object **ret,
1583                 uint64_t *offset,
1584                 uint64_t *idx) {
1585
1586         uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1587         bool subtract_one = false;
1588         Object *o, *array = NULL;
1589         int r;
1590         ChainCacheItem *ci;
1591
1592         assert(f);
1593         assert(test_object);
1594
1595         /* Start with the first array in the chain */
1596         a = first;
1597
1598         ci = ordered_hashmap_get(f->chain_cache, &first);
1599         if (ci && n > ci->total) {
1600                 /* Ah, we have iterated this bisection array chain
1601                  * previously! Let's see if we can skip ahead in the
1602                  * chain, as far as the last time. But we can't jump
1603                  * backwards in the chain, so let's check that
1604                  * first. */
1605
1606                 r = test_object(f, ci->begin, needle);
1607                 if (r < 0)
1608                         return r;
1609
1610                 if (r == TEST_LEFT) {
1611                         /* OK, what we are looking for is right of the
1612                          * begin of this EntryArray, so let's jump
1613                          * straight to previously cached array in the
1614                          * chain */
1615
1616                         a = ci->array;
1617                         n -= ci->total;
1618                         t = ci->total;
1619                         last_index = ci->last_index;
1620                 }
1621         }
1622
1623         while (a > 0) {
1624                 uint64_t left, right, k, lp;
1625
1626                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1627                 if (r < 0)
1628                         return r;
1629
1630                 k = journal_file_entry_array_n_items(array);
1631                 right = MIN(k, n);
1632                 if (right <= 0)
1633                         return 0;
1634
1635                 i = right - 1;
1636                 lp = p = le64toh(array->entry_array.items[i]);
1637                 if (p <= 0)
1638                         return -EBADMSG;
1639
1640                 r = test_object(f, p, needle);
1641                 if (r < 0)
1642                         return r;
1643
1644                 if (r == TEST_FOUND)
1645                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1646
1647                 if (r == TEST_RIGHT) {
1648                         left = 0;
1649                         right -= 1;
1650
1651                         if (last_index != (uint64_t) -1) {
1652                                 assert(last_index <= right);
1653
1654                                 /* If we cached the last index we
1655                                  * looked at, let's try to not to jump
1656                                  * too wildly around and see if we can
1657                                  * limit the range to look at early to
1658                                  * the immediate neighbors of the last
1659                                  * index we looked at. */
1660
1661                                 if (last_index > 0) {
1662                                         uint64_t x = last_index - 1;
1663
1664                                         p = le64toh(array->entry_array.items[x]);
1665                                         if (p <= 0)
1666                                                 return -EBADMSG;
1667
1668                                         r = test_object(f, p, needle);
1669                                         if (r < 0)
1670                                                 return r;
1671
1672                                         if (r == TEST_FOUND)
1673                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1674
1675                                         if (r == TEST_RIGHT)
1676                                                 right = x;
1677                                         else
1678                                                 left = x + 1;
1679                                 }
1680
1681                                 if (last_index < right) {
1682                                         uint64_t y = last_index + 1;
1683
1684                                         p = le64toh(array->entry_array.items[y]);
1685                                         if (p <= 0)
1686                                                 return -EBADMSG;
1687
1688                                         r = test_object(f, p, needle);
1689                                         if (r < 0)
1690                                                 return r;
1691
1692                                         if (r == TEST_FOUND)
1693                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1694
1695                                         if (r == TEST_RIGHT)
1696                                                 right = y;
1697                                         else
1698                                                 left = y + 1;
1699                                 }
1700                         }
1701
1702                         for (;;) {
1703                                 if (left == right) {
1704                                         if (direction == DIRECTION_UP)
1705                                                 subtract_one = true;
1706
1707                                         i = left;
1708                                         goto found;
1709                                 }
1710
1711                                 assert(left < right);
1712                                 i = (left + right) / 2;
1713
1714                                 p = le64toh(array->entry_array.items[i]);
1715                                 if (p <= 0)
1716                                         return -EBADMSG;
1717
1718                                 r = test_object(f, p, needle);
1719                                 if (r < 0)
1720                                         return r;
1721
1722                                 if (r == TEST_FOUND)
1723                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1724
1725                                 if (r == TEST_RIGHT)
1726                                         right = i;
1727                                 else
1728                                         left = i + 1;
1729                         }
1730                 }
1731
1732                 if (k >= n) {
1733                         if (direction == DIRECTION_UP) {
1734                                 i = n;
1735                                 subtract_one = true;
1736                                 goto found;
1737                         }
1738
1739                         return 0;
1740                 }
1741
1742                 last_p = lp;
1743
1744                 n -= k;
1745                 t += k;
1746                 last_index = (uint64_t) -1;
1747                 a = le64toh(array->entry_array.next_entry_array_offset);
1748         }
1749
1750         return 0;
1751
1752 found:
1753         if (subtract_one && t == 0 && i == 0)
1754                 return 0;
1755
1756         /* Let's cache this item for the next invocation */
1757         chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1758
1759         if (subtract_one && i == 0)
1760                 p = last_p;
1761         else if (subtract_one)
1762                 p = le64toh(array->entry_array.items[i-1]);
1763         else
1764                 p = le64toh(array->entry_array.items[i]);
1765
1766         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1767         if (r < 0)
1768                 return r;
1769
1770         if (ret)
1771                 *ret = o;
1772
1773         if (offset)
1774                 *offset = p;
1775
1776         if (idx)
1777                 *idx = t + i + (subtract_one ? -1 : 0);
1778
1779         return 1;
1780 }
1781
1782 static int generic_array_bisect_plus_one(
1783                 JournalFile *f,
1784                 uint64_t extra,
1785                 uint64_t first,
1786                 uint64_t n,
1787                 uint64_t needle,
1788                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1789                 direction_t direction,
1790                 Object **ret,
1791                 uint64_t *offset,
1792                 uint64_t *idx) {
1793
1794         int r;
1795         bool step_back = false;
1796         Object *o;
1797
1798         assert(f);
1799         assert(test_object);
1800
1801         if (n <= 0)
1802                 return 0;
1803
1804         /* This bisects the array in object 'first', but first checks
1805          * an extra  */
1806         r = test_object(f, extra, needle);
1807         if (r < 0)
1808                 return r;
1809
1810         if (r == TEST_FOUND)
1811                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1812
1813         /* if we are looking with DIRECTION_UP then we need to first
1814            see if in the actual array there is a matching entry, and
1815            return the last one of that. But if there isn't any we need
1816            to return this one. Hence remember this, and return it
1817            below. */
1818         if (r == TEST_LEFT)
1819                 step_back = direction == DIRECTION_UP;
1820
1821         if (r == TEST_RIGHT) {
1822                 if (direction == DIRECTION_DOWN)
1823                         goto found;
1824                 else
1825                         return 0;
1826         }
1827
1828         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1829
1830         if (r == 0 && step_back)
1831                 goto found;
1832
1833         if (r > 0 && idx)
1834                 (*idx) ++;
1835
1836         return r;
1837
1838 found:
1839         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1840         if (r < 0)
1841                 return r;
1842
1843         if (ret)
1844                 *ret = o;
1845
1846         if (offset)
1847                 *offset = extra;
1848
1849         if (idx)
1850                 *idx = 0;
1851
1852         return 1;
1853 }
1854
1855 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1856         assert(f);
1857         assert(p > 0);
1858
1859         if (p == needle)
1860                 return TEST_FOUND;
1861         else if (p < needle)
1862                 return TEST_LEFT;
1863         else
1864                 return TEST_RIGHT;
1865 }
1866
1867 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1868         Object *o;
1869         int r;
1870
1871         assert(f);
1872         assert(p > 0);
1873
1874         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1875         if (r < 0)
1876                 return r;
1877
1878         if (le64toh(o->entry.seqnum) == needle)
1879                 return TEST_FOUND;
1880         else if (le64toh(o->entry.seqnum) < needle)
1881                 return TEST_LEFT;
1882         else
1883                 return TEST_RIGHT;
1884 }
1885
1886 int journal_file_move_to_entry_by_seqnum(
1887                 JournalFile *f,
1888                 uint64_t seqnum,
1889                 direction_t direction,
1890                 Object **ret,
1891                 uint64_t *offset) {
1892
1893         return generic_array_bisect(f,
1894                                     le64toh(f->header->entry_array_offset),
1895                                     le64toh(f->header->n_entries),
1896                                     seqnum,
1897                                     test_object_seqnum,
1898                                     direction,
1899                                     ret, offset, NULL);
1900 }
1901
1902 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1903         Object *o;
1904         int r;
1905
1906         assert(f);
1907         assert(p > 0);
1908
1909         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1910         if (r < 0)
1911                 return r;
1912
1913         if (le64toh(o->entry.realtime) == needle)
1914                 return TEST_FOUND;
1915         else if (le64toh(o->entry.realtime) < needle)
1916                 return TEST_LEFT;
1917         else
1918                 return TEST_RIGHT;
1919 }
1920
1921 int journal_file_move_to_entry_by_realtime(
1922                 JournalFile *f,
1923                 uint64_t realtime,
1924                 direction_t direction,
1925                 Object **ret,
1926                 uint64_t *offset) {
1927
1928         return generic_array_bisect(f,
1929                                     le64toh(f->header->entry_array_offset),
1930                                     le64toh(f->header->n_entries),
1931                                     realtime,
1932                                     test_object_realtime,
1933                                     direction,
1934                                     ret, offset, NULL);
1935 }
1936
1937 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1938         Object *o;
1939         int r;
1940
1941         assert(f);
1942         assert(p > 0);
1943
1944         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1945         if (r < 0)
1946                 return r;
1947
1948         if (le64toh(o->entry.monotonic) == needle)
1949                 return TEST_FOUND;
1950         else if (le64toh(o->entry.monotonic) < needle)
1951                 return TEST_LEFT;
1952         else
1953                 return TEST_RIGHT;
1954 }
1955
1956 static inline int find_data_object_by_boot_id(
1957                 JournalFile *f,
1958                 sd_id128_t boot_id,
1959                 Object **o,
1960                 uint64_t *b) {
1961         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1962
1963         sd_id128_to_string(boot_id, t + 9);
1964         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1965 }
1966
1967 int journal_file_move_to_entry_by_monotonic(
1968                 JournalFile *f,
1969                 sd_id128_t boot_id,
1970                 uint64_t monotonic,
1971                 direction_t direction,
1972                 Object **ret,
1973                 uint64_t *offset) {
1974
1975         Object *o;
1976         int r;
1977
1978         assert(f);
1979
1980         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1981         if (r < 0)
1982                 return r;
1983         if (r == 0)
1984                 return -ENOENT;
1985
1986         return generic_array_bisect_plus_one(f,
1987                                              le64toh(o->data.entry_offset),
1988                                              le64toh(o->data.entry_array_offset),
1989                                              le64toh(o->data.n_entries),
1990                                              monotonic,
1991                                              test_object_monotonic,
1992                                              direction,
1993                                              ret, offset, NULL);
1994 }
1995
1996 void journal_file_reset_location(JournalFile *f) {
1997         f->location_type = LOCATION_HEAD;
1998         f->current_offset = 0;
1999         f->current_seqnum = 0;
2000         f->current_realtime = 0;
2001         f->current_monotonic = 0;
2002         zero(f->current_boot_id);
2003         f->current_xor_hash = 0;
2004 }
2005
2006 void journal_file_save_location(JournalFile *f, direction_t direction, Object *o, uint64_t offset) {
2007         f->last_direction = direction;
2008         f->location_type = LOCATION_SEEK;
2009         f->current_offset = offset;
2010         f->current_seqnum = le64toh(o->entry.seqnum);
2011         f->current_realtime = le64toh(o->entry.realtime);
2012         f->current_monotonic = le64toh(o->entry.monotonic);
2013         f->current_boot_id = o->entry.boot_id;
2014         f->current_xor_hash = le64toh(o->entry.xor_hash);
2015 }
2016
2017 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2018         assert(af);
2019         assert(bf);
2020         assert(af->location_type == LOCATION_SEEK);
2021         assert(bf->location_type == LOCATION_SEEK);
2022
2023         /* If contents and timestamps match, these entries are
2024          * identical, even if the seqnum does not match */
2025         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2026             af->current_monotonic == bf->current_monotonic &&
2027             af->current_realtime == bf->current_realtime &&
2028             af->current_xor_hash == bf->current_xor_hash)
2029                 return 0;
2030
2031         if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2032
2033                 /* If this is from the same seqnum source, compare
2034                  * seqnums */
2035                 if (af->current_seqnum < bf->current_seqnum)
2036                         return -1;
2037                 if (af->current_seqnum > bf->current_seqnum)
2038                         return 1;
2039
2040                 /* Wow! This is weird, different data but the same
2041                  * seqnums? Something is borked, but let's make the
2042                  * best of it and compare by time. */
2043         }
2044
2045         if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2046
2047                 /* If the boot id matches, compare monotonic time */
2048                 if (af->current_monotonic < bf->current_monotonic)
2049                         return -1;
2050                 if (af->current_monotonic > bf->current_monotonic)
2051                         return 1;
2052         }
2053
2054         /* Otherwise, compare UTC time */
2055         if (af->current_realtime < bf->current_realtime)
2056                 return -1;
2057         if (af->current_realtime > bf->current_realtime)
2058                 return 1;
2059
2060         /* Finally, compare by contents */
2061         if (af->current_xor_hash < bf->current_xor_hash)
2062                 return -1;
2063         if (af->current_xor_hash > bf->current_xor_hash)
2064                 return 1;
2065
2066         return 0;
2067 }
2068
2069 int journal_file_next_entry(
2070                 JournalFile *f,
2071                 uint64_t p,
2072                 direction_t direction,
2073                 Object **ret, uint64_t *offset) {
2074
2075         uint64_t i, n, ofs;
2076         int r;
2077
2078         assert(f);
2079
2080         n = le64toh(f->header->n_entries);
2081         if (n <= 0)
2082                 return 0;
2083
2084         if (p == 0)
2085                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2086         else {
2087                 r = generic_array_bisect(f,
2088                                          le64toh(f->header->entry_array_offset),
2089                                          le64toh(f->header->n_entries),
2090                                          p,
2091                                          test_object_offset,
2092                                          DIRECTION_DOWN,
2093                                          NULL, NULL,
2094                                          &i);
2095                 if (r <= 0)
2096                         return r;
2097
2098                 if (direction == DIRECTION_DOWN) {
2099                         if (i >= n - 1)
2100                                 return 0;
2101
2102                         i++;
2103                 } else {
2104                         if (i <= 0)
2105                                 return 0;
2106
2107                         i--;
2108                 }
2109         }
2110
2111         /* And jump to it */
2112         r = generic_array_get(f,
2113                               le64toh(f->header->entry_array_offset),
2114                               i,
2115                               ret, &ofs);
2116         if (r <= 0)
2117                 return r;
2118
2119         if (p > 0 &&
2120             (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2121                 log_debug("%s: entry array corrupted at entry %"PRIu64,
2122                           f->path, i);
2123                 return -EBADMSG;
2124         }
2125
2126         if (offset)
2127                 *offset = ofs;
2128
2129         return 1;
2130 }
2131
2132 int journal_file_next_entry_for_data(
2133                 JournalFile *f,
2134                 Object *o, uint64_t p,
2135                 uint64_t data_offset,
2136                 direction_t direction,
2137                 Object **ret, uint64_t *offset) {
2138
2139         uint64_t n, i;
2140         int r;
2141         Object *d;
2142
2143         assert(f);
2144         assert(p > 0 || !o);
2145
2146         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2147         if (r < 0)
2148                 return r;
2149
2150         n = le64toh(d->data.n_entries);
2151         if (n <= 0)
2152                 return n;
2153
2154         if (!o)
2155                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2156         else {
2157                 if (o->object.type != OBJECT_ENTRY)
2158                         return -EINVAL;
2159
2160                 r = generic_array_bisect_plus_one(f,
2161                                                   le64toh(d->data.entry_offset),
2162                                                   le64toh(d->data.entry_array_offset),
2163                                                   le64toh(d->data.n_entries),
2164                                                   p,
2165                                                   test_object_offset,
2166                                                   DIRECTION_DOWN,
2167                                                   NULL, NULL,
2168                                                   &i);
2169
2170                 if (r <= 0)
2171                         return r;
2172
2173                 if (direction == DIRECTION_DOWN) {
2174                         if (i >= n - 1)
2175                                 return 0;
2176
2177                         i++;
2178                 } else {
2179                         if (i <= 0)
2180                                 return 0;
2181
2182                         i--;
2183                 }
2184
2185         }
2186
2187         return generic_array_get_plus_one(f,
2188                                           le64toh(d->data.entry_offset),
2189                                           le64toh(d->data.entry_array_offset),
2190                                           i,
2191                                           ret, offset);
2192 }
2193
2194 int journal_file_move_to_entry_by_offset_for_data(
2195                 JournalFile *f,
2196                 uint64_t data_offset,
2197                 uint64_t p,
2198                 direction_t direction,
2199                 Object **ret, uint64_t *offset) {
2200
2201         int r;
2202         Object *d;
2203
2204         assert(f);
2205
2206         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2207         if (r < 0)
2208                 return r;
2209
2210         return generic_array_bisect_plus_one(f,
2211                                              le64toh(d->data.entry_offset),
2212                                              le64toh(d->data.entry_array_offset),
2213                                              le64toh(d->data.n_entries),
2214                                              p,
2215                                              test_object_offset,
2216                                              direction,
2217                                              ret, offset, NULL);
2218 }
2219
2220 int journal_file_move_to_entry_by_monotonic_for_data(
2221                 JournalFile *f,
2222                 uint64_t data_offset,
2223                 sd_id128_t boot_id,
2224                 uint64_t monotonic,
2225                 direction_t direction,
2226                 Object **ret, uint64_t *offset) {
2227
2228         Object *o, *d;
2229         int r;
2230         uint64_t b, z;
2231
2232         assert(f);
2233
2234         /* First, seek by time */
2235         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2236         if (r < 0)
2237                 return r;
2238         if (r == 0)
2239                 return -ENOENT;
2240
2241         r = generic_array_bisect_plus_one(f,
2242                                           le64toh(o->data.entry_offset),
2243                                           le64toh(o->data.entry_array_offset),
2244                                           le64toh(o->data.n_entries),
2245                                           monotonic,
2246                                           test_object_monotonic,
2247                                           direction,
2248                                           NULL, &z, NULL);
2249         if (r <= 0)
2250                 return r;
2251
2252         /* And now, continue seeking until we find an entry that
2253          * exists in both bisection arrays */
2254
2255         for (;;) {
2256                 Object *qo;
2257                 uint64_t p, q;
2258
2259                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2260                 if (r < 0)
2261                         return r;
2262
2263                 r = generic_array_bisect_plus_one(f,
2264                                                   le64toh(d->data.entry_offset),
2265                                                   le64toh(d->data.entry_array_offset),
2266                                                   le64toh(d->data.n_entries),
2267                                                   z,
2268                                                   test_object_offset,
2269                                                   direction,
2270                                                   NULL, &p, NULL);
2271                 if (r <= 0)
2272                         return r;
2273
2274                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2275                 if (r < 0)
2276                         return r;
2277
2278                 r = generic_array_bisect_plus_one(f,
2279                                                   le64toh(o->data.entry_offset),
2280                                                   le64toh(o->data.entry_array_offset),
2281                                                   le64toh(o->data.n_entries),
2282                                                   p,
2283                                                   test_object_offset,
2284                                                   direction,
2285                                                   &qo, &q, NULL);
2286
2287                 if (r <= 0)
2288                         return r;
2289
2290                 if (p == q) {
2291                         if (ret)
2292                                 *ret = qo;
2293                         if (offset)
2294                                 *offset = q;
2295
2296                         return 1;
2297                 }
2298
2299                 z = q;
2300         }
2301 }
2302
2303 int journal_file_move_to_entry_by_seqnum_for_data(
2304                 JournalFile *f,
2305                 uint64_t data_offset,
2306                 uint64_t seqnum,
2307                 direction_t direction,
2308                 Object **ret, uint64_t *offset) {
2309
2310         Object *d;
2311         int r;
2312
2313         assert(f);
2314
2315         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2316         if (r < 0)
2317                 return r;
2318
2319         return generic_array_bisect_plus_one(f,
2320                                              le64toh(d->data.entry_offset),
2321                                              le64toh(d->data.entry_array_offset),
2322                                              le64toh(d->data.n_entries),
2323                                              seqnum,
2324                                              test_object_seqnum,
2325                                              direction,
2326                                              ret, offset, NULL);
2327 }
2328
2329 int journal_file_move_to_entry_by_realtime_for_data(
2330                 JournalFile *f,
2331                 uint64_t data_offset,
2332                 uint64_t realtime,
2333                 direction_t direction,
2334                 Object **ret, uint64_t *offset) {
2335
2336         Object *d;
2337         int r;
2338
2339         assert(f);
2340
2341         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2342         if (r < 0)
2343                 return r;
2344
2345         return generic_array_bisect_plus_one(f,
2346                                              le64toh(d->data.entry_offset),
2347                                              le64toh(d->data.entry_array_offset),
2348                                              le64toh(d->data.n_entries),
2349                                              realtime,
2350                                              test_object_realtime,
2351                                              direction,
2352                                              ret, offset, NULL);
2353 }
2354
2355 void journal_file_dump(JournalFile *f) {
2356         Object *o;
2357         int r;
2358         uint64_t p;
2359
2360         assert(f);
2361
2362         journal_file_print_header(f);
2363
2364         p = le64toh(f->header->header_size);
2365         while (p != 0) {
2366                 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2367                 if (r < 0)
2368                         goto fail;
2369
2370                 switch (o->object.type) {
2371
2372                 case OBJECT_UNUSED:
2373                         printf("Type: OBJECT_UNUSED\n");
2374                         break;
2375
2376                 case OBJECT_DATA:
2377                         printf("Type: OBJECT_DATA\n");
2378                         break;
2379
2380                 case OBJECT_FIELD:
2381                         printf("Type: OBJECT_FIELD\n");
2382                         break;
2383
2384                 case OBJECT_ENTRY:
2385                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2386                                le64toh(o->entry.seqnum),
2387                                le64toh(o->entry.monotonic),
2388                                le64toh(o->entry.realtime));
2389                         break;
2390
2391                 case OBJECT_FIELD_HASH_TABLE:
2392                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2393                         break;
2394
2395                 case OBJECT_DATA_HASH_TABLE:
2396                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2397                         break;
2398
2399                 case OBJECT_ENTRY_ARRAY:
2400                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2401                         break;
2402
2403                 case OBJECT_TAG:
2404                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2405                                le64toh(o->tag.seqnum),
2406                                le64toh(o->tag.epoch));
2407                         break;
2408
2409                 default:
2410                         printf("Type: unknown (%u)\n", o->object.type);
2411                         break;
2412                 }
2413
2414                 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2415                         printf("Flags: %s\n",
2416                                object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2417
2418                 if (p == le64toh(f->header->tail_object_offset))
2419                         p = 0;
2420                 else
2421                         p = p + ALIGN64(le64toh(o->object.size));
2422         }
2423
2424         return;
2425 fail:
2426         log_error("File corrupt");
2427 }
2428
2429 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2430         const char *x;
2431
2432         x = format_timestamp(buf, l, t);
2433         if (x)
2434                 return x;
2435         return " --- ";
2436 }
2437
2438 void journal_file_print_header(JournalFile *f) {
2439         char a[33], b[33], c[33], d[33];
2440         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2441         struct stat st;
2442         char bytes[FORMAT_BYTES_MAX];
2443
2444         assert(f);
2445
2446         printf("File Path: %s\n"
2447                "File ID: %s\n"
2448                "Machine ID: %s\n"
2449                "Boot ID: %s\n"
2450                "Sequential Number ID: %s\n"
2451                "State: %s\n"
2452                "Compatible Flags:%s%s\n"
2453                "Incompatible Flags:%s%s%s\n"
2454                "Header size: %"PRIu64"\n"
2455                "Arena size: %"PRIu64"\n"
2456                "Data Hash Table Size: %"PRIu64"\n"
2457                "Field Hash Table Size: %"PRIu64"\n"
2458                "Rotate Suggested: %s\n"
2459                "Head Sequential Number: %"PRIu64"\n"
2460                "Tail Sequential Number: %"PRIu64"\n"
2461                "Head Realtime Timestamp: %s\n"
2462                "Tail Realtime Timestamp: %s\n"
2463                "Tail Monotonic Timestamp: %s\n"
2464                "Objects: %"PRIu64"\n"
2465                "Entry Objects: %"PRIu64"\n",
2466                f->path,
2467                sd_id128_to_string(f->header->file_id, a),
2468                sd_id128_to_string(f->header->machine_id, b),
2469                sd_id128_to_string(f->header->boot_id, c),
2470                sd_id128_to_string(f->header->seqnum_id, d),
2471                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2472                f->header->state == STATE_ONLINE ? "ONLINE" :
2473                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2474                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2475                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2476                JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2477                JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2478                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2479                le64toh(f->header->header_size),
2480                le64toh(f->header->arena_size),
2481                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2482                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2483                yes_no(journal_file_rotate_suggested(f, 0)),
2484                le64toh(f->header->head_entry_seqnum),
2485                le64toh(f->header->tail_entry_seqnum),
2486                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2487                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2488                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2489                le64toh(f->header->n_objects),
2490                le64toh(f->header->n_entries));
2491
2492         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2493                 printf("Data Objects: %"PRIu64"\n"
2494                        "Data Hash Table Fill: %.1f%%\n",
2495                        le64toh(f->header->n_data),
2496                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2497
2498         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2499                 printf("Field Objects: %"PRIu64"\n"
2500                        "Field Hash Table Fill: %.1f%%\n",
2501                        le64toh(f->header->n_fields),
2502                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2503
2504         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2505                 printf("Tag Objects: %"PRIu64"\n",
2506                        le64toh(f->header->n_tags));
2507         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2508                 printf("Entry Array Objects: %"PRIu64"\n",
2509                        le64toh(f->header->n_entry_arrays));
2510
2511         if (fstat(f->fd, &st) >= 0)
2512                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2513 }
2514
2515 int journal_file_open(
2516                 const char *fname,
2517                 int flags,
2518                 mode_t mode,
2519                 bool compress,
2520                 bool seal,
2521                 JournalMetrics *metrics,
2522                 MMapCache *mmap_cache,
2523                 JournalFile *template,
2524                 JournalFile **ret) {
2525
2526         bool newly_created = false;
2527         JournalFile *f;
2528         void *h;
2529         int r;
2530
2531         assert(fname);
2532         assert(ret);
2533
2534         if ((flags & O_ACCMODE) != O_RDONLY &&
2535             (flags & O_ACCMODE) != O_RDWR)
2536                 return -EINVAL;
2537
2538         if (!endswith(fname, ".journal") &&
2539             !endswith(fname, ".journal~"))
2540                 return -EINVAL;
2541
2542         f = new0(JournalFile, 1);
2543         if (!f)
2544                 return -ENOMEM;
2545
2546         f->fd = -1;
2547         f->mode = mode;
2548
2549         f->flags = flags;
2550         f->prot = prot_from_flags(flags);
2551         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2552 #if defined(HAVE_LZ4)
2553         f->compress_lz4 = compress;
2554 #elif defined(HAVE_XZ)
2555         f->compress_xz = compress;
2556 #endif
2557 #ifdef HAVE_GCRYPT
2558         f->seal = seal;
2559 #endif
2560
2561         if (mmap_cache)
2562                 f->mmap = mmap_cache_ref(mmap_cache);
2563         else {
2564                 f->mmap = mmap_cache_new();
2565                 if (!f->mmap) {
2566                         r = -ENOMEM;
2567                         goto fail;
2568                 }
2569         }
2570
2571         f->path = strdup(fname);
2572         if (!f->path) {
2573                 r = -ENOMEM;
2574                 goto fail;
2575         }
2576
2577         f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2578         if (!f->chain_cache) {
2579                 r = -ENOMEM;
2580                 goto fail;
2581         }
2582
2583         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2584         if (f->fd < 0) {
2585                 r = -errno;
2586                 goto fail;
2587         }
2588
2589         r = journal_file_fstat(f);
2590         if (r < 0)
2591                 goto fail;
2592
2593         if (f->last_stat.st_size == 0 && f->writable) {
2594                 /* Let's attach the creation time to the journal file,
2595                  * so that the vacuuming code knows the age of this
2596                  * file even if the file might end up corrupted one
2597                  * day... Ideally we'd just use the creation time many
2598                  * file systems maintain for each file, but there is
2599                  * currently no usable API to query this, hence let's
2600                  * emulate this via extended attributes. If extended
2601                  * attributes are not supported we'll just skip this,
2602                  * and rely solely on mtime/atime/ctime of the file. */
2603
2604                 fd_setcrtime(f->fd, now(CLOCK_REALTIME));
2605
2606 #ifdef HAVE_GCRYPT
2607                 /* Try to load the FSPRG state, and if we can't, then
2608                  * just don't do sealing */
2609                 if (f->seal) {
2610                         r = journal_file_fss_load(f);
2611                         if (r < 0)
2612                                 f->seal = false;
2613                 }
2614 #endif
2615
2616                 r = journal_file_init_header(f, template);
2617                 if (r < 0)
2618                         goto fail;
2619
2620                 r = journal_file_fstat(f);
2621                 if (r < 0)
2622                         goto fail;
2623
2624                 newly_created = true;
2625         }
2626
2627         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2628                 r = -EIO;
2629                 goto fail;
2630         }
2631
2632         r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2633         if (r < 0) {
2634                 r = -errno;
2635                 goto fail;
2636         }
2637
2638         f->header = h;
2639
2640         if (!newly_created) {
2641                 r = journal_file_verify_header(f);
2642                 if (r < 0)
2643                         goto fail;
2644         }
2645
2646 #ifdef HAVE_GCRYPT
2647         if (!newly_created && f->writable) {
2648                 r = journal_file_fss_load(f);
2649                 if (r < 0)
2650                         goto fail;
2651         }
2652 #endif
2653
2654         if (f->writable) {
2655                 if (metrics) {
2656                         journal_default_metrics(metrics, f->fd);
2657                         f->metrics = *metrics;
2658                 } else if (template)
2659                         f->metrics = template->metrics;
2660
2661                 r = journal_file_refresh_header(f);
2662                 if (r < 0)
2663                         goto fail;
2664         }
2665
2666 #ifdef HAVE_GCRYPT
2667         r = journal_file_hmac_setup(f);
2668         if (r < 0)
2669                 goto fail;
2670 #endif
2671
2672         if (newly_created) {
2673                 r = journal_file_setup_field_hash_table(f);
2674                 if (r < 0)
2675                         goto fail;
2676
2677                 r = journal_file_setup_data_hash_table(f);
2678                 if (r < 0)
2679                         goto fail;
2680
2681 #ifdef HAVE_GCRYPT
2682                 r = journal_file_append_first_tag(f);
2683                 if (r < 0)
2684                         goto fail;
2685 #endif
2686         }
2687
2688         r = journal_file_map_field_hash_table(f);
2689         if (r < 0)
2690                 goto fail;
2691
2692         r = journal_file_map_data_hash_table(f);
2693         if (r < 0)
2694                 goto fail;
2695
2696         if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2697                 r = -EIO;
2698                 goto fail;
2699         }
2700
2701         *ret = f;
2702         return 0;
2703
2704 fail:
2705         if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2706                 r = -EIO;
2707
2708         journal_file_close(f);
2709
2710         return r;
2711 }
2712
2713 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2714         _cleanup_free_ char *p = NULL;
2715         size_t l;
2716         JournalFile *old_file, *new_file = NULL;
2717         int r;
2718
2719         assert(f);
2720         assert(*f);
2721
2722         old_file = *f;
2723
2724         if (!old_file->writable)
2725                 return -EINVAL;
2726
2727         if (!endswith(old_file->path, ".journal"))
2728                 return -EINVAL;
2729
2730         l = strlen(old_file->path);
2731         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2732                      (int) l - 8, old_file->path,
2733                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2734                      le64toh((*f)->header->head_entry_seqnum),
2735                      le64toh((*f)->header->head_entry_realtime));
2736         if (r < 0)
2737                 return -ENOMEM;
2738
2739         /* Try to rename the file to the archived version. If the file
2740          * already was deleted, we'll get ENOENT, let's ignore that
2741          * case. */
2742         r = rename(old_file->path, p);
2743         if (r < 0 && errno != ENOENT)
2744                 return -errno;
2745
2746         old_file->header->state = STATE_ARCHIVED;
2747
2748         /* Currently, btrfs is not very good with out write patterns
2749          * and fragments heavily. Let's defrag our journal files when
2750          * we archive them */
2751         old_file->defrag_on_close = true;
2752
2753         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2754         journal_file_close(old_file);
2755
2756         *f = new_file;
2757         return r;
2758 }
2759
2760 int journal_file_open_reliably(
2761                 const char *fname,
2762                 int flags,
2763                 mode_t mode,
2764                 bool compress,
2765                 bool seal,
2766                 JournalMetrics *metrics,
2767                 MMapCache *mmap_cache,
2768                 JournalFile *template,
2769                 JournalFile **ret) {
2770
2771         int r;
2772         size_t l;
2773         _cleanup_free_ char *p = NULL;
2774
2775         r = journal_file_open(fname, flags, mode, compress, seal,
2776                               metrics, mmap_cache, template, ret);
2777         if (r != -EBADMSG && /* corrupted */
2778             r != -ENODATA && /* truncated */
2779             r != -EHOSTDOWN && /* other machine */
2780             r != -EPROTONOSUPPORT && /* incompatible feature */
2781             r != -EBUSY && /* unclean shutdown */
2782             r != -ESHUTDOWN && /* already archived */
2783             r != -EIO && /* IO error, including SIGBUS on mmap */
2784             r != -EIDRM /* File has been deleted */)
2785                 return r;
2786
2787         if ((flags & O_ACCMODE) == O_RDONLY)
2788                 return r;
2789
2790         if (!(flags & O_CREAT))
2791                 return r;
2792
2793         if (!endswith(fname, ".journal"))
2794                 return r;
2795
2796         /* The file is corrupted. Rotate it away and try it again (but only once) */
2797
2798         l = strlen(fname);
2799         if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2800                      (int) l - 8, fname,
2801                      (unsigned long long) now(CLOCK_REALTIME),
2802                      random_u64()) < 0)
2803                 return -ENOMEM;
2804
2805         r = rename(fname, p);
2806         if (r < 0)
2807                 return -errno;
2808
2809         /* btrfs doesn't cope well with our write pattern and
2810          * fragments heavily. Let's defrag all files we rotate */
2811         (void) btrfs_defrag(p);
2812
2813         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2814
2815         return journal_file_open(fname, flags, mode, compress, seal,
2816                                  metrics, mmap_cache, template, ret);
2817 }
2818
2819 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2820         uint64_t i, n;
2821         uint64_t q, xor_hash = 0;
2822         int r;
2823         EntryItem *items;
2824         dual_timestamp ts;
2825
2826         assert(from);
2827         assert(to);
2828         assert(o);
2829         assert(p);
2830
2831         if (!to->writable)
2832                 return -EPERM;
2833
2834         ts.monotonic = le64toh(o->entry.monotonic);
2835         ts.realtime = le64toh(o->entry.realtime);
2836
2837         n = journal_file_entry_n_items(o);
2838         /* alloca() can't take 0, hence let's allocate at least one */
2839         items = alloca(sizeof(EntryItem) * MAX(1u, n));
2840
2841         for (i = 0; i < n; i++) {
2842                 uint64_t l, h;
2843                 le64_t le_hash;
2844                 size_t t;
2845                 void *data;
2846                 Object *u;
2847
2848                 q = le64toh(o->entry.items[i].object_offset);
2849                 le_hash = o->entry.items[i].hash;
2850
2851                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2852                 if (r < 0)
2853                         return r;
2854
2855                 if (le_hash != o->data.hash)
2856                         return -EBADMSG;
2857
2858                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2859                 t = (size_t) l;
2860
2861                 /* We hit the limit on 32bit machines */
2862                 if ((uint64_t) t != l)
2863                         return -E2BIG;
2864
2865                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2866 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2867                         size_t rsize;
2868
2869                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2870                                             o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2871                         if (r < 0)
2872                                 return r;
2873
2874                         data = from->compress_buffer;
2875                         l = rsize;
2876 #else
2877                         return -EPROTONOSUPPORT;
2878 #endif
2879                 } else
2880                         data = o->data.payload;
2881
2882                 r = journal_file_append_data(to, data, l, &u, &h);
2883                 if (r < 0)
2884                         return r;
2885
2886                 xor_hash ^= le64toh(u->data.hash);
2887                 items[i].object_offset = htole64(h);
2888                 items[i].hash = u->data.hash;
2889
2890                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2891                 if (r < 0)
2892                         return r;
2893         }
2894
2895         r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2896
2897         if (mmap_cache_got_sigbus(to->mmap, to->fd))
2898                 return -EIO;
2899
2900         return r;
2901 }
2902
2903 void journal_default_metrics(JournalMetrics *m, int fd) {
2904         uint64_t fs_size = 0;
2905         struct statvfs ss;
2906         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2907
2908         assert(m);
2909         assert(fd >= 0);
2910
2911         if (fstatvfs(fd, &ss) >= 0)
2912                 fs_size = ss.f_frsize * ss.f_blocks;
2913
2914         if (m->max_use == (uint64_t) -1) {
2915
2916                 if (fs_size > 0) {
2917                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2918
2919                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2920                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2921
2922                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2923                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2924                 } else
2925                         m->max_use = DEFAULT_MAX_USE_LOWER;
2926         } else {
2927                 m->max_use = PAGE_ALIGN(m->max_use);
2928
2929                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2930                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2931         }
2932
2933         if (m->max_size == (uint64_t) -1) {
2934                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2935
2936                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2937                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2938         } else
2939                 m->max_size = PAGE_ALIGN(m->max_size);
2940
2941         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2942                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2943
2944         if (m->max_size*2 > m->max_use)
2945                 m->max_use = m->max_size*2;
2946
2947         if (m->min_size == (uint64_t) -1)
2948                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2949         else {
2950                 m->min_size = PAGE_ALIGN(m->min_size);
2951
2952                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2953                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2954
2955                 if (m->min_size > m->max_size)
2956                         m->max_size = m->min_size;
2957         }
2958
2959         if (m->keep_free == (uint64_t) -1) {
2960
2961                 if (fs_size > 0) {
2962                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2963
2964                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2965                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2966
2967                 } else
2968                         m->keep_free = DEFAULT_KEEP_FREE;
2969         }
2970
2971         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2972                   format_bytes(a, sizeof(a), m->max_use),
2973                   format_bytes(b, sizeof(b), m->max_size),
2974                   format_bytes(c, sizeof(c), m->min_size),
2975                   format_bytes(d, sizeof(d), m->keep_free));
2976 }
2977
2978 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2979         assert(f);
2980         assert(from || to);
2981
2982         if (from) {
2983                 if (f->header->head_entry_realtime == 0)
2984                         return -ENOENT;
2985
2986                 *from = le64toh(f->header->head_entry_realtime);
2987         }
2988
2989         if (to) {
2990                 if (f->header->tail_entry_realtime == 0)
2991                         return -ENOENT;
2992
2993                 *to = le64toh(f->header->tail_entry_realtime);
2994         }
2995
2996         return 1;
2997 }
2998
2999 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3000         Object *o;
3001         uint64_t p;
3002         int r;
3003
3004         assert(f);
3005         assert(from || to);
3006
3007         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3008         if (r <= 0)
3009                 return r;
3010
3011         if (le64toh(o->data.n_entries) <= 0)
3012                 return 0;
3013
3014         if (from) {
3015                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3016                 if (r < 0)
3017                         return r;
3018
3019                 *from = le64toh(o->entry.monotonic);
3020         }
3021
3022         if (to) {
3023                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3024                 if (r < 0)
3025                         return r;
3026
3027                 r = generic_array_get_plus_one(f,
3028                                                le64toh(o->data.entry_offset),
3029                                                le64toh(o->data.entry_array_offset),
3030                                                le64toh(o->data.n_entries)-1,
3031                                                &o, NULL);
3032                 if (r <= 0)
3033                         return r;
3034
3035                 *to = le64toh(o->entry.monotonic);
3036         }
3037
3038         return 1;
3039 }
3040
3041 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3042         assert(f);
3043
3044         /* If we gained new header fields we gained new features,
3045          * hence suggest a rotation */
3046         if (le64toh(f->header->header_size) < sizeof(Header)) {
3047                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3048                 return true;
3049         }
3050
3051         /* Let's check if the hash tables grew over a certain fill
3052          * level (75%, borrowing this value from Java's hash table
3053          * implementation), and if so suggest a rotation. To calculate
3054          * the fill level we need the n_data field, which only exists
3055          * in newer versions. */
3056
3057         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3058                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3059                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3060                                   f->path,
3061                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3062                                   le64toh(f->header->n_data),
3063                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3064                                   (unsigned long long) f->last_stat.st_size,
3065                                   f->last_stat.st_size / le64toh(f->header->n_data));
3066                         return true;
3067                 }
3068
3069         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3070                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3071                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3072                                   f->path,
3073                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3074                                   le64toh(f->header->n_fields),
3075                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3076                         return true;
3077                 }
3078
3079         /* Are the data objects properly indexed by field objects? */
3080         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3081             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3082             le64toh(f->header->n_data) > 0 &&
3083             le64toh(f->header->n_fields) == 0)
3084                 return true;
3085
3086         if (max_file_usec > 0) {
3087                 usec_t t, h;
3088
3089                 h = le64toh(f->header->head_entry_realtime);
3090                 t = now(CLOCK_REALTIME);
3091
3092                 if (h > 0 && t > h + max_file_usec)
3093                         return true;
3094         }
3095
3096         return false;
3097 }