chiark / gitweb /
journal: fix tag ordering check
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "journal-authenticate.h"
33 #include "lookup3.h"
34 #include "compress.h"
35 #include "fsprg.h"
36
/* Byte sizes used for newly created hash tables when nothing larger is
 * requested: 2047 items for the data table, 333 items for the field table. */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Data payloads of at least this many bytes are candidates for compression */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL)                  /* 64 KiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the file system
 * size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */

/* n_data was the first entry we added after the initial file format
 * design, hence a valid header extends at least up to that field */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
63
64 void journal_file_close(JournalFile *f) {
65         assert(f);
66
67         /* Write the final tag */
68         if (f->seal && f->writable)
69                 journal_file_append_tag(f);
70
71         /* Sync everything to disk, before we mark the file offline */
72         if (f->mmap && f->fd >= 0)
73                 mmap_cache_close_fd(f->mmap, f->fd);
74
75         if (f->writable && f->fd >= 0)
76                 fdatasync(f->fd);
77
78         if (f->header) {
79                 /* Mark the file offline. Don't override the archived state if it already is set */
80                 if (f->writable && f->header->state == STATE_ONLINE)
81                         f->header->state = STATE_OFFLINE;
82
83                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
84         }
85
86         if (f->fd >= 0)
87                 close_nointr_nofail(f->fd);
88
89         free(f->path);
90
91         if (f->mmap)
92                 mmap_cache_unref(f->mmap);
93
94 #ifdef HAVE_XZ
95         free(f->compress_buffer);
96 #endif
97
98 #ifdef HAVE_GCRYPT
99         if (f->fss_file)
100                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
101         else if (f->fsprg_state)
102                 free(f->fsprg_state);
103
104         free(f->fsprg_seed);
105
106         if (f->hmac)
107                 gcry_md_close(f->hmac);
108 #endif
109
110         free(f);
111 }
112
113 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
114         Header h;
115         ssize_t k;
116         int r;
117
118         assert(f);
119
120         zero(h);
121         memcpy(h.signature, HEADER_SIGNATURE, 8);
122         h.header_size = htole64(ALIGN64(sizeof(h)));
123
124         h.incompatible_flags =
125                 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
126
127         h.compatible_flags =
128                 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
129
130         r = sd_id128_randomize(&h.file_id);
131         if (r < 0)
132                 return r;
133
134         if (template) {
135                 h.seqnum_id = template->header->seqnum_id;
136                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
137         } else
138                 h.seqnum_id = h.file_id;
139
140         k = pwrite(f->fd, &h, sizeof(h), 0);
141         if (k < 0)
142                 return -errno;
143
144         if (k != sizeof(h))
145                 return -EIO;
146
147         return 0;
148 }
149
150 static int journal_file_refresh_header(JournalFile *f) {
151         int r;
152         sd_id128_t boot_id;
153
154         assert(f);
155
156         r = sd_id128_get_machine(&f->header->machine_id);
157         if (r < 0)
158                 return r;
159
160         r = sd_id128_get_boot(&boot_id);
161         if (r < 0)
162                 return r;
163
164         if (sd_id128_equal(boot_id, f->header->boot_id))
165                 f->tail_entry_monotonic_valid = true;
166
167         f->header->boot_id = boot_id;
168
169         f->header->state = STATE_ONLINE;
170
171         /* Sync the online state to disk */
172         msync(f->header, PAGE_ALIGN(sizeof(Header)), MS_SYNC);
173         fdatasync(f->fd);
174
175         return 0;
176 }
177
178 static int journal_file_verify_header(JournalFile *f) {
179         assert(f);
180
181         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
182                 return -EBADMSG;
183
184         /* In both read and write mode we refuse to open files with
185          * incompatible flags we don't know */
186 #ifdef HAVE_XZ
187         if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
188                 return -EPROTONOSUPPORT;
189 #else
190         if (f->header->incompatible_flags != 0)
191                 return -EPROTONOSUPPORT;
192 #endif
193
194         /* When open for writing we refuse to open files with
195          * compatible flags, too */
196         if (f->writable) {
197 #ifdef HAVE_GCRYPT
198                 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
199                         return -EPROTONOSUPPORT;
200 #else
201                 if (f->header->compatible_flags != 0)
202                         return -EPROTONOSUPPORT;
203 #endif
204         }
205
206         if (f->header->state >= _STATE_MAX)
207                 return -EBADMSG;
208
209         /* The first addition was n_data, so check that we are at least this large */
210         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
211                 return -EBADMSG;
212
213         if ((le32toh(f->header->compatible_flags) & HEADER_COMPATIBLE_SEALED) &&
214                 !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
215                 return -EBADMSG;
216
217         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
218                 return -ENODATA;
219
220         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
221                 return -ENODATA;
222
223         if (!VALID64(f->header->data_hash_table_offset) ||
224             !VALID64(f->header->field_hash_table_offset) ||
225             !VALID64(f->header->tail_object_offset) ||
226             !VALID64(f->header->entry_array_offset))
227                 return -ENODATA;
228
229         if (f->writable) {
230                 uint8_t state;
231                 sd_id128_t machine_id;
232                 int r;
233
234                 r = sd_id128_get_machine(&machine_id);
235                 if (r < 0)
236                         return r;
237
238                 if (!sd_id128_equal(machine_id, f->header->machine_id))
239                         return -EHOSTDOWN;
240
241                 state = f->header->state;
242
243                 if (state == STATE_ONLINE) {
244                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
245                         return -EBUSY;
246                 } else if (state == STATE_ARCHIVED)
247                         return -ESHUTDOWN;
248                 else if (state != STATE_OFFLINE) {
249                         log_debug("Journal file %s has unknown state %u.", f->path, state);
250                         return -EBUSY;
251                 }
252         }
253
254         f->compress = !!(le32toh(f->header->incompatible_flags) & HEADER_INCOMPATIBLE_COMPRESSED);
255
256         if (f->writable)
257                 f->seal = !!(le32toh(f->header->compatible_flags) & HEADER_COMPATIBLE_SEALED);
258
259         return 0;
260 }
261
262 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
263         uint64_t old_size, new_size;
264         int r;
265
266         assert(f);
267
268         /* We assume that this file is not sparse, and we know that
269          * for sure, since we always call posix_fallocate()
270          * ourselves */
271
272         old_size =
273                 le64toh(f->header->header_size) +
274                 le64toh(f->header->arena_size);
275
276         new_size = PAGE_ALIGN(offset + size);
277         if (new_size < le64toh(f->header->header_size))
278                 new_size = le64toh(f->header->header_size);
279
280         if (new_size <= old_size)
281                 return 0;
282
283         if (f->metrics.max_size > 0 &&
284             new_size > f->metrics.max_size)
285                 return -E2BIG;
286
287         if (new_size > f->metrics.min_size &&
288             f->metrics.keep_free > 0) {
289                 struct statvfs svfs;
290
291                 if (fstatvfs(f->fd, &svfs) >= 0) {
292                         uint64_t available;
293
294                         available = svfs.f_bfree * svfs.f_bsize;
295
296                         if (available >= f->metrics.keep_free)
297                                 available -= f->metrics.keep_free;
298                         else
299                                 available = 0;
300
301                         if (new_size - old_size > available)
302                                 return -E2BIG;
303                 }
304         }
305
306         /* Note that the glibc fallocate() fallback is very
307            inefficient, hence we try to minimize the allocation area
308            as we can. */
309         r = posix_fallocate(f->fd, old_size, new_size - old_size);
310         if (r != 0)
311                 return -r;
312
313         mmap_cache_close_fd_range(f->mmap, f->fd, old_size);
314
315         if (fstat(f->fd, &f->last_stat) < 0)
316                 return -errno;
317
318         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
319
320         return 0;
321 }
322
323 static int journal_file_move_to(JournalFile *f, int context, uint64_t offset, uint64_t size, void **ret) {
324         assert(f);
325         assert(ret);
326
327         /* Avoid SIGBUS on invalid accesses */
328         if (offset + size > (uint64_t) f->last_stat.st_size) {
329                 /* Hmm, out of range? Let's refresh the fstat() data
330                  * first, before we trust that check. */
331
332                 if (fstat(f->fd, &f->last_stat) < 0 ||
333                     offset + size > (uint64_t) f->last_stat.st_size)
334                         return -EADDRNOTAVAIL;
335         }
336
337         return mmap_cache_get(f->mmap, f->fd, f->prot, context, offset, size, ret);
338 }
339
340 static uint64_t minimum_header_size(Object *o) {
341
342         static uint64_t table[] = {
343                 [OBJECT_DATA] = sizeof(DataObject),
344                 [OBJECT_FIELD] = sizeof(FieldObject),
345                 [OBJECT_ENTRY] = sizeof(EntryObject),
346                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
347                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
348                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
349                 [OBJECT_TAG] = sizeof(TagObject),
350         };
351
352         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
353                 return sizeof(ObjectHeader);
354
355         return table[o->object.type];
356 }
357
358 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
359         int r;
360         void *t;
361         Object *o;
362         uint64_t s;
363         unsigned context;
364
365         assert(f);
366         assert(ret);
367
368         /* Objects may only be located at multiple of 64 bit */
369         if (!VALID64(offset))
370                 return -EFAULT;
371
372         /* One context for each type, plus one catch-all for the rest */
373         context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
374
375         r = journal_file_move_to(f, context, offset, sizeof(ObjectHeader), &t);
376         if (r < 0)
377                 return r;
378
379         o = (Object*) t;
380         s = le64toh(o->object.size);
381
382         if (s < sizeof(ObjectHeader))
383                 return -EBADMSG;
384
385         if (o->object.type <= OBJECT_UNUSED)
386                 return -EBADMSG;
387
388         if (s < minimum_header_size(o))
389                 return -EBADMSG;
390
391         if (type >= 0 && o->object.type != type)
392                 return -EBADMSG;
393
394         if (s > sizeof(ObjectHeader)) {
395                 r = journal_file_move_to(f, o->object.type, offset, s, &t);
396                 if (r < 0)
397                         return r;
398
399                 o = (Object*) t;
400         }
401
402         *ret = o;
403         return 0;
404 }
405
406 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
407         uint64_t r;
408
409         assert(f);
410
411         r = le64toh(f->header->tail_entry_seqnum) + 1;
412
413         if (seqnum) {
414                 /* If an external seqnum counter was passed, we update
415                  * both the local and the external one, and set it to
416                  * the maximum of both */
417
418                 if (*seqnum + 1 > r)
419                         r = *seqnum + 1;
420
421                 *seqnum = r;
422         }
423
424         f->header->tail_entry_seqnum = htole64(r);
425
426         if (f->header->head_entry_seqnum == 0)
427                 f->header->head_entry_seqnum = htole64(r);
428
429         return r;
430 }
431
432 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
433         int r;
434         uint64_t p;
435         Object *tail, *o;
436         void *t;
437
438         assert(f);
439         assert(type > 0 && type < _OBJECT_TYPE_MAX);
440         assert(size >= sizeof(ObjectHeader));
441         assert(offset);
442         assert(ret);
443
444         p = le64toh(f->header->tail_object_offset);
445         if (p == 0)
446                 p = le64toh(f->header->header_size);
447         else {
448                 r = journal_file_move_to_object(f, -1, p, &tail);
449                 if (r < 0)
450                         return r;
451
452                 p += ALIGN64(le64toh(tail->object.size));
453         }
454
455         r = journal_file_allocate(f, p, size);
456         if (r < 0)
457                 return r;
458
459         r = journal_file_move_to(f, type, p, size, &t);
460         if (r < 0)
461                 return r;
462
463         o = (Object*) t;
464
465         zero(o->object);
466         o->object.type = type;
467         o->object.size = htole64(size);
468
469         f->header->tail_object_offset = htole64(p);
470         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
471
472         *ret = o;
473         *offset = p;
474
475         return 0;
476 }
477
478 static int journal_file_setup_data_hash_table(JournalFile *f) {
479         uint64_t s, p;
480         Object *o;
481         int r;
482
483         assert(f);
484
485         /* We estimate that we need 1 hash table entry per 768 of
486            journal file and we want to make sure we never get beyond
487            75% fill level. Calculate the hash table size for the
488            maximum file size based on these metrics. */
489
490         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
491         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
492                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
493
494         log_info("Reserving %llu entries in hash table.", (unsigned long long) (s / sizeof(HashItem)));
495
496         r = journal_file_append_object(f,
497                                        OBJECT_DATA_HASH_TABLE,
498                                        offsetof(Object, hash_table.items) + s,
499                                        &o, &p);
500         if (r < 0)
501                 return r;
502
503         memset(o->hash_table.items, 0, s);
504
505         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
506         f->header->data_hash_table_size = htole64(s);
507
508         return 0;
509 }
510
511 static int journal_file_setup_field_hash_table(JournalFile *f) {
512         uint64_t s, p;
513         Object *o;
514         int r;
515
516         assert(f);
517
518         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
519         r = journal_file_append_object(f,
520                                        OBJECT_FIELD_HASH_TABLE,
521                                        offsetof(Object, hash_table.items) + s,
522                                        &o, &p);
523         if (r < 0)
524                 return r;
525
526         memset(o->hash_table.items, 0, s);
527
528         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
529         f->header->field_hash_table_size = htole64(s);
530
531         return 0;
532 }
533
534 static int journal_file_map_data_hash_table(JournalFile *f) {
535         uint64_t s, p;
536         void *t;
537         int r;
538
539         assert(f);
540
541         p = le64toh(f->header->data_hash_table_offset);
542         s = le64toh(f->header->data_hash_table_size);
543
544         r = journal_file_move_to(f,
545                                  OBJECT_DATA_HASH_TABLE,
546                                  p, s,
547                                  &t);
548         if (r < 0)
549                 return r;
550
551         f->data_hash_table = t;
552         return 0;
553 }
554
555 static int journal_file_map_field_hash_table(JournalFile *f) {
556         uint64_t s, p;
557         void *t;
558         int r;
559
560         assert(f);
561
562         p = le64toh(f->header->field_hash_table_offset);
563         s = le64toh(f->header->field_hash_table_size);
564
565         r = journal_file_move_to(f,
566                                  OBJECT_FIELD_HASH_TABLE,
567                                  p, s,
568                                  &t);
569         if (r < 0)
570                 return r;
571
572         f->field_hash_table = t;
573         return 0;
574 }
575
576 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
577         uint64_t p, h;
578         int r;
579
580         assert(f);
581         assert(o);
582         assert(offset > 0);
583         assert(o->object.type == OBJECT_DATA);
584
585         /* This might alter the window we are looking at */
586
587         o->data.next_hash_offset = o->data.next_field_offset = 0;
588         o->data.entry_offset = o->data.entry_array_offset = 0;
589         o->data.n_entries = 0;
590
591         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
592         p = le64toh(f->data_hash_table[h].tail_hash_offset);
593         if (p == 0) {
594                 /* Only entry in the hash table is easy */
595                 f->data_hash_table[h].head_hash_offset = htole64(offset);
596         } else {
597                 /* Move back to the previous data object, to patch in
598                  * pointer */
599
600                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
601                 if (r < 0)
602                         return r;
603
604                 o->data.next_hash_offset = htole64(offset);
605         }
606
607         f->data_hash_table[h].tail_hash_offset = htole64(offset);
608
609         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
610                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
611
612         return 0;
613 }
614
615 int journal_file_find_data_object_with_hash(
616                 JournalFile *f,
617                 const void *data, uint64_t size, uint64_t hash,
618                 Object **ret, uint64_t *offset) {
619
620         uint64_t p, osize, h;
621         int r;
622
623         assert(f);
624         assert(data || size == 0);
625
626         osize = offsetof(Object, data.payload) + size;
627
628         if (f->header->data_hash_table_size == 0)
629                 return -EBADMSG;
630
631         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
632         p = le64toh(f->data_hash_table[h].head_hash_offset);
633
634         while (p > 0) {
635                 Object *o;
636
637                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
638                 if (r < 0)
639                         return r;
640
641                 if (le64toh(o->data.hash) != hash)
642                         goto next;
643
644                 if (o->object.flags & OBJECT_COMPRESSED) {
645 #ifdef HAVE_XZ
646                         uint64_t l, rsize;
647
648                         l = le64toh(o->object.size);
649                         if (l <= offsetof(Object, data.payload))
650                                 return -EBADMSG;
651
652                         l -= offsetof(Object, data.payload);
653
654                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
655                                 return -EBADMSG;
656
657                         if (rsize == size &&
658                             memcmp(f->compress_buffer, data, size) == 0) {
659
660                                 if (ret)
661                                         *ret = o;
662
663                                 if (offset)
664                                         *offset = p;
665
666                                 return 1;
667                         }
668 #else
669                         return -EPROTONOSUPPORT;
670 #endif
671
672                 } else if (le64toh(o->object.size) == osize &&
673                            memcmp(o->data.payload, data, size) == 0) {
674
675                         if (ret)
676                                 *ret = o;
677
678                         if (offset)
679                                 *offset = p;
680
681                         return 1;
682                 }
683
684         next:
685                 p = le64toh(o->data.next_hash_offset);
686         }
687
688         return 0;
689 }
690
691 int journal_file_find_data_object(
692                 JournalFile *f,
693                 const void *data, uint64_t size,
694                 Object **ret, uint64_t *offset) {
695
696         uint64_t hash;
697
698         assert(f);
699         assert(data || size == 0);
700
701         hash = hash64(data, size);
702
703         return journal_file_find_data_object_with_hash(f,
704                                                        data, size, hash,
705                                                        ret, offset);
706 }
707
708 static int journal_file_append_data(
709                 JournalFile *f,
710                 const void *data, uint64_t size,
711                 Object **ret, uint64_t *offset) {
712
713         uint64_t hash, p;
714         uint64_t osize;
715         Object *o;
716         int r;
717         bool compressed = false;
718
719         assert(f);
720         assert(data || size == 0);
721
722         hash = hash64(data, size);
723
724         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
725         if (r < 0)
726                 return r;
727         else if (r > 0) {
728
729                 if (ret)
730                         *ret = o;
731
732                 if (offset)
733                         *offset = p;
734
735                 return 0;
736         }
737
738         osize = offsetof(Object, data.payload) + size;
739         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
740         if (r < 0)
741                 return r;
742
743         o->data.hash = htole64(hash);
744
745 #ifdef HAVE_XZ
746         if (f->compress &&
747             size >= COMPRESSION_SIZE_THRESHOLD) {
748                 uint64_t rsize;
749
750                 compressed = compress_blob(data, size, o->data.payload, &rsize);
751
752                 if (compressed) {
753                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
754                         o->object.flags |= OBJECT_COMPRESSED;
755
756                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
757                 }
758         }
759 #endif
760
761         if (!compressed && size > 0)
762                 memcpy(o->data.payload, data, size);
763
764         r = journal_file_link_data(f, o, p, hash);
765         if (r < 0)
766                 return r;
767
768         r = journal_file_hmac_put_object(f, OBJECT_DATA, p);
769         if (r < 0)
770                 return r;
771
772         /* The linking might have altered the window, so let's
773          * refresh our pointer */
774         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
775         if (r < 0)
776                 return r;
777
778         if (ret)
779                 *ret = o;
780
781         if (offset)
782                 *offset = p;
783
784         return 0;
785 }
786
787 uint64_t journal_file_entry_n_items(Object *o) {
788         assert(o);
789         assert(o->object.type == OBJECT_ENTRY);
790
791         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
792 }
793
794 uint64_t journal_file_entry_array_n_items(Object *o) {
795         assert(o);
796         assert(o->object.type == OBJECT_ENTRY_ARRAY);
797
798         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
799 }
800
801 uint64_t journal_file_hash_table_n_items(Object *o) {
802         assert(o);
803         assert(o->object.type == OBJECT_DATA_HASH_TABLE ||
804                o->object.type == OBJECT_FIELD_HASH_TABLE);
805
806         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
807 }
808
809 static int link_entry_into_array(JournalFile *f,
810                                  le64_t *first,
811                                  le64_t *idx,
812                                  uint64_t p) {
813         int r;
814         uint64_t n = 0, ap = 0, q, i, a, hidx;
815         Object *o;
816
817         assert(f);
818         assert(first);
819         assert(idx);
820         assert(p > 0);
821
822         a = le64toh(*first);
823         i = hidx = le64toh(*idx);
824         while (a > 0) {
825
826                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
827                 if (r < 0)
828                         return r;
829
830                 n = journal_file_entry_array_n_items(o);
831                 if (i < n) {
832                         o->entry_array.items[i] = htole64(p);
833                         *idx = htole64(hidx + 1);
834                         return 0;
835                 }
836
837                 i -= n;
838                 ap = a;
839                 a = le64toh(o->entry_array.next_entry_array_offset);
840         }
841
842         if (hidx > n)
843                 n = (hidx+1) * 2;
844         else
845                 n = n * 2;
846
847         if (n < 4)
848                 n = 4;
849
850         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
851                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
852                                        &o, &q);
853         if (r < 0)
854                 return r;
855
856         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, q);
857         if (r < 0)
858                 return r;
859
860         o->entry_array.items[i] = htole64(p);
861
862         if (ap == 0)
863                 *first = htole64(q);
864         else {
865                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
866                 if (r < 0)
867                         return r;
868
869                 o->entry_array.next_entry_array_offset = htole64(q);
870         }
871
872         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
873                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
874
875         *idx = htole64(hidx + 1);
876
877         return 0;
878 }
879
880 static int link_entry_into_array_plus_one(JournalFile *f,
881                                           le64_t *extra,
882                                           le64_t *first,
883                                           le64_t *idx,
884                                           uint64_t p) {
885
886         int r;
887
888         assert(f);
889         assert(extra);
890         assert(first);
891         assert(idx);
892         assert(p > 0);
893
894         if (*idx == 0)
895                 *extra = htole64(p);
896         else {
897                 le64_t i;
898
899                 i = htole64(le64toh(*idx) - 1);
900                 r = link_entry_into_array(f, first, &i, p);
901                 if (r < 0)
902                         return r;
903         }
904
905         *idx = htole64(le64toh(*idx) + 1);
906         return 0;
907 }
908
909 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
910         uint64_t p;
911         int r;
912         assert(f);
913         assert(o);
914         assert(offset > 0);
915
916         p = le64toh(o->entry.items[i].object_offset);
917         if (p == 0)
918                 return -EINVAL;
919
920         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
921         if (r < 0)
922                 return r;
923
924         return link_entry_into_array_plus_one(f,
925                                               &o->data.entry_offset,
926                                               &o->data.entry_array_offset,
927                                               &o->data.n_entries,
928                                               offset);
929 }
930
931 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
932         uint64_t n, i;
933         int r;
934
935         assert(f);
936         assert(o);
937         assert(offset > 0);
938         assert(o->object.type == OBJECT_ENTRY);
939
940         __sync_synchronize();
941
942         /* Link up the entry itself */
943         r = link_entry_into_array(f,
944                                   &f->header->entry_array_offset,
945                                   &f->header->n_entries,
946                                   offset);
947         if (r < 0)
948                 return r;
949
950         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
951
952         if (f->header->head_entry_realtime == 0)
953                 f->header->head_entry_realtime = o->entry.realtime;
954
955         f->header->tail_entry_realtime = o->entry.realtime;
956         f->header->tail_entry_monotonic = o->entry.monotonic;
957
958         f->tail_entry_monotonic_valid = true;
959
960         /* Link up the items */
961         n = journal_file_entry_n_items(o);
962         for (i = 0; i < n; i++) {
963                 r = journal_file_link_entry_item(f, o, offset, i);
964                 if (r < 0)
965                         return r;
966         }
967
968         return 0;
969 }
970
971 static int journal_file_append_entry_internal(
972                 JournalFile *f,
973                 const dual_timestamp *ts,
974                 uint64_t xor_hash,
975                 const EntryItem items[], unsigned n_items,
976                 uint64_t *seqnum,
977                 Object **ret, uint64_t *offset) {
978         uint64_t np;
979         uint64_t osize;
980         Object *o;
981         int r;
982
983         assert(f);
984         assert(items || n_items == 0);
985         assert(ts);
986
987         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
988
989         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
990         if (r < 0)
991                 return r;
992
993         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
994         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
995         o->entry.realtime = htole64(ts->realtime);
996         o->entry.monotonic = htole64(ts->monotonic);
997         o->entry.xor_hash = htole64(xor_hash);
998         o->entry.boot_id = f->header->boot_id;
999
1000         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, np);
1001         if (r < 0)
1002                 return r;
1003
1004         r = journal_file_link_entry(f, o, np);
1005         if (r < 0)
1006                 return r;
1007
1008         if (ret)
1009                 *ret = o;
1010
1011         if (offset)
1012                 *offset = np;
1013
1014         return 0;
1015 }
1016
1017 void journal_file_post_change(JournalFile *f) {
1018         assert(f);
1019
1020         /* inotify() does not receive IN_MODIFY events from file
1021          * accesses done via mmap(). After each access we hence
1022          * trigger IN_MODIFY by truncating the journal file to its
1023          * current size which triggers IN_MODIFY. */
1024
1025         __sync_synchronize();
1026
1027         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1028                 log_error("Failed to to truncate file to its own size: %m");
1029 }
1030
1031 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1032         unsigned i;
1033         EntryItem *items;
1034         int r;
1035         uint64_t xor_hash = 0;
1036         struct dual_timestamp _ts;
1037
1038         assert(f);
1039         assert(iovec || n_iovec == 0);
1040
1041         if (!f->writable)
1042                 return -EPERM;
1043
1044         if (!ts) {
1045                 dual_timestamp_get(&_ts);
1046                 ts = &_ts;
1047         }
1048
1049         if (f->tail_entry_monotonic_valid &&
1050             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1051                 return -EINVAL;
1052
1053         r = journal_file_maybe_append_tag(f, ts->realtime);
1054         if (r < 0)
1055                 return r;
1056
1057         /* alloca() can't take 0, hence let's allocate at least one */
1058         items = alloca(sizeof(EntryItem) * MAX(1, n_iovec));
1059
1060         for (i = 0; i < n_iovec; i++) {
1061                 uint64_t p;
1062                 Object *o;
1063
1064                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1065                 if (r < 0)
1066                         return r;
1067
1068                 xor_hash ^= le64toh(o->data.hash);
1069                 items[i].object_offset = htole64(p);
1070                 items[i].hash = o->data.hash;
1071         }
1072
1073         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1074
1075         journal_file_post_change(f);
1076
1077         return r;
1078 }
1079
1080 static int generic_array_get(JournalFile *f,
1081                              uint64_t first,
1082                              uint64_t i,
1083                              Object **ret, uint64_t *offset) {
1084
1085         Object *o;
1086         uint64_t p = 0, a;
1087         int r;
1088
1089         assert(f);
1090
1091         a = first;
1092         while (a > 0) {
1093                 uint64_t n;
1094
1095                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1096                 if (r < 0)
1097                         return r;
1098
1099                 n = journal_file_entry_array_n_items(o);
1100                 if (i < n) {
1101                         p = le64toh(o->entry_array.items[i]);
1102                         break;
1103                 }
1104
1105                 i -= n;
1106                 a = le64toh(o->entry_array.next_entry_array_offset);
1107         }
1108
1109         if (a <= 0 || p <= 0)
1110                 return 0;
1111
1112         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1113         if (r < 0)
1114                 return r;
1115
1116         if (ret)
1117                 *ret = o;
1118
1119         if (offset)
1120                 *offset = p;
1121
1122         return 1;
1123 }
1124
1125 static int generic_array_get_plus_one(JournalFile *f,
1126                                       uint64_t extra,
1127                                       uint64_t first,
1128                                       uint64_t i,
1129                                       Object **ret, uint64_t *offset) {
1130
1131         Object *o;
1132
1133         assert(f);
1134
1135         if (i == 0) {
1136                 int r;
1137
1138                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1139                 if (r < 0)
1140                         return r;
1141
1142                 if (ret)
1143                         *ret = o;
1144
1145                 if (offset)
1146                         *offset = extra;
1147
1148                 return 1;
1149         }
1150
1151         return generic_array_get(f, first, i-1, ret, offset);
1152 }
1153
/* Results of the test_object callbacks used for bisection: how the
 * probed object compares to the needle. */
enum {
        TEST_FOUND = 0,         /* probed object equals the needle */
        TEST_LEFT  = 1,         /* probed object is smaller than the needle */
        TEST_RIGHT = 2          /* probed object is larger than the needle */
};
1159
1160 static int generic_array_bisect(JournalFile *f,
1161                                 uint64_t first,
1162                                 uint64_t n,
1163                                 uint64_t needle,
1164                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1165                                 direction_t direction,
1166                                 Object **ret,
1167                                 uint64_t *offset,
1168                                 uint64_t *idx) {
1169
1170         uint64_t a, p, t = 0, i = 0, last_p = 0;
1171         bool subtract_one = false;
1172         Object *o, *array = NULL;
1173         int r;
1174
1175         assert(f);
1176         assert(test_object);
1177
1178         a = first;
1179         while (a > 0) {
1180                 uint64_t left, right, k, lp;
1181
1182                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1183                 if (r < 0)
1184                         return r;
1185
1186                 k = journal_file_entry_array_n_items(array);
1187                 right = MIN(k, n);
1188                 if (right <= 0)
1189                         return 0;
1190
1191                 i = right - 1;
1192                 lp = p = le64toh(array->entry_array.items[i]);
1193                 if (p <= 0)
1194                         return -EBADMSG;
1195
1196                 r = test_object(f, p, needle);
1197                 if (r < 0)
1198                         return r;
1199
1200                 if (r == TEST_FOUND)
1201                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1202
1203                 if (r == TEST_RIGHT) {
1204                         left = 0;
1205                         right -= 1;
1206                         for (;;) {
1207                                 if (left == right) {
1208                                         if (direction == DIRECTION_UP)
1209                                                 subtract_one = true;
1210
1211                                         i = left;
1212                                         goto found;
1213                                 }
1214
1215                                 assert(left < right);
1216
1217                                 i = (left + right) / 2;
1218                                 p = le64toh(array->entry_array.items[i]);
1219                                 if (p <= 0)
1220                                         return -EBADMSG;
1221
1222                                 r = test_object(f, p, needle);
1223                                 if (r < 0)
1224                                         return r;
1225
1226                                 if (r == TEST_FOUND)
1227                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1228
1229                                 if (r == TEST_RIGHT)
1230                                         right = i;
1231                                 else
1232                                         left = i + 1;
1233                         }
1234                 }
1235
1236                 if (k > n) {
1237                         if (direction == DIRECTION_UP) {
1238                                 i = n;
1239                                 subtract_one = true;
1240                                 goto found;
1241                         }
1242
1243                         return 0;
1244                 }
1245
1246                 last_p = lp;
1247
1248                 n -= k;
1249                 t += k;
1250                 a = le64toh(array->entry_array.next_entry_array_offset);
1251         }
1252
1253         return 0;
1254
1255 found:
1256         if (subtract_one && t == 0 && i == 0)
1257                 return 0;
1258
1259         if (subtract_one && i == 0)
1260                 p = last_p;
1261         else if (subtract_one)
1262                 p = le64toh(array->entry_array.items[i-1]);
1263         else
1264                 p = le64toh(array->entry_array.items[i]);
1265
1266         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1267         if (r < 0)
1268                 return r;
1269
1270         if (ret)
1271                 *ret = o;
1272
1273         if (offset)
1274                 *offset = p;
1275
1276         if (idx)
1277                 *idx = t + i + (subtract_one ? -1 : 0);
1278
1279         return 1;
1280 }
1281
1282 static int generic_array_bisect_plus_one(JournalFile *f,
1283                                          uint64_t extra,
1284                                          uint64_t first,
1285                                          uint64_t n,
1286                                          uint64_t needle,
1287                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1288                                          direction_t direction,
1289                                          Object **ret,
1290                                          uint64_t *offset,
1291                                          uint64_t *idx) {
1292
1293         int r;
1294         bool step_back = false;
1295         Object *o;
1296
1297         assert(f);
1298         assert(test_object);
1299
1300         if (n <= 0)
1301                 return 0;
1302
1303         /* This bisects the array in object 'first', but first checks
1304          * an extra  */
1305         r = test_object(f, extra, needle);
1306         if (r < 0)
1307                 return r;
1308
1309         if (r == TEST_FOUND)
1310                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1311
1312         /* if we are looking with DIRECTION_UP then we need to first
1313            see if in the actual array there is a matching entry, and
1314            return the last one of that. But if there isn't any we need
1315            to return this one. Hence remember this, and return it
1316            below. */
1317         if (r == TEST_LEFT)
1318                 step_back = direction == DIRECTION_UP;
1319
1320         if (r == TEST_RIGHT) {
1321                 if (direction == DIRECTION_DOWN)
1322                         goto found;
1323                 else
1324                         return 0;
1325         }
1326
1327         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1328
1329         if (r == 0 && step_back)
1330                 goto found;
1331
1332         if (r > 0 && idx)
1333                 (*idx) ++;
1334
1335         return r;
1336
1337 found:
1338         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1339         if (r < 0)
1340                 return r;
1341
1342         if (ret)
1343                 *ret = o;
1344
1345         if (offset)
1346                 *offset = extra;
1347
1348         if (idx)
1349                 *idx = 0;
1350
1351         return 1;
1352 }
1353
1354 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1355         assert(f);
1356         assert(p > 0);
1357
1358         if (p == needle)
1359                 return TEST_FOUND;
1360         else if (p < needle)
1361                 return TEST_LEFT;
1362         else
1363                 return TEST_RIGHT;
1364 }
1365
1366 int journal_file_move_to_entry_by_offset(
1367                 JournalFile *f,
1368                 uint64_t p,
1369                 direction_t direction,
1370                 Object **ret,
1371                 uint64_t *offset) {
1372
1373         return generic_array_bisect(f,
1374                                     le64toh(f->header->entry_array_offset),
1375                                     le64toh(f->header->n_entries),
1376                                     p,
1377                                     test_object_offset,
1378                                     direction,
1379                                     ret, offset, NULL);
1380 }
1381
1382
1383 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1384         Object *o;
1385         int r;
1386
1387         assert(f);
1388         assert(p > 0);
1389
1390         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1391         if (r < 0)
1392                 return r;
1393
1394         if (le64toh(o->entry.seqnum) == needle)
1395                 return TEST_FOUND;
1396         else if (le64toh(o->entry.seqnum) < needle)
1397                 return TEST_LEFT;
1398         else
1399                 return TEST_RIGHT;
1400 }
1401
1402 int journal_file_move_to_entry_by_seqnum(
1403                 JournalFile *f,
1404                 uint64_t seqnum,
1405                 direction_t direction,
1406                 Object **ret,
1407                 uint64_t *offset) {
1408
1409         return generic_array_bisect(f,
1410                                     le64toh(f->header->entry_array_offset),
1411                                     le64toh(f->header->n_entries),
1412                                     seqnum,
1413                                     test_object_seqnum,
1414                                     direction,
1415                                     ret, offset, NULL);
1416 }
1417
1418 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1419         Object *o;
1420         int r;
1421
1422         assert(f);
1423         assert(p > 0);
1424
1425         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1426         if (r < 0)
1427                 return r;
1428
1429         if (le64toh(o->entry.realtime) == needle)
1430                 return TEST_FOUND;
1431         else if (le64toh(o->entry.realtime) < needle)
1432                 return TEST_LEFT;
1433         else
1434                 return TEST_RIGHT;
1435 }
1436
1437 int journal_file_move_to_entry_by_realtime(
1438                 JournalFile *f,
1439                 uint64_t realtime,
1440                 direction_t direction,
1441                 Object **ret,
1442                 uint64_t *offset) {
1443
1444         return generic_array_bisect(f,
1445                                     le64toh(f->header->entry_array_offset),
1446                                     le64toh(f->header->n_entries),
1447                                     realtime,
1448                                     test_object_realtime,
1449                                     direction,
1450                                     ret, offset, NULL);
1451 }
1452
1453 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1454         Object *o;
1455         int r;
1456
1457         assert(f);
1458         assert(p > 0);
1459
1460         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1461         if (r < 0)
1462                 return r;
1463
1464         if (le64toh(o->entry.monotonic) == needle)
1465                 return TEST_FOUND;
1466         else if (le64toh(o->entry.monotonic) < needle)
1467                 return TEST_LEFT;
1468         else
1469                 return TEST_RIGHT;
1470 }
1471
1472 int journal_file_move_to_entry_by_monotonic(
1473                 JournalFile *f,
1474                 sd_id128_t boot_id,
1475                 uint64_t monotonic,
1476                 direction_t direction,
1477                 Object **ret,
1478                 uint64_t *offset) {
1479
1480         char t[9+32+1] = "_BOOT_ID=";
1481         Object *o;
1482         int r;
1483
1484         assert(f);
1485
1486         sd_id128_to_string(boot_id, t + 9);
1487         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1488         if (r < 0)
1489                 return r;
1490         if (r == 0)
1491                 return -ENOENT;
1492
1493         return generic_array_bisect_plus_one(f,
1494                                              le64toh(o->data.entry_offset),
1495                                              le64toh(o->data.entry_array_offset),
1496                                              le64toh(o->data.n_entries),
1497                                              monotonic,
1498                                              test_object_monotonic,
1499                                              direction,
1500                                              ret, offset, NULL);
1501 }
1502
1503 int journal_file_next_entry(
1504                 JournalFile *f,
1505                 Object *o, uint64_t p,
1506                 direction_t direction,
1507                 Object **ret, uint64_t *offset) {
1508
1509         uint64_t i, n;
1510         int r;
1511
1512         assert(f);
1513         assert(p > 0 || !o);
1514
1515         n = le64toh(f->header->n_entries);
1516         if (n <= 0)
1517                 return 0;
1518
1519         if (!o)
1520                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1521         else {
1522                 if (o->object.type != OBJECT_ENTRY)
1523                         return -EINVAL;
1524
1525                 r = generic_array_bisect(f,
1526                                          le64toh(f->header->entry_array_offset),
1527                                          le64toh(f->header->n_entries),
1528                                          p,
1529                                          test_object_offset,
1530                                          DIRECTION_DOWN,
1531                                          NULL, NULL,
1532                                          &i);
1533                 if (r <= 0)
1534                         return r;
1535
1536                 if (direction == DIRECTION_DOWN) {
1537                         if (i >= n - 1)
1538                                 return 0;
1539
1540                         i++;
1541                 } else {
1542                         if (i <= 0)
1543                                 return 0;
1544
1545                         i--;
1546                 }
1547         }
1548
1549         /* And jump to it */
1550         return generic_array_get(f,
1551                                  le64toh(f->header->entry_array_offset),
1552                                  i,
1553                                  ret, offset);
1554 }
1555
1556 int journal_file_skip_entry(
1557                 JournalFile *f,
1558                 Object *o, uint64_t p,
1559                 int64_t skip,
1560                 Object **ret, uint64_t *offset) {
1561
1562         uint64_t i, n;
1563         int r;
1564
1565         assert(f);
1566         assert(o);
1567         assert(p > 0);
1568
1569         if (o->object.type != OBJECT_ENTRY)
1570                 return -EINVAL;
1571
1572         r = generic_array_bisect(f,
1573                                  le64toh(f->header->entry_array_offset),
1574                                  le64toh(f->header->n_entries),
1575                                  p,
1576                                  test_object_offset,
1577                                  DIRECTION_DOWN,
1578                                  NULL, NULL,
1579                                  &i);
1580         if (r <= 0)
1581                 return r;
1582
1583         /* Calculate new index */
1584         if (skip < 0) {
1585                 if ((uint64_t) -skip >= i)
1586                         i = 0;
1587                 else
1588                         i = i - (uint64_t) -skip;
1589         } else
1590                 i  += (uint64_t) skip;
1591
1592         n = le64toh(f->header->n_entries);
1593         if (n <= 0)
1594                 return -EBADMSG;
1595
1596         if (i >= n)
1597                 i = n-1;
1598
1599         return generic_array_get(f,
1600                                  le64toh(f->header->entry_array_offset),
1601                                  i,
1602                                  ret, offset);
1603 }
1604
1605 int journal_file_next_entry_for_data(
1606                 JournalFile *f,
1607                 Object *o, uint64_t p,
1608                 uint64_t data_offset,
1609                 direction_t direction,
1610                 Object **ret, uint64_t *offset) {
1611
1612         uint64_t n, i;
1613         int r;
1614         Object *d;
1615
1616         assert(f);
1617         assert(p > 0 || !o);
1618
1619         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1620         if (r < 0)
1621                 return r;
1622
1623         n = le64toh(d->data.n_entries);
1624         if (n <= 0)
1625                 return n;
1626
1627         if (!o)
1628                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1629         else {
1630                 if (o->object.type != OBJECT_ENTRY)
1631                         return -EINVAL;
1632
1633                 r = generic_array_bisect_plus_one(f,
1634                                                   le64toh(d->data.entry_offset),
1635                                                   le64toh(d->data.entry_array_offset),
1636                                                   le64toh(d->data.n_entries),
1637                                                   p,
1638                                                   test_object_offset,
1639                                                   DIRECTION_DOWN,
1640                                                   NULL, NULL,
1641                                                   &i);
1642
1643                 if (r <= 0)
1644                         return r;
1645
1646                 if (direction == DIRECTION_DOWN) {
1647                         if (i >= n - 1)
1648                                 return 0;
1649
1650                         i++;
1651                 } else {
1652                         if (i <= 0)
1653                                 return 0;
1654
1655                         i--;
1656                 }
1657
1658         }
1659
1660         return generic_array_get_plus_one(f,
1661                                           le64toh(d->data.entry_offset),
1662                                           le64toh(d->data.entry_array_offset),
1663                                           i,
1664                                           ret, offset);
1665 }
1666
1667 int journal_file_move_to_entry_by_offset_for_data(
1668                 JournalFile *f,
1669                 uint64_t data_offset,
1670                 uint64_t p,
1671                 direction_t direction,
1672                 Object **ret, uint64_t *offset) {
1673
1674         int r;
1675         Object *d;
1676
1677         assert(f);
1678
1679         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1680         if (r < 0)
1681                 return r;
1682
1683         return generic_array_bisect_plus_one(f,
1684                                              le64toh(d->data.entry_offset),
1685                                              le64toh(d->data.entry_array_offset),
1686                                              le64toh(d->data.n_entries),
1687                                              p,
1688                                              test_object_offset,
1689                                              direction,
1690                                              ret, offset, NULL);
1691 }
1692
1693 int journal_file_move_to_entry_by_monotonic_for_data(
1694                 JournalFile *f,
1695                 uint64_t data_offset,
1696                 sd_id128_t boot_id,
1697                 uint64_t monotonic,
1698                 direction_t direction,
1699                 Object **ret, uint64_t *offset) {
1700
1701         char t[9+32+1] = "_BOOT_ID=";
1702         Object *o, *d;
1703         int r;
1704         uint64_t b, z;
1705
1706         assert(f);
1707
1708         /* First, seek by time */
1709         sd_id128_to_string(boot_id, t + 9);
1710         r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
1711         if (r < 0)
1712                 return r;
1713         if (r == 0)
1714                 return -ENOENT;
1715
1716         r = generic_array_bisect_plus_one(f,
1717                                           le64toh(o->data.entry_offset),
1718                                           le64toh(o->data.entry_array_offset),
1719                                           le64toh(o->data.n_entries),
1720                                           monotonic,
1721                                           test_object_monotonic,
1722                                           direction,
1723                                           NULL, &z, NULL);
1724         if (r <= 0)
1725                 return r;
1726
1727         /* And now, continue seeking until we find an entry that
1728          * exists in both bisection arrays */
1729
1730         for (;;) {
1731                 Object *qo;
1732                 uint64_t p, q;
1733
1734                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1735                 if (r < 0)
1736                         return r;
1737
1738                 r = generic_array_bisect_plus_one(f,
1739                                                   le64toh(d->data.entry_offset),
1740                                                   le64toh(d->data.entry_array_offset),
1741                                                   le64toh(d->data.n_entries),
1742                                                   z,
1743                                                   test_object_offset,
1744                                                   direction,
1745                                                   NULL, &p, NULL);
1746                 if (r <= 0)
1747                         return r;
1748
1749                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
1750                 if (r < 0)
1751                         return r;
1752
1753                 r = generic_array_bisect_plus_one(f,
1754                                                   le64toh(o->data.entry_offset),
1755                                                   le64toh(o->data.entry_array_offset),
1756                                                   le64toh(o->data.n_entries),
1757                                                   p,
1758                                                   test_object_offset,
1759                                                   direction,
1760                                                   &qo, &q, NULL);
1761
1762                 if (r <= 0)
1763                         return r;
1764
1765                 if (p == q) {
1766                         if (ret)
1767                                 *ret = qo;
1768                         if (offset)
1769                                 *offset = q;
1770
1771                         return 1;
1772                 }
1773
1774                 z = q;
1775         }
1776
1777         return 0;
1778 }
1779
1780 int journal_file_move_to_entry_by_seqnum_for_data(
1781                 JournalFile *f,
1782                 uint64_t data_offset,
1783                 uint64_t seqnum,
1784                 direction_t direction,
1785                 Object **ret, uint64_t *offset) {
1786
1787         Object *d;
1788         int r;
1789
1790         assert(f);
1791
1792         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1793         if (r < 0)
1794                 return r;
1795
1796         return generic_array_bisect_plus_one(f,
1797                                              le64toh(d->data.entry_offset),
1798                                              le64toh(d->data.entry_array_offset),
1799                                              le64toh(d->data.n_entries),
1800                                              seqnum,
1801                                              test_object_seqnum,
1802                                              direction,
1803                                              ret, offset, NULL);
1804 }
1805
1806 int journal_file_move_to_entry_by_realtime_for_data(
1807                 JournalFile *f,
1808                 uint64_t data_offset,
1809                 uint64_t realtime,
1810                 direction_t direction,
1811                 Object **ret, uint64_t *offset) {
1812
1813         Object *d;
1814         int r;
1815
1816         assert(f);
1817
1818         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1819         if (r < 0)
1820                 return r;
1821
1822         return generic_array_bisect_plus_one(f,
1823                                              le64toh(d->data.entry_offset),
1824                                              le64toh(d->data.entry_array_offset),
1825                                              le64toh(d->data.n_entries),
1826                                              realtime,
1827                                              test_object_realtime,
1828                                              direction,
1829                                              ret, offset, NULL);
1830 }
1831
1832 void journal_file_dump(JournalFile *f) {
1833         Object *o;
1834         int r;
1835         uint64_t p;
1836
1837         assert(f);
1838
1839         journal_file_print_header(f);
1840
1841         p = le64toh(f->header->header_size);
1842         while (p != 0) {
1843                 r = journal_file_move_to_object(f, -1, p, &o);
1844                 if (r < 0)
1845                         goto fail;
1846
1847                 switch (o->object.type) {
1848
1849                 case OBJECT_UNUSED:
1850                         printf("Type: OBJECT_UNUSED\n");
1851                         break;
1852
1853                 case OBJECT_DATA:
1854                         printf("Type: OBJECT_DATA\n");
1855                         break;
1856
1857                 case OBJECT_ENTRY:
1858                         printf("Type: OBJECT_ENTRY seqnum=%llu monotonic=%llu realtime=%llu\n",
1859                                (unsigned long long) le64toh(o->entry.seqnum),
1860                                (unsigned long long) le64toh(o->entry.monotonic),
1861                                (unsigned long long) le64toh(o->entry.realtime));
1862                         break;
1863
1864                 case OBJECT_FIELD_HASH_TABLE:
1865                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1866                         break;
1867
1868                 case OBJECT_DATA_HASH_TABLE:
1869                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
1870                         break;
1871
1872                 case OBJECT_ENTRY_ARRAY:
1873                         printf("Type: OBJECT_ENTRY_ARRAY\n");
1874                         break;
1875
1876                 case OBJECT_TAG:
1877                         printf("Type: OBJECT_TAG seqnum=%llu epoch=%llu\n",
1878                                (unsigned long long) le64toh(o->tag.seqnum),
1879                                (unsigned long long) le64toh(o->tag.epoch));
1880                         break;
1881                 }
1882
1883                 if (o->object.flags & OBJECT_COMPRESSED)
1884                         printf("Flags: COMPRESSED\n");
1885
1886                 if (p == le64toh(f->header->tail_object_offset))
1887                         p = 0;
1888                 else
1889                         p = p + ALIGN64(le64toh(o->object.size));
1890         }
1891
1892         return;
1893 fail:
1894         log_error("File corrupt");
1895 }
1896
1897 void journal_file_print_header(JournalFile *f) {
1898         char a[33], b[33], c[33];
1899         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
1900
1901         assert(f);
1902
1903         printf("File Path: %s\n"
1904                "File ID: %s\n"
1905                "Machine ID: %s\n"
1906                "Boot ID: %s\n"
1907                "Sequential Number ID: %s\n"
1908                "State: %s\n"
1909                "Compatible Flags:%s%s\n"
1910                "Incompatible Flags:%s%s\n"
1911                "Header size: %llu\n"
1912                "Arena size: %llu\n"
1913                "Data Hash Table Size: %llu\n"
1914                "Field Hash Table Size: %llu\n"
1915                "Rotate Suggested: %s\n"
1916                "Head Sequential Number: %llu\n"
1917                "Tail Sequential Number: %llu\n"
1918                "Head Realtime Timestamp: %s\n"
1919                "Tail Realtime Timestamp: %s\n"
1920                "Objects: %llu\n"
1921                "Entry Objects: %llu\n",
1922                f->path,
1923                sd_id128_to_string(f->header->file_id, a),
1924                sd_id128_to_string(f->header->machine_id, b),
1925                sd_id128_to_string(f->header->boot_id, c),
1926                sd_id128_to_string(f->header->seqnum_id, c),
1927                f->header->state == STATE_OFFLINE ? "OFFLINE" :
1928                f->header->state == STATE_ONLINE ? "ONLINE" :
1929                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
1930                (f->header->compatible_flags & HEADER_COMPATIBLE_SEALED) ? " SEALED" : "",
1931                (f->header->compatible_flags & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
1932                (f->header->incompatible_flags & HEADER_INCOMPATIBLE_COMPRESSED) ? " COMPRESSED" : "",
1933                (f->header->incompatible_flags & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
1934                (unsigned long long) le64toh(f->header->header_size),
1935                (unsigned long long) le64toh(f->header->arena_size),
1936                (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
1937                (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
1938                yes_no(journal_file_rotate_suggested(f)),
1939                (unsigned long long) le64toh(f->header->head_entry_seqnum),
1940                (unsigned long long) le64toh(f->header->tail_entry_seqnum),
1941                format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
1942                format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
1943                (unsigned long long) le64toh(f->header->n_objects),
1944                (unsigned long long) le64toh(f->header->n_entries));
1945
1946         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1947                 printf("Data Objects: %llu\n"
1948                        "Data Hash Table Fill: %.1f%%\n",
1949                        (unsigned long long) le64toh(f->header->n_data),
1950                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
1951
1952         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1953                 printf("Field Objects: %llu\n"
1954                        "Field Hash Table Fill: %.1f%%\n",
1955                        (unsigned long long) le64toh(f->header->n_fields),
1956                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
1957
1958         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
1959                 printf("Tag Objects: %llu\n",
1960                        (unsigned long long) le64toh(f->header->n_tags));
1961         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1962                 printf("Entry Array Objects: %llu\n",
1963                        (unsigned long long) le64toh(f->header->n_entry_arrays));
1964 }
1965
1966 int journal_file_open(
1967                 const char *fname,
1968                 int flags,
1969                 mode_t mode,
1970                 bool compress,
1971                 bool seal,
1972                 JournalMetrics *metrics,
1973                 MMapCache *mmap_cache,
1974                 JournalFile *template,
1975                 JournalFile **ret) {
1976
1977         JournalFile *f;
1978         int r;
1979         bool newly_created = false;
1980
1981         assert(fname);
1982
1983         if ((flags & O_ACCMODE) != O_RDONLY &&
1984             (flags & O_ACCMODE) != O_RDWR)
1985                 return -EINVAL;
1986
1987         if (!endswith(fname, ".journal") &&
1988             !endswith(fname, ".journal~"))
1989                 return -EINVAL;
1990
1991         f = new0(JournalFile, 1);
1992         if (!f)
1993                 return -ENOMEM;
1994
1995         f->fd = -1;
1996         f->mode = mode;
1997
1998         f->flags = flags;
1999         f->prot = prot_from_flags(flags);
2000         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2001         f->compress = compress;
2002         f->seal = seal;
2003
2004         if (mmap_cache)
2005                 f->mmap = mmap_cache_ref(mmap_cache);
2006         else {
2007                 f->mmap = mmap_cache_new();
2008                 if (!f->mmap) {
2009                         r = -ENOMEM;
2010                         goto fail;
2011                 }
2012         }
2013
2014         f->path = strdup(fname);
2015         if (!f->path) {
2016                 r = -ENOMEM;
2017                 goto fail;
2018         }
2019
2020         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2021         if (f->fd < 0) {
2022                 r = -errno;
2023                 goto fail;
2024         }
2025
2026         if (fstat(f->fd, &f->last_stat) < 0) {
2027                 r = -errno;
2028                 goto fail;
2029         }
2030
2031         if (f->last_stat.st_size == 0 && f->writable) {
2032                 newly_created = true;
2033
2034                 /* Try to load the FSPRG state, and if we can't, then
2035                  * just don't do sealing */
2036                 r = journal_file_fss_load(f);
2037                 if (r < 0)
2038                         f->seal = false;
2039
2040                 r = journal_file_init_header(f, template);
2041                 if (r < 0)
2042                         goto fail;
2043
2044                 if (fstat(f->fd, &f->last_stat) < 0) {
2045                         r = -errno;
2046                         goto fail;
2047                 }
2048         }
2049
2050         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2051                 r = -EIO;
2052                 goto fail;
2053         }
2054
2055         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2056         if (f->header == MAP_FAILED) {
2057                 f->header = NULL;
2058                 r = -errno;
2059                 goto fail;
2060         }
2061
2062         if (!newly_created) {
2063                 r = journal_file_verify_header(f);
2064                 if (r < 0)
2065                         goto fail;
2066         }
2067
2068         if (!newly_created && f->writable) {
2069                 r = journal_file_fss_load(f);
2070                 if (r < 0)
2071                         goto fail;
2072         }
2073
2074         if (f->writable) {
2075                 if (metrics) {
2076                         journal_default_metrics(metrics, f->fd);
2077                         f->metrics = *metrics;
2078                 } else if (template)
2079                         f->metrics = template->metrics;
2080
2081                 r = journal_file_refresh_header(f);
2082                 if (r < 0)
2083                         goto fail;
2084         }
2085
2086         r = journal_file_hmac_setup(f);
2087         if (r < 0)
2088                 goto fail;
2089
2090         if (newly_created) {
2091                 r = journal_file_setup_field_hash_table(f);
2092                 if (r < 0)
2093                         goto fail;
2094
2095                 r = journal_file_setup_data_hash_table(f);
2096                 if (r < 0)
2097                         goto fail;
2098
2099                 r = journal_file_append_first_tag(f);
2100                 if (r < 0)
2101                         goto fail;
2102         }
2103
2104         r = journal_file_map_field_hash_table(f);
2105         if (r < 0)
2106                 goto fail;
2107
2108         r = journal_file_map_data_hash_table(f);
2109         if (r < 0)
2110                 goto fail;
2111
2112         if (ret)
2113                 *ret = f;
2114
2115         return 0;
2116
2117 fail:
2118         journal_file_close(f);
2119
2120         return r;
2121 }
2122
2123 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2124         char *p;
2125         size_t l;
2126         JournalFile *old_file, *new_file = NULL;
2127         int r;
2128
2129         assert(f);
2130         assert(*f);
2131
2132         old_file = *f;
2133
2134         if (!old_file->writable)
2135                 return -EINVAL;
2136
2137         if (!endswith(old_file->path, ".journal"))
2138                 return -EINVAL;
2139
2140         l = strlen(old_file->path);
2141
2142         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
2143         if (!p)
2144                 return -ENOMEM;
2145
2146         memcpy(p, old_file->path, l - 8);
2147         p[l-8] = '@';
2148         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2149         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2150                  "-%016llx-%016llx.journal",
2151                  (unsigned long long) le64toh((*f)->header->tail_entry_seqnum),
2152                  (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
2153
2154         r = rename(old_file->path, p);
2155         free(p);
2156
2157         if (r < 0)
2158                 return -errno;
2159
2160         old_file->header->state = STATE_ARCHIVED;
2161
2162         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2163         journal_file_close(old_file);
2164
2165         *f = new_file;
2166         return r;
2167 }
2168
2169 int journal_file_open_reliably(
2170                 const char *fname,
2171                 int flags,
2172                 mode_t mode,
2173                 bool compress,
2174                 bool seal,
2175                 JournalMetrics *metrics,
2176                 MMapCache *mmap_cache,
2177                 JournalFile *template,
2178                 JournalFile **ret) {
2179
2180         int r;
2181         size_t l;
2182         char *p;
2183
2184         r = journal_file_open(fname, flags, mode, compress, seal,
2185                               metrics, mmap_cache, template, ret);
2186         if (r != -EBADMSG && /* corrupted */
2187             r != -ENODATA && /* truncated */
2188             r != -EHOSTDOWN && /* other machine */
2189             r != -EPROTONOSUPPORT && /* incompatible feature */
2190             r != -EBUSY && /* unclean shutdown */
2191             r != -ESHUTDOWN /* already archived */)
2192                 return r;
2193
2194         if ((flags & O_ACCMODE) == O_RDONLY)
2195                 return r;
2196
2197         if (!(flags & O_CREAT))
2198                 return r;
2199
2200         if (!endswith(fname, ".journal"))
2201                 return r;
2202
2203         /* The file is corrupted. Rotate it away and try it again (but only once) */
2204
2205         l = strlen(fname);
2206         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2207                      (int) (l-8), fname,
2208                      (unsigned long long) now(CLOCK_REALTIME),
2209                      random_ull()) < 0)
2210                 return -ENOMEM;
2211
2212         r = rename(fname, p);
2213         free(p);
2214         if (r < 0)
2215                 return -errno;
2216
2217         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2218
2219         return journal_file_open(fname, flags, mode, compress, seal,
2220                                  metrics, mmap_cache, template, ret);
2221 }
2222
2223
2224 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2225         uint64_t i, n;
2226         uint64_t q, xor_hash = 0;
2227         int r;
2228         EntryItem *items;
2229         dual_timestamp ts;
2230
2231         assert(from);
2232         assert(to);
2233         assert(o);
2234         assert(p);
2235
2236         if (!to->writable)
2237                 return -EPERM;
2238
2239         ts.monotonic = le64toh(o->entry.monotonic);
2240         ts.realtime = le64toh(o->entry.realtime);
2241
2242         if (to->tail_entry_monotonic_valid &&
2243             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2244                 return -EINVAL;
2245
2246         n = journal_file_entry_n_items(o);
2247         items = alloca(sizeof(EntryItem) * n);
2248
2249         for (i = 0; i < n; i++) {
2250                 uint64_t l, h;
2251                 le64_t le_hash;
2252                 size_t t;
2253                 void *data;
2254                 Object *u;
2255
2256                 q = le64toh(o->entry.items[i].object_offset);
2257                 le_hash = o->entry.items[i].hash;
2258
2259                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2260                 if (r < 0)
2261                         return r;
2262
2263                 if (le_hash != o->data.hash)
2264                         return -EBADMSG;
2265
2266                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2267                 t = (size_t) l;
2268
2269                 /* We hit the limit on 32bit machines */
2270                 if ((uint64_t) t != l)
2271                         return -E2BIG;
2272
2273                 if (o->object.flags & OBJECT_COMPRESSED) {
2274 #ifdef HAVE_XZ
2275                         uint64_t rsize;
2276
2277                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2278                                 return -EBADMSG;
2279
2280                         data = from->compress_buffer;
2281                         l = rsize;
2282 #else
2283                         return -EPROTONOSUPPORT;
2284 #endif
2285                 } else
2286                         data = o->data.payload;
2287
2288                 r = journal_file_append_data(to, data, l, &u, &h);
2289                 if (r < 0)
2290                         return r;
2291
2292                 xor_hash ^= le64toh(u->data.hash);
2293                 items[i].object_offset = htole64(h);
2294                 items[i].hash = u->data.hash;
2295
2296                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2297                 if (r < 0)
2298                         return r;
2299         }
2300
2301         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2302 }
2303
2304 void journal_default_metrics(JournalMetrics *m, int fd) {
2305         uint64_t fs_size = 0;
2306         struct statvfs ss;
2307         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2308
2309         assert(m);
2310         assert(fd >= 0);
2311
2312         if (fstatvfs(fd, &ss) >= 0)
2313                 fs_size = ss.f_frsize * ss.f_blocks;
2314
2315         if (m->max_use == (uint64_t) -1) {
2316
2317                 if (fs_size > 0) {
2318                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2319
2320                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2321                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2322
2323                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2324                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2325                 } else
2326                         m->max_use = DEFAULT_MAX_USE_LOWER;
2327         } else {
2328                 m->max_use = PAGE_ALIGN(m->max_use);
2329
2330                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2331                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2332         }
2333
2334         if (m->max_size == (uint64_t) -1) {
2335                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2336
2337                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2338                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2339         } else
2340                 m->max_size = PAGE_ALIGN(m->max_size);
2341
2342         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2343                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2344
2345         if (m->max_size*2 > m->max_use)
2346                 m->max_use = m->max_size*2;
2347
2348         if (m->min_size == (uint64_t) -1)
2349                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2350         else {
2351                 m->min_size = PAGE_ALIGN(m->min_size);
2352
2353                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2354                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2355
2356                 if (m->min_size > m->max_size)
2357                         m->max_size = m->min_size;
2358         }
2359
2360         if (m->keep_free == (uint64_t) -1) {
2361
2362                 if (fs_size > 0) {
2363                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2364
2365                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2366                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2367
2368                 } else
2369                         m->keep_free = DEFAULT_KEEP_FREE;
2370         }
2371
2372         log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2373                  format_bytes(a, sizeof(a), m->max_use),
2374                  format_bytes(b, sizeof(b), m->max_size),
2375                  format_bytes(c, sizeof(c), m->min_size),
2376                  format_bytes(d, sizeof(d), m->keep_free));
2377 }
2378
2379 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2380         assert(f);
2381         assert(from || to);
2382
2383         if (from) {
2384                 if (f->header->head_entry_realtime == 0)
2385                         return -ENOENT;
2386
2387                 *from = le64toh(f->header->head_entry_realtime);
2388         }
2389
2390         if (to) {
2391                 if (f->header->tail_entry_realtime == 0)
2392                         return -ENOENT;
2393
2394                 *to = le64toh(f->header->tail_entry_realtime);
2395         }
2396
2397         return 1;
2398 }
2399
2400 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2401         char t[9+32+1] = "_BOOT_ID=";
2402         Object *o;
2403         uint64_t p;
2404         int r;
2405
2406         assert(f);
2407         assert(from || to);
2408
2409         sd_id128_to_string(boot_id, t + 9);
2410
2411         r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
2412         if (r <= 0)
2413                 return r;
2414
2415         if (le64toh(o->data.n_entries) <= 0)
2416                 return 0;
2417
2418         if (from) {
2419                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2420                 if (r < 0)
2421                         return r;
2422
2423                 *from = le64toh(o->entry.monotonic);
2424         }
2425
2426         if (to) {
2427                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2428                 if (r < 0)
2429                         return r;
2430
2431                 r = generic_array_get_plus_one(f,
2432                                                le64toh(o->data.entry_offset),
2433                                                le64toh(o->data.entry_array_offset),
2434                                                le64toh(o->data.n_entries)-1,
2435                                                &o, NULL);
2436                 if (r <= 0)
2437                         return r;
2438
2439                 *to = le64toh(o->entry.monotonic);
2440         }
2441
2442         return 1;
2443 }
2444
2445 bool journal_file_rotate_suggested(JournalFile *f) {
2446         assert(f);
2447
2448         /* If we gained new header fields we gained new features,
2449          * hence suggest a rotation */
2450         if (le64toh(f->header->header_size) < sizeof(Header)) {
2451                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2452                 return true;
2453         }
2454
2455         /* Let's check if the hash tables grew over a certain fill
2456          * level (75%, borrowing this value from Java's hash table
2457          * implementation), and if so suggest a rotation. To calculate
2458          * the fill level we need the n_data field, which only exists
2459          * in newer versions. */
2460
2461         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2462                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2463                         log_debug("Data hash table of %s has a fill level at %.1f (%llu of %llu items, %llu file size, %llu bytes per hash table item), suggesting rotation.",
2464                                   f->path,
2465                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2466                                   (unsigned long long) le64toh(f->header->n_data),
2467                                   (unsigned long long) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)),
2468                                   (unsigned long long) (f->last_stat.st_size),
2469                                   (unsigned long long) (f->last_stat.st_size / le64toh(f->header->n_data)));
2470                         return true;
2471                 }
2472
2473         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2474                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2475                         log_debug("Field hash table of %s has a fill level at %.1f (%llu of %llu items), suggesting rotation.",
2476                                   f->path,
2477                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2478                                   (unsigned long long) le64toh(f->header->n_fields),
2479                                   (unsigned long long) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)));
2480                         return true;
2481                 }
2482
2483         return false;
2484 }