chiark / gitweb /
760efaebbb269e3e0b0721ea470f847b60ef4b92
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "journal-authenticate.h"
33 #include "lookup3.h"
34 #include "compress.h"
35 #include "fsprg.h"
36
37 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
38 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
39
40 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
41
42 /* This is the minimum journal file size */
43 #define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL)                  /* 64 KiB */
44
45 /* These are the lower and upper bounds if we deduce the max_use value
46  * from the file system size */
47 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
48 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
49
50 /* This is the upper bound if we deduce max_size from max_use */
51 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
52
53 /* This is the upper bound if we deduce the keep_free value from the
54  * file system size */
55 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
56
/* This is the keep_free value when we can't determine the file system
 * size */
59 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MB */
60
61 /* n_data was the first entry we added after the initial file format design */
62 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
63
64 void journal_file_close(JournalFile *f) {
65         assert(f);
66
67         /* Write the final tag */
68         if (f->seal && f->writable)
69                 journal_file_append_tag(f);
70
71         /* Sync everything to disk, before we mark the file offline */
72         if (f->mmap && f->fd >= 0)
73                 mmap_cache_close_fd(f->mmap, f->fd);
74
75         if (f->writable && f->fd >= 0)
76                 fdatasync(f->fd);
77
78         if (f->header) {
79                 /* Mark the file offline. Don't override the archived state if it already is set */
80                 if (f->writable && f->header->state == STATE_ONLINE)
81                         f->header->state = STATE_OFFLINE;
82
83                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
84         }
85
86         if (f->fd >= 0)
87                 close_nointr_nofail(f->fd);
88
89         free(f->path);
90
91         if (f->mmap)
92                 mmap_cache_unref(f->mmap);
93
94 #ifdef HAVE_XZ
95         free(f->compress_buffer);
96 #endif
97
98 #ifdef HAVE_GCRYPT
99         if (f->fss_file)
100                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
101         else if (f->fsprg_state)
102                 free(f->fsprg_state);
103
104         free(f->fsprg_seed);
105
106         if (f->hmac)
107                 gcry_md_close(f->hmac);
108 #endif
109
110         free(f);
111 }
112
113 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
114         Header h;
115         ssize_t k;
116         int r;
117
118         assert(f);
119
120         zero(h);
121         memcpy(h.signature, HEADER_SIGNATURE, 8);
122         h.header_size = htole64(ALIGN64(sizeof(h)));
123
124         h.incompatible_flags =
125                 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
126
127         h.compatible_flags =
128                 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
129
130         r = sd_id128_randomize(&h.file_id);
131         if (r < 0)
132                 return r;
133
134         if (template) {
135                 h.seqnum_id = template->header->seqnum_id;
136                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
137         } else
138                 h.seqnum_id = h.file_id;
139
140         k = pwrite(f->fd, &h, sizeof(h), 0);
141         if (k < 0)
142                 return -errno;
143
144         if (k != sizeof(h))
145                 return -EIO;
146
147         return 0;
148 }
149
150 static int journal_file_refresh_header(JournalFile *f) {
151         int r;
152         sd_id128_t boot_id;
153
154         assert(f);
155
156         r = sd_id128_get_machine(&f->header->machine_id);
157         if (r < 0)
158                 return r;
159
160         r = sd_id128_get_boot(&boot_id);
161         if (r < 0)
162                 return r;
163
164         if (sd_id128_equal(boot_id, f->header->boot_id))
165                 f->tail_entry_monotonic_valid = true;
166
167         f->header->boot_id = boot_id;
168
169         f->header->state = STATE_ONLINE;
170
171         /* Sync the online state to disk */
172         msync(f->header, PAGE_ALIGN(sizeof(Header)), MS_SYNC);
173         fdatasync(f->fd);
174
175         return 0;
176 }
177
178 static int journal_file_verify_header(JournalFile *f) {
179         assert(f);
180
181         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
182                 return -EBADMSG;
183
184         /* In both read and write mode we refuse to open files with
185          * incompatible flags we don't know */
186 #ifdef HAVE_XZ
187         if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
188                 return -EPROTONOSUPPORT;
189 #else
190         if (f->header->incompatible_flags != 0)
191                 return -EPROTONOSUPPORT;
192 #endif
193
194         /* When open for writing we refuse to open files with
195          * compatible flags, too */
196         if (f->writable) {
197 #ifdef HAVE_GCRYPT
198                 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
199                         return -EPROTONOSUPPORT;
200 #else
201                 if (f->header->compatible_flags != 0)
202                         return -EPROTONOSUPPORT;
203 #endif
204         }
205
206         if (f->header->state >= _STATE_MAX)
207                 return -EBADMSG;
208
209         /* The first addition was n_data, so check that we are at least this large */
210         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
211                 return -EBADMSG;
212
213         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
214                 return -EBADMSG;
215
216         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
217                 return -ENODATA;
218
219         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
220                 return -ENODATA;
221
222         if (!VALID64(f->header->data_hash_table_offset) ||
223             !VALID64(f->header->field_hash_table_offset) ||
224             !VALID64(f->header->tail_object_offset) ||
225             !VALID64(f->header->entry_array_offset))
226                 return -ENODATA;
227
228         if (f->writable) {
229                 uint8_t state;
230                 sd_id128_t machine_id;
231                 int r;
232
233                 r = sd_id128_get_machine(&machine_id);
234                 if (r < 0)
235                         return r;
236
237                 if (!sd_id128_equal(machine_id, f->header->machine_id))
238                         return -EHOSTDOWN;
239
240                 state = f->header->state;
241
242                 if (state == STATE_ONLINE) {
243                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
244                         return -EBUSY;
245                 } else if (state == STATE_ARCHIVED)
246                         return -ESHUTDOWN;
247                 else if (state != STATE_OFFLINE) {
248                         log_debug("Journal file %s has unknown state %u.", f->path, state);
249                         return -EBUSY;
250                 }
251         }
252
253         f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
254
255         if (f->writable)
256                 f->seal = JOURNAL_HEADER_SEALED(f->header);
257
258         return 0;
259 }
260
261 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
262         uint64_t old_size, new_size;
263         int r;
264
265         assert(f);
266
267         /* We assume that this file is not sparse, and we know that
268          * for sure, since we always call posix_fallocate()
269          * ourselves */
270
271         old_size =
272                 le64toh(f->header->header_size) +
273                 le64toh(f->header->arena_size);
274
275         new_size = PAGE_ALIGN(offset + size);
276         if (new_size < le64toh(f->header->header_size))
277                 new_size = le64toh(f->header->header_size);
278
279         if (new_size <= old_size)
280                 return 0;
281
282         if (f->metrics.max_size > 0 &&
283             new_size > f->metrics.max_size)
284                 return -E2BIG;
285
286         if (new_size > f->metrics.min_size &&
287             f->metrics.keep_free > 0) {
288                 struct statvfs svfs;
289
290                 if (fstatvfs(f->fd, &svfs) >= 0) {
291                         uint64_t available;
292
293                         available = svfs.f_bfree * svfs.f_bsize;
294
295                         if (available >= f->metrics.keep_free)
296                                 available -= f->metrics.keep_free;
297                         else
298                                 available = 0;
299
300                         if (new_size - old_size > available)
301                                 return -E2BIG;
302                 }
303         }
304
305         /* Note that the glibc fallocate() fallback is very
306            inefficient, hence we try to minimize the allocation area
307            as we can. */
308         r = posix_fallocate(f->fd, old_size, new_size - old_size);
309         if (r != 0)
310                 return -r;
311
312         mmap_cache_close_fd_range(f->mmap, f->fd, old_size);
313
314         if (fstat(f->fd, &f->last_stat) < 0)
315                 return -errno;
316
317         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
318
319         return 0;
320 }
321
322 static int journal_file_move_to(JournalFile *f, int context, uint64_t offset, uint64_t size, void **ret) {
323         assert(f);
324         assert(ret);
325
326         /* Avoid SIGBUS on invalid accesses */
327         if (offset + size > (uint64_t) f->last_stat.st_size) {
328                 /* Hmm, out of range? Let's refresh the fstat() data
329                  * first, before we trust that check. */
330
331                 if (fstat(f->fd, &f->last_stat) < 0 ||
332                     offset + size > (uint64_t) f->last_stat.st_size)
333                         return -EADDRNOTAVAIL;
334         }
335
336         return mmap_cache_get(f->mmap, f->fd, f->prot, context, offset, size, ret);
337 }
338
339 static uint64_t minimum_header_size(Object *o) {
340
341         static uint64_t table[] = {
342                 [OBJECT_DATA] = sizeof(DataObject),
343                 [OBJECT_FIELD] = sizeof(FieldObject),
344                 [OBJECT_ENTRY] = sizeof(EntryObject),
345                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
346                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
347                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
348                 [OBJECT_TAG] = sizeof(TagObject),
349         };
350
351         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
352                 return sizeof(ObjectHeader);
353
354         return table[o->object.type];
355 }
356
357 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
358         int r;
359         void *t;
360         Object *o;
361         uint64_t s;
362         unsigned context;
363
364         assert(f);
365         assert(ret);
366
367         /* Objects may only be located at multiple of 64 bit */
368         if (!VALID64(offset))
369                 return -EFAULT;
370
371         /* One context for each type, plus one catch-all for the rest */
372         context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
373
374         r = journal_file_move_to(f, context, offset, sizeof(ObjectHeader), &t);
375         if (r < 0)
376                 return r;
377
378         o = (Object*) t;
379         s = le64toh(o->object.size);
380
381         if (s < sizeof(ObjectHeader))
382                 return -EBADMSG;
383
384         if (o->object.type <= OBJECT_UNUSED)
385                 return -EBADMSG;
386
387         if (s < minimum_header_size(o))
388                 return -EBADMSG;
389
390         if (type >= 0 && o->object.type != type)
391                 return -EBADMSG;
392
393         if (s > sizeof(ObjectHeader)) {
394                 r = journal_file_move_to(f, o->object.type, offset, s, &t);
395                 if (r < 0)
396                         return r;
397
398                 o = (Object*) t;
399         }
400
401         *ret = o;
402         return 0;
403 }
404
405 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
406         uint64_t r;
407
408         assert(f);
409
410         r = le64toh(f->header->tail_entry_seqnum) + 1;
411
412         if (seqnum) {
413                 /* If an external seqnum counter was passed, we update
414                  * both the local and the external one, and set it to
415                  * the maximum of both */
416
417                 if (*seqnum + 1 > r)
418                         r = *seqnum + 1;
419
420                 *seqnum = r;
421         }
422
423         f->header->tail_entry_seqnum = htole64(r);
424
425         if (f->header->head_entry_seqnum == 0)
426                 f->header->head_entry_seqnum = htole64(r);
427
428         return r;
429 }
430
431 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
432         int r;
433         uint64_t p;
434         Object *tail, *o;
435         void *t;
436
437         assert(f);
438         assert(type > 0 && type < _OBJECT_TYPE_MAX);
439         assert(size >= sizeof(ObjectHeader));
440         assert(offset);
441         assert(ret);
442
443         p = le64toh(f->header->tail_object_offset);
444         if (p == 0)
445                 p = le64toh(f->header->header_size);
446         else {
447                 r = journal_file_move_to_object(f, -1, p, &tail);
448                 if (r < 0)
449                         return r;
450
451                 p += ALIGN64(le64toh(tail->object.size));
452         }
453
454         r = journal_file_allocate(f, p, size);
455         if (r < 0)
456                 return r;
457
458         r = journal_file_move_to(f, type, p, size, &t);
459         if (r < 0)
460                 return r;
461
462         o = (Object*) t;
463
464         zero(o->object);
465         o->object.type = type;
466         o->object.size = htole64(size);
467
468         f->header->tail_object_offset = htole64(p);
469         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
470
471         *ret = o;
472         *offset = p;
473
474         return 0;
475 }
476
477 static int journal_file_setup_data_hash_table(JournalFile *f) {
478         uint64_t s, p;
479         Object *o;
480         int r;
481
482         assert(f);
483
484         /* We estimate that we need 1 hash table entry per 768 of
485            journal file and we want to make sure we never get beyond
486            75% fill level. Calculate the hash table size for the
487            maximum file size based on these metrics. */
488
489         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
490         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
491                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
492
493         log_info("Reserving %llu entries in hash table.", (unsigned long long) (s / sizeof(HashItem)));
494
495         r = journal_file_append_object(f,
496                                        OBJECT_DATA_HASH_TABLE,
497                                        offsetof(Object, hash_table.items) + s,
498                                        &o, &p);
499         if (r < 0)
500                 return r;
501
502         memset(o->hash_table.items, 0, s);
503
504         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
505         f->header->data_hash_table_size = htole64(s);
506
507         return 0;
508 }
509
510 static int journal_file_setup_field_hash_table(JournalFile *f) {
511         uint64_t s, p;
512         Object *o;
513         int r;
514
515         assert(f);
516
517         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
518         r = journal_file_append_object(f,
519                                        OBJECT_FIELD_HASH_TABLE,
520                                        offsetof(Object, hash_table.items) + s,
521                                        &o, &p);
522         if (r < 0)
523                 return r;
524
525         memset(o->hash_table.items, 0, s);
526
527         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
528         f->header->field_hash_table_size = htole64(s);
529
530         return 0;
531 }
532
533 static int journal_file_map_data_hash_table(JournalFile *f) {
534         uint64_t s, p;
535         void *t;
536         int r;
537
538         assert(f);
539
540         p = le64toh(f->header->data_hash_table_offset);
541         s = le64toh(f->header->data_hash_table_size);
542
543         r = journal_file_move_to(f,
544                                  OBJECT_DATA_HASH_TABLE,
545                                  p, s,
546                                  &t);
547         if (r < 0)
548                 return r;
549
550         f->data_hash_table = t;
551         return 0;
552 }
553
554 static int journal_file_map_field_hash_table(JournalFile *f) {
555         uint64_t s, p;
556         void *t;
557         int r;
558
559         assert(f);
560
561         p = le64toh(f->header->field_hash_table_offset);
562         s = le64toh(f->header->field_hash_table_size);
563
564         r = journal_file_move_to(f,
565                                  OBJECT_FIELD_HASH_TABLE,
566                                  p, s,
567                                  &t);
568         if (r < 0)
569                 return r;
570
571         f->field_hash_table = t;
572         return 0;
573 }
574
575 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
576         uint64_t p, h;
577         int r;
578
579         assert(f);
580         assert(o);
581         assert(offset > 0);
582         assert(o->object.type == OBJECT_DATA);
583
584         /* This might alter the window we are looking at */
585
586         o->data.next_hash_offset = o->data.next_field_offset = 0;
587         o->data.entry_offset = o->data.entry_array_offset = 0;
588         o->data.n_entries = 0;
589
590         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
591         p = le64toh(f->data_hash_table[h].tail_hash_offset);
592         if (p == 0) {
593                 /* Only entry in the hash table is easy */
594                 f->data_hash_table[h].head_hash_offset = htole64(offset);
595         } else {
596                 /* Move back to the previous data object, to patch in
597                  * pointer */
598
599                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
600                 if (r < 0)
601                         return r;
602
603                 o->data.next_hash_offset = htole64(offset);
604         }
605
606         f->data_hash_table[h].tail_hash_offset = htole64(offset);
607
608         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
609                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
610
611         return 0;
612 }
613
614 int journal_file_find_data_object_with_hash(
615                 JournalFile *f,
616                 const void *data, uint64_t size, uint64_t hash,
617                 Object **ret, uint64_t *offset) {
618
619         uint64_t p, osize, h;
620         int r;
621
622         assert(f);
623         assert(data || size == 0);
624
625         osize = offsetof(Object, data.payload) + size;
626
627         if (f->header->data_hash_table_size == 0)
628                 return -EBADMSG;
629
630         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
631         p = le64toh(f->data_hash_table[h].head_hash_offset);
632
633         while (p > 0) {
634                 Object *o;
635
636                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
637                 if (r < 0)
638                         return r;
639
640                 if (le64toh(o->data.hash) != hash)
641                         goto next;
642
643                 if (o->object.flags & OBJECT_COMPRESSED) {
644 #ifdef HAVE_XZ
645                         uint64_t l, rsize;
646
647                         l = le64toh(o->object.size);
648                         if (l <= offsetof(Object, data.payload))
649                                 return -EBADMSG;
650
651                         l -= offsetof(Object, data.payload);
652
653                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
654                                 return -EBADMSG;
655
656                         if (rsize == size &&
657                             memcmp(f->compress_buffer, data, size) == 0) {
658
659                                 if (ret)
660                                         *ret = o;
661
662                                 if (offset)
663                                         *offset = p;
664
665                                 return 1;
666                         }
667 #else
668                         return -EPROTONOSUPPORT;
669 #endif
670
671                 } else if (le64toh(o->object.size) == osize &&
672                            memcmp(o->data.payload, data, size) == 0) {
673
674                         if (ret)
675                                 *ret = o;
676
677                         if (offset)
678                                 *offset = p;
679
680                         return 1;
681                 }
682
683         next:
684                 p = le64toh(o->data.next_hash_offset);
685         }
686
687         return 0;
688 }
689
690 int journal_file_find_data_object(
691                 JournalFile *f,
692                 const void *data, uint64_t size,
693                 Object **ret, uint64_t *offset) {
694
695         uint64_t hash;
696
697         assert(f);
698         assert(data || size == 0);
699
700         hash = hash64(data, size);
701
702         return journal_file_find_data_object_with_hash(f,
703                                                        data, size, hash,
704                                                        ret, offset);
705 }
706
707 static int journal_file_append_data(
708                 JournalFile *f,
709                 const void *data, uint64_t size,
710                 Object **ret, uint64_t *offset) {
711
712         uint64_t hash, p;
713         uint64_t osize;
714         Object *o;
715         int r;
716         bool compressed = false;
717
718         assert(f);
719         assert(data || size == 0);
720
721         hash = hash64(data, size);
722
723         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
724         if (r < 0)
725                 return r;
726         else if (r > 0) {
727
728                 if (ret)
729                         *ret = o;
730
731                 if (offset)
732                         *offset = p;
733
734                 return 0;
735         }
736
737         osize = offsetof(Object, data.payload) + size;
738         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
739         if (r < 0)
740                 return r;
741
742         o->data.hash = htole64(hash);
743
744 #ifdef HAVE_XZ
745         if (f->compress &&
746             size >= COMPRESSION_SIZE_THRESHOLD) {
747                 uint64_t rsize;
748
749                 compressed = compress_blob(data, size, o->data.payload, &rsize);
750
751                 if (compressed) {
752                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
753                         o->object.flags |= OBJECT_COMPRESSED;
754
755                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
756                 }
757         }
758 #endif
759
760         if (!compressed && size > 0)
761                 memcpy(o->data.payload, data, size);
762
763         r = journal_file_link_data(f, o, p, hash);
764         if (r < 0)
765                 return r;
766
767         r = journal_file_hmac_put_object(f, OBJECT_DATA, p);
768         if (r < 0)
769                 return r;
770
771         /* The linking might have altered the window, so let's
772          * refresh our pointer */
773         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
774         if (r < 0)
775                 return r;
776
777         if (ret)
778                 *ret = o;
779
780         if (offset)
781                 *offset = p;
782
783         return 0;
784 }
785
786 uint64_t journal_file_entry_n_items(Object *o) {
787         assert(o);
788         assert(o->object.type == OBJECT_ENTRY);
789
790         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
791 }
792
793 uint64_t journal_file_entry_array_n_items(Object *o) {
794         assert(o);
795         assert(o->object.type == OBJECT_ENTRY_ARRAY);
796
797         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
798 }
799
800 uint64_t journal_file_hash_table_n_items(Object *o) {
801         assert(o);
802         assert(o->object.type == OBJECT_DATA_HASH_TABLE ||
803                o->object.type == OBJECT_FIELD_HASH_TABLE);
804
805         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
806 }
807
808 static int link_entry_into_array(JournalFile *f,
809                                  le64_t *first,
810                                  le64_t *idx,
811                                  uint64_t p) {
812         int r;
813         uint64_t n = 0, ap = 0, q, i, a, hidx;
814         Object *o;
815
816         assert(f);
817         assert(first);
818         assert(idx);
819         assert(p > 0);
820
821         a = le64toh(*first);
822         i = hidx = le64toh(*idx);
823         while (a > 0) {
824
825                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
826                 if (r < 0)
827                         return r;
828
829                 n = journal_file_entry_array_n_items(o);
830                 if (i < n) {
831                         o->entry_array.items[i] = htole64(p);
832                         *idx = htole64(hidx + 1);
833                         return 0;
834                 }
835
836                 i -= n;
837                 ap = a;
838                 a = le64toh(o->entry_array.next_entry_array_offset);
839         }
840
841         if (hidx > n)
842                 n = (hidx+1) * 2;
843         else
844                 n = n * 2;
845
846         if (n < 4)
847                 n = 4;
848
849         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
850                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
851                                        &o, &q);
852         if (r < 0)
853                 return r;
854
855         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, q);
856         if (r < 0)
857                 return r;
858
859         o->entry_array.items[i] = htole64(p);
860
861         if (ap == 0)
862                 *first = htole64(q);
863         else {
864                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
865                 if (r < 0)
866                         return r;
867
868                 o->entry_array.next_entry_array_offset = htole64(q);
869         }
870
871         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
872                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
873
874         *idx = htole64(hidx + 1);
875
876         return 0;
877 }
878
879 static int link_entry_into_array_plus_one(JournalFile *f,
880                                           le64_t *extra,
881                                           le64_t *first,
882                                           le64_t *idx,
883                                           uint64_t p) {
884
885         int r;
886
887         assert(f);
888         assert(extra);
889         assert(first);
890         assert(idx);
891         assert(p > 0);
892
893         if (*idx == 0)
894                 *extra = htole64(p);
895         else {
896                 le64_t i;
897
898                 i = htole64(le64toh(*idx) - 1);
899                 r = link_entry_into_array(f, first, &i, p);
900                 if (r < 0)
901                         return r;
902         }
903
904         *idx = htole64(le64toh(*idx) + 1);
905         return 0;
906 }
907
908 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
909         uint64_t p;
910         int r;
911         assert(f);
912         assert(o);
913         assert(offset > 0);
914
915         p = le64toh(o->entry.items[i].object_offset);
916         if (p == 0)
917                 return -EINVAL;
918
919         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
920         if (r < 0)
921                 return r;
922
923         return link_entry_into_array_plus_one(f,
924                                               &o->data.entry_offset,
925                                               &o->data.entry_array_offset,
926                                               &o->data.n_entries,
927                                               offset);
928 }
929
930 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
931         uint64_t n, i;
932         int r;
933
934         assert(f);
935         assert(o);
936         assert(offset > 0);
937         assert(o->object.type == OBJECT_ENTRY);
938
939         __sync_synchronize();
940
941         /* Link up the entry itself */
942         r = link_entry_into_array(f,
943                                   &f->header->entry_array_offset,
944                                   &f->header->n_entries,
945                                   offset);
946         if (r < 0)
947                 return r;
948
949         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
950
951         if (f->header->head_entry_realtime == 0)
952                 f->header->head_entry_realtime = o->entry.realtime;
953
954         f->header->tail_entry_realtime = o->entry.realtime;
955         f->header->tail_entry_monotonic = o->entry.monotonic;
956
957         f->tail_entry_monotonic_valid = true;
958
959         /* Link up the items */
960         n = journal_file_entry_n_items(o);
961         for (i = 0; i < n; i++) {
962                 r = journal_file_link_entry_item(f, o, offset, i);
963                 if (r < 0)
964                         return r;
965         }
966
967         return 0;
968 }
969
970 static int journal_file_append_entry_internal(
971                 JournalFile *f,
972                 const dual_timestamp *ts,
973                 uint64_t xor_hash,
974                 const EntryItem items[], unsigned n_items,
975                 uint64_t *seqnum,
976                 Object **ret, uint64_t *offset) {
977         uint64_t np;
978         uint64_t osize;
979         Object *o;
980         int r;
981
982         assert(f);
983         assert(items || n_items == 0);
984         assert(ts);
985
986         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
987
988         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
989         if (r < 0)
990                 return r;
991
992         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
993         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
994         o->entry.realtime = htole64(ts->realtime);
995         o->entry.monotonic = htole64(ts->monotonic);
996         o->entry.xor_hash = htole64(xor_hash);
997         o->entry.boot_id = f->header->boot_id;
998
999         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, np);
1000         if (r < 0)
1001                 return r;
1002
1003         r = journal_file_link_entry(f, o, np);
1004         if (r < 0)
1005                 return r;
1006
1007         if (ret)
1008                 *ret = o;
1009
1010         if (offset)
1011                 *offset = np;
1012
1013         return 0;
1014 }
1015
1016 void journal_file_post_change(JournalFile *f) {
1017         assert(f);
1018
1019         /* inotify() does not receive IN_MODIFY events from file
1020          * accesses done via mmap(). After each access we hence
1021          * trigger IN_MODIFY by truncating the journal file to its
1022          * current size which triggers IN_MODIFY. */
1023
1024         __sync_synchronize();
1025
1026         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1027                 log_error("Failed to to truncate file to its own size: %m");
1028 }
1029
1030 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1031         unsigned i;
1032         EntryItem *items;
1033         int r;
1034         uint64_t xor_hash = 0;
1035         struct dual_timestamp _ts;
1036
1037         assert(f);
1038         assert(iovec || n_iovec == 0);
1039
1040         if (!f->writable)
1041                 return -EPERM;
1042
1043         if (!ts) {
1044                 dual_timestamp_get(&_ts);
1045                 ts = &_ts;
1046         }
1047
1048         if (f->tail_entry_monotonic_valid &&
1049             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1050                 return -EINVAL;
1051
1052         r = journal_file_maybe_append_tag(f, ts->realtime);
1053         if (r < 0)
1054                 return r;
1055
1056         /* alloca() can't take 0, hence let's allocate at least one */
1057         items = alloca(sizeof(EntryItem) * MAX(1, n_iovec));
1058
1059         for (i = 0; i < n_iovec; i++) {
1060                 uint64_t p;
1061                 Object *o;
1062
1063                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1064                 if (r < 0)
1065                         return r;
1066
1067                 xor_hash ^= le64toh(o->data.hash);
1068                 items[i].object_offset = htole64(p);
1069                 items[i].hash = o->data.hash;
1070         }
1071
1072         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1073
1074         journal_file_post_change(f);
1075
1076         return r;
1077 }
1078
1079 static int generic_array_get(JournalFile *f,
1080                              uint64_t first,
1081                              uint64_t i,
1082                              Object **ret, uint64_t *offset) {
1083
1084         Object *o;
1085         uint64_t p = 0, a;
1086         int r;
1087
1088         assert(f);
1089
1090         a = first;
1091         while (a > 0) {
1092                 uint64_t n;
1093
1094                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1095                 if (r < 0)
1096                         return r;
1097
1098                 n = journal_file_entry_array_n_items(o);
1099                 if (i < n) {
1100                         p = le64toh(o->entry_array.items[i]);
1101                         break;
1102                 }
1103
1104                 i -= n;
1105                 a = le64toh(o->entry_array.next_entry_array_offset);
1106         }
1107
1108         if (a <= 0 || p <= 0)
1109                 return 0;
1110
1111         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1112         if (r < 0)
1113                 return r;
1114
1115         if (ret)
1116                 *ret = o;
1117
1118         if (offset)
1119                 *offset = p;
1120
1121         return 1;
1122 }
1123
1124 static int generic_array_get_plus_one(JournalFile *f,
1125                                       uint64_t extra,
1126                                       uint64_t first,
1127                                       uint64_t i,
1128                                       Object **ret, uint64_t *offset) {
1129
1130         Object *o;
1131
1132         assert(f);
1133
1134         if (i == 0) {
1135                 int r;
1136
1137                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1138                 if (r < 0)
1139                         return r;
1140
1141                 if (ret)
1142                         *ret = o;
1143
1144                 if (offset)
1145                         *offset = extra;
1146
1147                 return 1;
1148         }
1149
1150         return generic_array_get(f, first, i-1, ret, offset);
1151 }
1152
/* Results of the test_object callbacks used for bisection */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle */
        TEST_LEFT  = 1, /* the tested object compares lower than the needle */
        TEST_RIGHT = 2  /* the tested object compares higher than the needle */
};
1158
1159 static int generic_array_bisect(JournalFile *f,
1160                                 uint64_t first,
1161                                 uint64_t n,
1162                                 uint64_t needle,
1163                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1164                                 direction_t direction,
1165                                 Object **ret,
1166                                 uint64_t *offset,
1167                                 uint64_t *idx) {
1168
1169         uint64_t a, p, t = 0, i = 0, last_p = 0;
1170         bool subtract_one = false;
1171         Object *o, *array = NULL;
1172         int r;
1173
1174         assert(f);
1175         assert(test_object);
1176
1177         a = first;
1178         while (a > 0) {
1179                 uint64_t left, right, k, lp;
1180
1181                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1182                 if (r < 0)
1183                         return r;
1184
1185                 k = journal_file_entry_array_n_items(array);
1186                 right = MIN(k, n);
1187                 if (right <= 0)
1188                         return 0;
1189
1190                 i = right - 1;
1191                 lp = p = le64toh(array->entry_array.items[i]);
1192                 if (p <= 0)
1193                         return -EBADMSG;
1194
1195                 r = test_object(f, p, needle);
1196                 if (r < 0)
1197                         return r;
1198
1199                 if (r == TEST_FOUND)
1200                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1201
1202                 if (r == TEST_RIGHT) {
1203                         left = 0;
1204                         right -= 1;
1205                         for (;;) {
1206                                 if (left == right) {
1207                                         if (direction == DIRECTION_UP)
1208                                                 subtract_one = true;
1209
1210                                         i = left;
1211                                         goto found;
1212                                 }
1213
1214                                 assert(left < right);
1215
1216                                 i = (left + right) / 2;
1217                                 p = le64toh(array->entry_array.items[i]);
1218                                 if (p <= 0)
1219                                         return -EBADMSG;
1220
1221                                 r = test_object(f, p, needle);
1222                                 if (r < 0)
1223                                         return r;
1224
1225                                 if (r == TEST_FOUND)
1226                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1227
1228                                 if (r == TEST_RIGHT)
1229                                         right = i;
1230                                 else
1231                                         left = i + 1;
1232                         }
1233                 }
1234
1235                 if (k > n) {
1236                         if (direction == DIRECTION_UP) {
1237                                 i = n;
1238                                 subtract_one = true;
1239                                 goto found;
1240                         }
1241
1242                         return 0;
1243                 }
1244
1245                 last_p = lp;
1246
1247                 n -= k;
1248                 t += k;
1249                 a = le64toh(array->entry_array.next_entry_array_offset);
1250         }
1251
1252         return 0;
1253
1254 found:
1255         if (subtract_one && t == 0 && i == 0)
1256                 return 0;
1257
1258         if (subtract_one && i == 0)
1259                 p = last_p;
1260         else if (subtract_one)
1261                 p = le64toh(array->entry_array.items[i-1]);
1262         else
1263                 p = le64toh(array->entry_array.items[i]);
1264
1265         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1266         if (r < 0)
1267                 return r;
1268
1269         if (ret)
1270                 *ret = o;
1271
1272         if (offset)
1273                 *offset = p;
1274
1275         if (idx)
1276                 *idx = t + i + (subtract_one ? -1 : 0);
1277
1278         return 1;
1279 }
1280
1281 static int generic_array_bisect_plus_one(JournalFile *f,
1282                                          uint64_t extra,
1283                                          uint64_t first,
1284                                          uint64_t n,
1285                                          uint64_t needle,
1286                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1287                                          direction_t direction,
1288                                          Object **ret,
1289                                          uint64_t *offset,
1290                                          uint64_t *idx) {
1291
1292         int r;
1293         bool step_back = false;
1294         Object *o;
1295
1296         assert(f);
1297         assert(test_object);
1298
1299         if (n <= 0)
1300                 return 0;
1301
1302         /* This bisects the array in object 'first', but first checks
1303          * an extra  */
1304         r = test_object(f, extra, needle);
1305         if (r < 0)
1306                 return r;
1307
1308         if (r == TEST_FOUND)
1309                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1310
1311         /* if we are looking with DIRECTION_UP then we need to first
1312            see if in the actual array there is a matching entry, and
1313            return the last one of that. But if there isn't any we need
1314            to return this one. Hence remember this, and return it
1315            below. */
1316         if (r == TEST_LEFT)
1317                 step_back = direction == DIRECTION_UP;
1318
1319         if (r == TEST_RIGHT) {
1320                 if (direction == DIRECTION_DOWN)
1321                         goto found;
1322                 else
1323                         return 0;
1324         }
1325
1326         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1327
1328         if (r == 0 && step_back)
1329                 goto found;
1330
1331         if (r > 0 && idx)
1332                 (*idx) ++;
1333
1334         return r;
1335
1336 found:
1337         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1338         if (r < 0)
1339                 return r;
1340
1341         if (ret)
1342                 *ret = o;
1343
1344         if (offset)
1345                 *offset = extra;
1346
1347         if (idx)
1348                 *idx = 0;
1349
1350         return 1;
1351 }
1352
1353 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1354         assert(f);
1355         assert(p > 0);
1356
1357         if (p == needle)
1358                 return TEST_FOUND;
1359         else if (p < needle)
1360                 return TEST_LEFT;
1361         else
1362                 return TEST_RIGHT;
1363 }
1364
1365 int journal_file_move_to_entry_by_offset(
1366                 JournalFile *f,
1367                 uint64_t p,
1368                 direction_t direction,
1369                 Object **ret,
1370                 uint64_t *offset) {
1371
1372         return generic_array_bisect(f,
1373                                     le64toh(f->header->entry_array_offset),
1374                                     le64toh(f->header->n_entries),
1375                                     p,
1376                                     test_object_offset,
1377                                     direction,
1378                                     ret, offset, NULL);
1379 }
1380
1381
1382 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1383         Object *o;
1384         int r;
1385
1386         assert(f);
1387         assert(p > 0);
1388
1389         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1390         if (r < 0)
1391                 return r;
1392
1393         if (le64toh(o->entry.seqnum) == needle)
1394                 return TEST_FOUND;
1395         else if (le64toh(o->entry.seqnum) < needle)
1396                 return TEST_LEFT;
1397         else
1398                 return TEST_RIGHT;
1399 }
1400
1401 int journal_file_move_to_entry_by_seqnum(
1402                 JournalFile *f,
1403                 uint64_t seqnum,
1404                 direction_t direction,
1405                 Object **ret,
1406                 uint64_t *offset) {
1407
1408         return generic_array_bisect(f,
1409                                     le64toh(f->header->entry_array_offset),
1410                                     le64toh(f->header->n_entries),
1411                                     seqnum,
1412                                     test_object_seqnum,
1413                                     direction,
1414                                     ret, offset, NULL);
1415 }
1416
1417 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1418         Object *o;
1419         int r;
1420
1421         assert(f);
1422         assert(p > 0);
1423
1424         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1425         if (r < 0)
1426                 return r;
1427
1428         if (le64toh(o->entry.realtime) == needle)
1429                 return TEST_FOUND;
1430         else if (le64toh(o->entry.realtime) < needle)
1431                 return TEST_LEFT;
1432         else
1433                 return TEST_RIGHT;
1434 }
1435
1436 int journal_file_move_to_entry_by_realtime(
1437                 JournalFile *f,
1438                 uint64_t realtime,
1439                 direction_t direction,
1440                 Object **ret,
1441                 uint64_t *offset) {
1442
1443         return generic_array_bisect(f,
1444                                     le64toh(f->header->entry_array_offset),
1445                                     le64toh(f->header->n_entries),
1446                                     realtime,
1447                                     test_object_realtime,
1448                                     direction,
1449                                     ret, offset, NULL);
1450 }
1451
1452 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1453         Object *o;
1454         int r;
1455
1456         assert(f);
1457         assert(p > 0);
1458
1459         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1460         if (r < 0)
1461                 return r;
1462
1463         if (le64toh(o->entry.monotonic) == needle)
1464                 return TEST_FOUND;
1465         else if (le64toh(o->entry.monotonic) < needle)
1466                 return TEST_LEFT;
1467         else
1468                 return TEST_RIGHT;
1469 }
1470
1471 int journal_file_move_to_entry_by_monotonic(
1472                 JournalFile *f,
1473                 sd_id128_t boot_id,
1474                 uint64_t monotonic,
1475                 direction_t direction,
1476                 Object **ret,
1477                 uint64_t *offset) {
1478
1479         char t[9+32+1] = "_BOOT_ID=";
1480         Object *o;
1481         int r;
1482
1483         assert(f);
1484
1485         sd_id128_to_string(boot_id, t + 9);
1486         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1487         if (r < 0)
1488                 return r;
1489         if (r == 0)
1490                 return -ENOENT;
1491
1492         return generic_array_bisect_plus_one(f,
1493                                              le64toh(o->data.entry_offset),
1494                                              le64toh(o->data.entry_array_offset),
1495                                              le64toh(o->data.n_entries),
1496                                              monotonic,
1497                                              test_object_monotonic,
1498                                              direction,
1499                                              ret, offset, NULL);
1500 }
1501
1502 int journal_file_next_entry(
1503                 JournalFile *f,
1504                 Object *o, uint64_t p,
1505                 direction_t direction,
1506                 Object **ret, uint64_t *offset) {
1507
1508         uint64_t i, n;
1509         int r;
1510
1511         assert(f);
1512         assert(p > 0 || !o);
1513
1514         n = le64toh(f->header->n_entries);
1515         if (n <= 0)
1516                 return 0;
1517
1518         if (!o)
1519                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1520         else {
1521                 if (o->object.type != OBJECT_ENTRY)
1522                         return -EINVAL;
1523
1524                 r = generic_array_bisect(f,
1525                                          le64toh(f->header->entry_array_offset),
1526                                          le64toh(f->header->n_entries),
1527                                          p,
1528                                          test_object_offset,
1529                                          DIRECTION_DOWN,
1530                                          NULL, NULL,
1531                                          &i);
1532                 if (r <= 0)
1533                         return r;
1534
1535                 if (direction == DIRECTION_DOWN) {
1536                         if (i >= n - 1)
1537                                 return 0;
1538
1539                         i++;
1540                 } else {
1541                         if (i <= 0)
1542                                 return 0;
1543
1544                         i--;
1545                 }
1546         }
1547
1548         /* And jump to it */
1549         return generic_array_get(f,
1550                                  le64toh(f->header->entry_array_offset),
1551                                  i,
1552                                  ret, offset);
1553 }
1554
1555 int journal_file_skip_entry(
1556                 JournalFile *f,
1557                 Object *o, uint64_t p,
1558                 int64_t skip,
1559                 Object **ret, uint64_t *offset) {
1560
1561         uint64_t i, n;
1562         int r;
1563
1564         assert(f);
1565         assert(o);
1566         assert(p > 0);
1567
1568         if (o->object.type != OBJECT_ENTRY)
1569                 return -EINVAL;
1570
1571         r = generic_array_bisect(f,
1572                                  le64toh(f->header->entry_array_offset),
1573                                  le64toh(f->header->n_entries),
1574                                  p,
1575                                  test_object_offset,
1576                                  DIRECTION_DOWN,
1577                                  NULL, NULL,
1578                                  &i);
1579         if (r <= 0)
1580                 return r;
1581
1582         /* Calculate new index */
1583         if (skip < 0) {
1584                 if ((uint64_t) -skip >= i)
1585                         i = 0;
1586                 else
1587                         i = i - (uint64_t) -skip;
1588         } else
1589                 i  += (uint64_t) skip;
1590
1591         n = le64toh(f->header->n_entries);
1592         if (n <= 0)
1593                 return -EBADMSG;
1594
1595         if (i >= n)
1596                 i = n-1;
1597
1598         return generic_array_get(f,
1599                                  le64toh(f->header->entry_array_offset),
1600                                  i,
1601                                  ret, offset);
1602 }
1603
1604 int journal_file_next_entry_for_data(
1605                 JournalFile *f,
1606                 Object *o, uint64_t p,
1607                 uint64_t data_offset,
1608                 direction_t direction,
1609                 Object **ret, uint64_t *offset) {
1610
1611         uint64_t n, i;
1612         int r;
1613         Object *d;
1614
1615         assert(f);
1616         assert(p > 0 || !o);
1617
1618         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1619         if (r < 0)
1620                 return r;
1621
1622         n = le64toh(d->data.n_entries);
1623         if (n <= 0)
1624                 return n;
1625
1626         if (!o)
1627                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1628         else {
1629                 if (o->object.type != OBJECT_ENTRY)
1630                         return -EINVAL;
1631
1632                 r = generic_array_bisect_plus_one(f,
1633                                                   le64toh(d->data.entry_offset),
1634                                                   le64toh(d->data.entry_array_offset),
1635                                                   le64toh(d->data.n_entries),
1636                                                   p,
1637                                                   test_object_offset,
1638                                                   DIRECTION_DOWN,
1639                                                   NULL, NULL,
1640                                                   &i);
1641
1642                 if (r <= 0)
1643                         return r;
1644
1645                 if (direction == DIRECTION_DOWN) {
1646                         if (i >= n - 1)
1647                                 return 0;
1648
1649                         i++;
1650                 } else {
1651                         if (i <= 0)
1652                                 return 0;
1653
1654                         i--;
1655                 }
1656
1657         }
1658
1659         return generic_array_get_plus_one(f,
1660                                           le64toh(d->data.entry_offset),
1661                                           le64toh(d->data.entry_array_offset),
1662                                           i,
1663                                           ret, offset);
1664 }
1665
1666 int journal_file_move_to_entry_by_offset_for_data(
1667                 JournalFile *f,
1668                 uint64_t data_offset,
1669                 uint64_t p,
1670                 direction_t direction,
1671                 Object **ret, uint64_t *offset) {
1672
1673         int r;
1674         Object *d;
1675
1676         assert(f);
1677
1678         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1679         if (r < 0)
1680                 return r;
1681
1682         return generic_array_bisect_plus_one(f,
1683                                              le64toh(d->data.entry_offset),
1684                                              le64toh(d->data.entry_array_offset),
1685                                              le64toh(d->data.n_entries),
1686                                              p,
1687                                              test_object_offset,
1688                                              direction,
1689                                              ret, offset, NULL);
1690 }
1691
1692 int journal_file_move_to_entry_by_monotonic_for_data(
1693                 JournalFile *f,
1694                 uint64_t data_offset,
1695                 sd_id128_t boot_id,
1696                 uint64_t monotonic,
1697                 direction_t direction,
1698                 Object **ret, uint64_t *offset) {
1699
1700         char t[9+32+1] = "_BOOT_ID=";
1701         Object *o, *d;
1702         int r;
1703         uint64_t b, z;
1704
1705         assert(f);
1706
1707         /* First, seek by time */
1708         sd_id128_to_string(boot_id, t + 9);
1709         r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
1710         if (r < 0)
1711                 return r;
1712         if (r == 0)
1713                 return -ENOENT;
1714
1715         r = generic_array_bisect_plus_one(f,
1716                                           le64toh(o->data.entry_offset),
1717                                           le64toh(o->data.entry_array_offset),
1718                                           le64toh(o->data.n_entries),
1719                                           monotonic,
1720                                           test_object_monotonic,
1721                                           direction,
1722                                           NULL, &z, NULL);
1723         if (r <= 0)
1724                 return r;
1725
1726         /* And now, continue seeking until we find an entry that
1727          * exists in both bisection arrays */
1728
1729         for (;;) {
1730                 Object *qo;
1731                 uint64_t p, q;
1732
1733                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1734                 if (r < 0)
1735                         return r;
1736
1737                 r = generic_array_bisect_plus_one(f,
1738                                                   le64toh(d->data.entry_offset),
1739                                                   le64toh(d->data.entry_array_offset),
1740                                                   le64toh(d->data.n_entries),
1741                                                   z,
1742                                                   test_object_offset,
1743                                                   direction,
1744                                                   NULL, &p, NULL);
1745                 if (r <= 0)
1746                         return r;
1747
1748                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
1749                 if (r < 0)
1750                         return r;
1751
1752                 r = generic_array_bisect_plus_one(f,
1753                                                   le64toh(o->data.entry_offset),
1754                                                   le64toh(o->data.entry_array_offset),
1755                                                   le64toh(o->data.n_entries),
1756                                                   p,
1757                                                   test_object_offset,
1758                                                   direction,
1759                                                   &qo, &q, NULL);
1760
1761                 if (r <= 0)
1762                         return r;
1763
1764                 if (p == q) {
1765                         if (ret)
1766                                 *ret = qo;
1767                         if (offset)
1768                                 *offset = q;
1769
1770                         return 1;
1771                 }
1772
1773                 z = q;
1774         }
1775
1776         return 0;
1777 }
1778
1779 int journal_file_move_to_entry_by_seqnum_for_data(
1780                 JournalFile *f,
1781                 uint64_t data_offset,
1782                 uint64_t seqnum,
1783                 direction_t direction,
1784                 Object **ret, uint64_t *offset) {
1785
1786         Object *d;
1787         int r;
1788
1789         assert(f);
1790
1791         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1792         if (r < 0)
1793                 return r;
1794
1795         return generic_array_bisect_plus_one(f,
1796                                              le64toh(d->data.entry_offset),
1797                                              le64toh(d->data.entry_array_offset),
1798                                              le64toh(d->data.n_entries),
1799                                              seqnum,
1800                                              test_object_seqnum,
1801                                              direction,
1802                                              ret, offset, NULL);
1803 }
1804
1805 int journal_file_move_to_entry_by_realtime_for_data(
1806                 JournalFile *f,
1807                 uint64_t data_offset,
1808                 uint64_t realtime,
1809                 direction_t direction,
1810                 Object **ret, uint64_t *offset) {
1811
1812         Object *d;
1813         int r;
1814
1815         assert(f);
1816
1817         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1818         if (r < 0)
1819                 return r;
1820
1821         return generic_array_bisect_plus_one(f,
1822                                              le64toh(d->data.entry_offset),
1823                                              le64toh(d->data.entry_array_offset),
1824                                              le64toh(d->data.n_entries),
1825                                              realtime,
1826                                              test_object_realtime,
1827                                              direction,
1828                                              ret, offset, NULL);
1829 }
1830
1831 void journal_file_dump(JournalFile *f) {
1832         Object *o;
1833         int r;
1834         uint64_t p;
1835
1836         assert(f);
1837
1838         journal_file_print_header(f);
1839
1840         p = le64toh(f->header->header_size);
1841         while (p != 0) {
1842                 r = journal_file_move_to_object(f, -1, p, &o);
1843                 if (r < 0)
1844                         goto fail;
1845
1846                 switch (o->object.type) {
1847
1848                 case OBJECT_UNUSED:
1849                         printf("Type: OBJECT_UNUSED\n");
1850                         break;
1851
1852                 case OBJECT_DATA:
1853                         printf("Type: OBJECT_DATA\n");
1854                         break;
1855
1856                 case OBJECT_ENTRY:
1857                         printf("Type: OBJECT_ENTRY seqnum=%llu monotonic=%llu realtime=%llu\n",
1858                                (unsigned long long) le64toh(o->entry.seqnum),
1859                                (unsigned long long) le64toh(o->entry.monotonic),
1860                                (unsigned long long) le64toh(o->entry.realtime));
1861                         break;
1862
1863                 case OBJECT_FIELD_HASH_TABLE:
1864                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1865                         break;
1866
1867                 case OBJECT_DATA_HASH_TABLE:
1868                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
1869                         break;
1870
1871                 case OBJECT_ENTRY_ARRAY:
1872                         printf("Type: OBJECT_ENTRY_ARRAY\n");
1873                         break;
1874
1875                 case OBJECT_TAG:
1876                         printf("Type: OBJECT_TAG seqnum=%llu epoch=%llu\n",
1877                                (unsigned long long) le64toh(o->tag.seqnum),
1878                                (unsigned long long) le64toh(o->tag.epoch));
1879                         break;
1880                 }
1881
1882                 if (o->object.flags & OBJECT_COMPRESSED)
1883                         printf("Flags: COMPRESSED\n");
1884
1885                 if (p == le64toh(f->header->tail_object_offset))
1886                         p = 0;
1887                 else
1888                         p = p + ALIGN64(le64toh(o->object.size));
1889         }
1890
1891         return;
1892 fail:
1893         log_error("File corrupt");
1894 }
1895
1896 void journal_file_print_header(JournalFile *f) {
1897         char a[33], b[33], c[33];
1898         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
1899
1900         assert(f);
1901
1902         printf("File Path: %s\n"
1903                "File ID: %s\n"
1904                "Machine ID: %s\n"
1905                "Boot ID: %s\n"
1906                "Sequential Number ID: %s\n"
1907                "State: %s\n"
1908                "Compatible Flags:%s%s\n"
1909                "Incompatible Flags:%s%s\n"
1910                "Header size: %llu\n"
1911                "Arena size: %llu\n"
1912                "Data Hash Table Size: %llu\n"
1913                "Field Hash Table Size: %llu\n"
1914                "Rotate Suggested: %s\n"
1915                "Head Sequential Number: %llu\n"
1916                "Tail Sequential Number: %llu\n"
1917                "Head Realtime Timestamp: %s\n"
1918                "Tail Realtime Timestamp: %s\n"
1919                "Objects: %llu\n"
1920                "Entry Objects: %llu\n",
1921                f->path,
1922                sd_id128_to_string(f->header->file_id, a),
1923                sd_id128_to_string(f->header->machine_id, b),
1924                sd_id128_to_string(f->header->boot_id, c),
1925                sd_id128_to_string(f->header->seqnum_id, c),
1926                f->header->state == STATE_OFFLINE ? "OFFLINE" :
1927                f->header->state == STATE_ONLINE ? "ONLINE" :
1928                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
1929                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
1930                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
1931                JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
1932                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
1933                (unsigned long long) le64toh(f->header->header_size),
1934                (unsigned long long) le64toh(f->header->arena_size),
1935                (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
1936                (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
1937                yes_no(journal_file_rotate_suggested(f)),
1938                (unsigned long long) le64toh(f->header->head_entry_seqnum),
1939                (unsigned long long) le64toh(f->header->tail_entry_seqnum),
1940                format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
1941                format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
1942                (unsigned long long) le64toh(f->header->n_objects),
1943                (unsigned long long) le64toh(f->header->n_entries));
1944
1945         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1946                 printf("Data Objects: %llu\n"
1947                        "Data Hash Table Fill: %.1f%%\n",
1948                        (unsigned long long) le64toh(f->header->n_data),
1949                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
1950
1951         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1952                 printf("Field Objects: %llu\n"
1953                        "Field Hash Table Fill: %.1f%%\n",
1954                        (unsigned long long) le64toh(f->header->n_fields),
1955                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
1956
1957         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
1958                 printf("Tag Objects: %llu\n",
1959                        (unsigned long long) le64toh(f->header->n_tags));
1960         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1961                 printf("Entry Array Objects: %llu\n",
1962                        (unsigned long long) le64toh(f->header->n_entry_arrays));
1963 }
1964
1965 int journal_file_open(
1966                 const char *fname,
1967                 int flags,
1968                 mode_t mode,
1969                 bool compress,
1970                 bool seal,
1971                 JournalMetrics *metrics,
1972                 MMapCache *mmap_cache,
1973                 JournalFile *template,
1974                 JournalFile **ret) {
1975
1976         JournalFile *f;
1977         int r;
1978         bool newly_created = false;
1979
1980         assert(fname);
1981
1982         if ((flags & O_ACCMODE) != O_RDONLY &&
1983             (flags & O_ACCMODE) != O_RDWR)
1984                 return -EINVAL;
1985
1986         if (!endswith(fname, ".journal") &&
1987             !endswith(fname, ".journal~"))
1988                 return -EINVAL;
1989
1990         f = new0(JournalFile, 1);
1991         if (!f)
1992                 return -ENOMEM;
1993
1994         f->fd = -1;
1995         f->mode = mode;
1996
1997         f->flags = flags;
1998         f->prot = prot_from_flags(flags);
1999         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2000         f->compress = compress;
2001         f->seal = seal;
2002
2003         if (mmap_cache)
2004                 f->mmap = mmap_cache_ref(mmap_cache);
2005         else {
2006                 f->mmap = mmap_cache_new();
2007                 if (!f->mmap) {
2008                         r = -ENOMEM;
2009                         goto fail;
2010                 }
2011         }
2012
2013         f->path = strdup(fname);
2014         if (!f->path) {
2015                 r = -ENOMEM;
2016                 goto fail;
2017         }
2018
2019         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2020         if (f->fd < 0) {
2021                 r = -errno;
2022                 goto fail;
2023         }
2024
2025         if (fstat(f->fd, &f->last_stat) < 0) {
2026                 r = -errno;
2027                 goto fail;
2028         }
2029
2030         if (f->last_stat.st_size == 0 && f->writable) {
2031                 newly_created = true;
2032
2033                 /* Try to load the FSPRG state, and if we can't, then
2034                  * just don't do sealing */
2035                 r = journal_file_fss_load(f);
2036                 if (r < 0)
2037                         f->seal = false;
2038
2039                 r = journal_file_init_header(f, template);
2040                 if (r < 0)
2041                         goto fail;
2042
2043                 if (fstat(f->fd, &f->last_stat) < 0) {
2044                         r = -errno;
2045                         goto fail;
2046                 }
2047         }
2048
2049         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2050                 r = -EIO;
2051                 goto fail;
2052         }
2053
2054         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2055         if (f->header == MAP_FAILED) {
2056                 f->header = NULL;
2057                 r = -errno;
2058                 goto fail;
2059         }
2060
2061         if (!newly_created) {
2062                 r = journal_file_verify_header(f);
2063                 if (r < 0)
2064                         goto fail;
2065         }
2066
2067         if (!newly_created && f->writable) {
2068                 r = journal_file_fss_load(f);
2069                 if (r < 0)
2070                         goto fail;
2071         }
2072
2073         if (f->writable) {
2074                 if (metrics) {
2075                         journal_default_metrics(metrics, f->fd);
2076                         f->metrics = *metrics;
2077                 } else if (template)
2078                         f->metrics = template->metrics;
2079
2080                 r = journal_file_refresh_header(f);
2081                 if (r < 0)
2082                         goto fail;
2083         }
2084
2085         r = journal_file_hmac_setup(f);
2086         if (r < 0)
2087                 goto fail;
2088
2089         if (newly_created) {
2090                 r = journal_file_setup_field_hash_table(f);
2091                 if (r < 0)
2092                         goto fail;
2093
2094                 r = journal_file_setup_data_hash_table(f);
2095                 if (r < 0)
2096                         goto fail;
2097
2098                 r = journal_file_append_first_tag(f);
2099                 if (r < 0)
2100                         goto fail;
2101         }
2102
2103         r = journal_file_map_field_hash_table(f);
2104         if (r < 0)
2105                 goto fail;
2106
2107         r = journal_file_map_data_hash_table(f);
2108         if (r < 0)
2109                 goto fail;
2110
2111         if (ret)
2112                 *ret = f;
2113
2114         return 0;
2115
2116 fail:
2117         journal_file_close(f);
2118
2119         return r;
2120 }
2121
2122 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2123         char *p;
2124         size_t l;
2125         JournalFile *old_file, *new_file = NULL;
2126         int r;
2127
2128         assert(f);
2129         assert(*f);
2130
2131         old_file = *f;
2132
2133         if (!old_file->writable)
2134                 return -EINVAL;
2135
2136         if (!endswith(old_file->path, ".journal"))
2137                 return -EINVAL;
2138
2139         l = strlen(old_file->path);
2140
2141         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
2142         if (!p)
2143                 return -ENOMEM;
2144
2145         memcpy(p, old_file->path, l - 8);
2146         p[l-8] = '@';
2147         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2148         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2149                  "-%016llx-%016llx.journal",
2150                  (unsigned long long) le64toh((*f)->header->tail_entry_seqnum),
2151                  (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
2152
2153         r = rename(old_file->path, p);
2154         free(p);
2155
2156         if (r < 0)
2157                 return -errno;
2158
2159         old_file->header->state = STATE_ARCHIVED;
2160
2161         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2162         journal_file_close(old_file);
2163
2164         *f = new_file;
2165         return r;
2166 }
2167
2168 int journal_file_open_reliably(
2169                 const char *fname,
2170                 int flags,
2171                 mode_t mode,
2172                 bool compress,
2173                 bool seal,
2174                 JournalMetrics *metrics,
2175                 MMapCache *mmap_cache,
2176                 JournalFile *template,
2177                 JournalFile **ret) {
2178
2179         int r;
2180         size_t l;
2181         char *p;
2182
2183         r = journal_file_open(fname, flags, mode, compress, seal,
2184                               metrics, mmap_cache, template, ret);
2185         if (r != -EBADMSG && /* corrupted */
2186             r != -ENODATA && /* truncated */
2187             r != -EHOSTDOWN && /* other machine */
2188             r != -EPROTONOSUPPORT && /* incompatible feature */
2189             r != -EBUSY && /* unclean shutdown */
2190             r != -ESHUTDOWN /* already archived */)
2191                 return r;
2192
2193         if ((flags & O_ACCMODE) == O_RDONLY)
2194                 return r;
2195
2196         if (!(flags & O_CREAT))
2197                 return r;
2198
2199         if (!endswith(fname, ".journal"))
2200                 return r;
2201
2202         /* The file is corrupted. Rotate it away and try it again (but only once) */
2203
2204         l = strlen(fname);
2205         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2206                      (int) (l-8), fname,
2207                      (unsigned long long) now(CLOCK_REALTIME),
2208                      random_ull()) < 0)
2209                 return -ENOMEM;
2210
2211         r = rename(fname, p);
2212         free(p);
2213         if (r < 0)
2214                 return -errno;
2215
2216         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2217
2218         return journal_file_open(fname, flags, mode, compress, seal,
2219                                  metrics, mmap_cache, template, ret);
2220 }
2221
2222
2223 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2224         uint64_t i, n;
2225         uint64_t q, xor_hash = 0;
2226         int r;
2227         EntryItem *items;
2228         dual_timestamp ts;
2229
2230         assert(from);
2231         assert(to);
2232         assert(o);
2233         assert(p);
2234
2235         if (!to->writable)
2236                 return -EPERM;
2237
2238         ts.monotonic = le64toh(o->entry.monotonic);
2239         ts.realtime = le64toh(o->entry.realtime);
2240
2241         if (to->tail_entry_monotonic_valid &&
2242             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2243                 return -EINVAL;
2244
2245         n = journal_file_entry_n_items(o);
2246         items = alloca(sizeof(EntryItem) * n);
2247
2248         for (i = 0; i < n; i++) {
2249                 uint64_t l, h;
2250                 le64_t le_hash;
2251                 size_t t;
2252                 void *data;
2253                 Object *u;
2254
2255                 q = le64toh(o->entry.items[i].object_offset);
2256                 le_hash = o->entry.items[i].hash;
2257
2258                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2259                 if (r < 0)
2260                         return r;
2261
2262                 if (le_hash != o->data.hash)
2263                         return -EBADMSG;
2264
2265                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2266                 t = (size_t) l;
2267
2268                 /* We hit the limit on 32bit machines */
2269                 if ((uint64_t) t != l)
2270                         return -E2BIG;
2271
2272                 if (o->object.flags & OBJECT_COMPRESSED) {
2273 #ifdef HAVE_XZ
2274                         uint64_t rsize;
2275
2276                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2277                                 return -EBADMSG;
2278
2279                         data = from->compress_buffer;
2280                         l = rsize;
2281 #else
2282                         return -EPROTONOSUPPORT;
2283 #endif
2284                 } else
2285                         data = o->data.payload;
2286
2287                 r = journal_file_append_data(to, data, l, &u, &h);
2288                 if (r < 0)
2289                         return r;
2290
2291                 xor_hash ^= le64toh(u->data.hash);
2292                 items[i].object_offset = htole64(h);
2293                 items[i].hash = u->data.hash;
2294
2295                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2296                 if (r < 0)
2297                         return r;
2298         }
2299
2300         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2301 }
2302
2303 void journal_default_metrics(JournalMetrics *m, int fd) {
2304         uint64_t fs_size = 0;
2305         struct statvfs ss;
2306         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2307
2308         assert(m);
2309         assert(fd >= 0);
2310
2311         if (fstatvfs(fd, &ss) >= 0)
2312                 fs_size = ss.f_frsize * ss.f_blocks;
2313
2314         if (m->max_use == (uint64_t) -1) {
2315
2316                 if (fs_size > 0) {
2317                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2318
2319                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2320                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2321
2322                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2323                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2324                 } else
2325                         m->max_use = DEFAULT_MAX_USE_LOWER;
2326         } else {
2327                 m->max_use = PAGE_ALIGN(m->max_use);
2328
2329                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2330                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2331         }
2332
2333         if (m->max_size == (uint64_t) -1) {
2334                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2335
2336                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2337                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2338         } else
2339                 m->max_size = PAGE_ALIGN(m->max_size);
2340
2341         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2342                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2343
2344         if (m->max_size*2 > m->max_use)
2345                 m->max_use = m->max_size*2;
2346
2347         if (m->min_size == (uint64_t) -1)
2348                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2349         else {
2350                 m->min_size = PAGE_ALIGN(m->min_size);
2351
2352                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2353                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2354
2355                 if (m->min_size > m->max_size)
2356                         m->max_size = m->min_size;
2357         }
2358
2359         if (m->keep_free == (uint64_t) -1) {
2360
2361                 if (fs_size > 0) {
2362                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2363
2364                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2365                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2366
2367                 } else
2368                         m->keep_free = DEFAULT_KEEP_FREE;
2369         }
2370
2371         log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2372                  format_bytes(a, sizeof(a), m->max_use),
2373                  format_bytes(b, sizeof(b), m->max_size),
2374                  format_bytes(c, sizeof(c), m->min_size),
2375                  format_bytes(d, sizeof(d), m->keep_free));
2376 }
2377
2378 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2379         assert(f);
2380         assert(from || to);
2381
2382         if (from) {
2383                 if (f->header->head_entry_realtime == 0)
2384                         return -ENOENT;
2385
2386                 *from = le64toh(f->header->head_entry_realtime);
2387         }
2388
2389         if (to) {
2390                 if (f->header->tail_entry_realtime == 0)
2391                         return -ENOENT;
2392
2393                 *to = le64toh(f->header->tail_entry_realtime);
2394         }
2395
2396         return 1;
2397 }
2398
2399 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2400         char t[9+32+1] = "_BOOT_ID=";
2401         Object *o;
2402         uint64_t p;
2403         int r;
2404
2405         assert(f);
2406         assert(from || to);
2407
2408         sd_id128_to_string(boot_id, t + 9);
2409
2410         r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
2411         if (r <= 0)
2412                 return r;
2413
2414         if (le64toh(o->data.n_entries) <= 0)
2415                 return 0;
2416
2417         if (from) {
2418                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2419                 if (r < 0)
2420                         return r;
2421
2422                 *from = le64toh(o->entry.monotonic);
2423         }
2424
2425         if (to) {
2426                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2427                 if (r < 0)
2428                         return r;
2429
2430                 r = generic_array_get_plus_one(f,
2431                                                le64toh(o->data.entry_offset),
2432                                                le64toh(o->data.entry_array_offset),
2433                                                le64toh(o->data.n_entries)-1,
2434                                                &o, NULL);
2435                 if (r <= 0)
2436                         return r;
2437
2438                 *to = le64toh(o->entry.monotonic);
2439         }
2440
2441         return 1;
2442 }
2443
2444 bool journal_file_rotate_suggested(JournalFile *f) {
2445         assert(f);
2446
2447         /* If we gained new header fields we gained new features,
2448          * hence suggest a rotation */
2449         if (le64toh(f->header->header_size) < sizeof(Header)) {
2450                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2451                 return true;
2452         }
2453
2454         /* Let's check if the hash tables grew over a certain fill
2455          * level (75%, borrowing this value from Java's hash table
2456          * implementation), and if so suggest a rotation. To calculate
2457          * the fill level we need the n_data field, which only exists
2458          * in newer versions. */
2459
2460         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2461                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2462                         log_debug("Data hash table of %s has a fill level at %.1f (%llu of %llu items, %llu file size, %llu bytes per hash table item), suggesting rotation.",
2463                                   f->path,
2464                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2465                                   (unsigned long long) le64toh(f->header->n_data),
2466                                   (unsigned long long) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)),
2467                                   (unsigned long long) (f->last_stat.st_size),
2468                                   (unsigned long long) (f->last_stat.st_size / le64toh(f->header->n_data)));
2469                         return true;
2470                 }
2471
2472         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2473                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2474                         log_debug("Field hash table of %s has a fill level at %.1f (%llu of %llu items), suggesting rotation.",
2475                                   f->path,
2476                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2477                                   (unsigned long long) le64toh(f->header->n_fields),
2478                                   (unsigned long long) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)));
2479                         return true;
2480                 }
2481
2482         return false;
2483 }