chiark / gitweb /
mmap: resize arrays dynamically
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "journal-authenticate.h"
33 #include "lookup3.h"
34 #include "compress.h"
35 #include "fsprg.h"
36
37 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem)) /* lower bound, in bytes, for a new data hash table */
38 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem)) /* size, in bytes, of a new field hash table */
39
40 #define COMPRESSION_SIZE_THRESHOLD (512ULL) /* payloads shorter than this many bytes are never compressed */
41
42 /* This is the minimum journal file size */
43 #define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL)                  /* 64 KiB */
44
45 /* These are the lower and upper bounds if we deduce the max_use value
46  * from the file system size */
47 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
48 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
49
50 /* This is the upper bound if we deduce max_size from max_use */
51 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
52
53 /* This is the upper bound if we deduce the keep_free value from the
54  * file system size */
55 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
56
57 /* This is the keep_free value when we can't determine the file system
58  * size */
59 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */
60
61 /* n_data was the first entry we added after the initial file format design */
62 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
63
64 void journal_file_close(JournalFile *f) {
65         assert(f);
66
67         /* Write the final tag */
68         if (f->seal && f->writable)
69                 journal_file_append_tag(f);
70
71         /* Sync everything to disk, before we mark the file offline */
72         if (f->mmap && f->fd >= 0)
73                 mmap_cache_close_fd(f->mmap, f->fd);
74
75         if (f->writable && f->fd >= 0)
76                 fdatasync(f->fd);
77
78         if (f->header) {
79                 /* Mark the file offline. Don't override the archived state if it already is set */
80                 if (f->writable && f->header->state == STATE_ONLINE)
81                         f->header->state = STATE_OFFLINE;
82
83                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
84         }
85
86         if (f->fd >= 0)
87                 close_nointr_nofail(f->fd);
88
89         free(f->path);
90
91         if (f->mmap)
92                 mmap_cache_unref(f->mmap);
93
94 #ifdef HAVE_XZ
95         free(f->compress_buffer);
96 #endif
97
98 #ifdef HAVE_GCRYPT
99         if (f->fss_file)
100                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
101         else if (f->fsprg_state)
102                 free(f->fsprg_state);
103
104         free(f->fsprg_seed);
105
106         if (f->hmac)
107                 gcry_md_close(f->hmac);
108 #endif
109
110         free(f);
111 }
112
113 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
114         Header h;
115         ssize_t k;
116         int r;
117
118         assert(f);
119
120         zero(h);
121         memcpy(h.signature, HEADER_SIGNATURE, 8);
122         h.header_size = htole64(ALIGN64(sizeof(h)));
123
124         h.incompatible_flags =
125                 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
126
127         h.compatible_flags =
128                 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
129
130         r = sd_id128_randomize(&h.file_id);
131         if (r < 0)
132                 return r;
133
134         if (template) {
135                 h.seqnum_id = template->header->seqnum_id;
136                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
137         } else
138                 h.seqnum_id = h.file_id;
139
140         k = pwrite(f->fd, &h, sizeof(h), 0);
141         if (k < 0)
142                 return -errno;
143
144         if (k != sizeof(h))
145                 return -EIO;
146
147         return 0;
148 }
149
150 static int journal_file_refresh_header(JournalFile *f) {
151         int r;
152         sd_id128_t boot_id;
153
154         assert(f);
155
156         r = sd_id128_get_machine(&f->header->machine_id);
157         if (r < 0)
158                 return r;
159
160         r = sd_id128_get_boot(&boot_id);
161         if (r < 0)
162                 return r;
163
164         if (sd_id128_equal(boot_id, f->header->boot_id))
165                 f->tail_entry_monotonic_valid = true;
166
167         f->header->boot_id = boot_id;
168
169         f->header->state = STATE_ONLINE;
170
171         /* Sync the online state to disk */
172         msync(f->header, PAGE_ALIGN(sizeof(Header)), MS_SYNC);
173         fdatasync(f->fd);
174
175         return 0;
176 }
177
178 static int journal_file_verify_header(JournalFile *f) {
179         assert(f);
180
181         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
182                 return -EBADMSG;
183
184         /* In both read and write mode we refuse to open files with
185          * incompatible flags we don't know */
186 #ifdef HAVE_XZ
187         if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
188                 return -EPROTONOSUPPORT;
189 #else
190         if (f->header->incompatible_flags != 0)
191                 return -EPROTONOSUPPORT;
192 #endif
193
194         /* When open for writing we refuse to open files with
195          * compatible flags, too */
196         if (f->writable) {
197 #ifdef HAVE_GCRYPT
198                 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
199                         return -EPROTONOSUPPORT;
200 #else
201                 if (f->header->compatible_flags != 0)
202                         return -EPROTONOSUPPORT;
203 #endif
204         }
205
206         if (f->header->state >= _STATE_MAX)
207                 return -EBADMSG;
208
209         /* The first addition was n_data, so check that we are at least this large */
210         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
211                 return -EBADMSG;
212
213         if ((le32toh(f->header->compatible_flags) & HEADER_COMPATIBLE_SEALED) &&
214                 !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
215                 return -EBADMSG;
216
217         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
218                 return -ENODATA;
219
220         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
221                 return -ENODATA;
222
223         if (!VALID64(f->header->data_hash_table_offset) ||
224             !VALID64(f->header->field_hash_table_offset) ||
225             !VALID64(f->header->tail_object_offset) ||
226             !VALID64(f->header->entry_array_offset))
227                 return -ENODATA;
228
229         if (f->writable) {
230                 uint8_t state;
231                 sd_id128_t machine_id;
232                 int r;
233
234                 r = sd_id128_get_machine(&machine_id);
235                 if (r < 0)
236                         return r;
237
238                 if (!sd_id128_equal(machine_id, f->header->machine_id))
239                         return -EHOSTDOWN;
240
241                 state = f->header->state;
242
243                 if (state == STATE_ONLINE) {
244                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
245                         return -EBUSY;
246                 } else if (state == STATE_ARCHIVED)
247                         return -ESHUTDOWN;
248                 else if (state != STATE_OFFLINE) {
249                         log_debug("Journal file %s has unknown state %u.", f->path, state);
250                         return -EBUSY;
251                 }
252         }
253
254         f->compress = !!(le32toh(f->header->incompatible_flags) & HEADER_INCOMPATIBLE_COMPRESSED);
255
256         if (f->writable)
257                 f->seal = !!(le32toh(f->header->compatible_flags) & HEADER_COMPATIBLE_SEALED);
258
259         return 0;
260 }
261
262 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
263         uint64_t old_size, new_size;
264         int r;
265
266         assert(f);
267
268         /* We assume that this file is not sparse, and we know that
269          * for sure, since we always call posix_fallocate()
270          * ourselves */
271
272         old_size =
273                 le64toh(f->header->header_size) +
274                 le64toh(f->header->arena_size);
275
276         new_size = PAGE_ALIGN(offset + size);
277         if (new_size < le64toh(f->header->header_size))
278                 new_size = le64toh(f->header->header_size);
279
280         if (new_size <= old_size)
281                 return 0;
282
283         if (f->metrics.max_size > 0 &&
284             new_size > f->metrics.max_size)
285                 return -E2BIG;
286
287         if (new_size > f->metrics.min_size &&
288             f->metrics.keep_free > 0) {
289                 struct statvfs svfs;
290
291                 if (fstatvfs(f->fd, &svfs) >= 0) {
292                         uint64_t available;
293
294                         available = svfs.f_bfree * svfs.f_bsize;
295
296                         if (available >= f->metrics.keep_free)
297                                 available -= f->metrics.keep_free;
298                         else
299                                 available = 0;
300
301                         if (new_size - old_size > available)
302                                 return -E2BIG;
303                 }
304         }
305
306         /* Note that the glibc fallocate() fallback is very
307            inefficient, hence we try to minimize the allocation area
308            as we can. */
309         r = posix_fallocate(f->fd, old_size, new_size - old_size);
310         if (r != 0)
311                 return -r;
312
313         mmap_cache_close_fd_range(f->mmap, f->fd, old_size);
314
315         if (fstat(f->fd, &f->last_stat) < 0)
316                 return -errno;
317
318         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
319
320         return 0;
321 }
322
323 static int journal_file_move_to(JournalFile *f, int context, uint64_t offset, uint64_t size, void **ret) {
324         assert(f);
325         assert(ret);
326
327         /* Avoid SIGBUS on invalid accesses */
328         if (offset + size > (uint64_t) f->last_stat.st_size) {
329                 /* Hmm, out of range? Let's refresh the fstat() data
330                  * first, before we trust that check. */
331
332                 if (fstat(f->fd, &f->last_stat) < 0 ||
333                     offset + size > (uint64_t) f->last_stat.st_size)
334                         return -EADDRNOTAVAIL;
335         }
336
337         return mmap_cache_get(f->mmap, f->fd, f->prot, context, offset, size, ret);
338 }
339
340 static uint64_t minimum_header_size(Object *o) {
341
342         static uint64_t table[] = {
343                 [OBJECT_DATA] = sizeof(DataObject),
344                 [OBJECT_FIELD] = sizeof(FieldObject),
345                 [OBJECT_ENTRY] = sizeof(EntryObject),
346                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
347                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
348                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
349                 [OBJECT_TAG] = sizeof(TagObject),
350         };
351
352         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
353                 return sizeof(ObjectHeader);
354
355         return table[o->object.type];
356 }
357
358 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
359         int r;
360         void *t;
361         Object *o;
362         uint64_t s;
363         unsigned context;
364
365         assert(f);
366         assert(ret);
367
368         /* Objects may only be located at multiple of 64 bit */
369         if (!VALID64(offset))
370                 return -EFAULT;
371
372         /* One context for each type, plus one catch-all for the rest */
373         context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
374
375         r = journal_file_move_to(f, context, offset, sizeof(ObjectHeader), &t);
376         if (r < 0)
377                 return r;
378
379         o = (Object*) t;
380         s = le64toh(o->object.size);
381
382         if (s < sizeof(ObjectHeader))
383                 return -EBADMSG;
384
385         if (o->object.type <= OBJECT_UNUSED)
386                 return -EBADMSG;
387
388         if (s < minimum_header_size(o))
389                 return -EBADMSG;
390
391         if (type >= 0 && o->object.type != type)
392                 return -EBADMSG;
393
394         if (s > sizeof(ObjectHeader)) {
395                 r = journal_file_move_to(f, o->object.type, offset, s, &t);
396                 if (r < 0)
397                         return r;
398
399                 o = (Object*) t;
400         }
401
402         *ret = o;
403         return 0;
404 }
405
406 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
407         uint64_t r;
408
409         assert(f);
410
411         r = le64toh(f->header->tail_entry_seqnum) + 1;
412
413         if (seqnum) {
414                 /* If an external seqnum counter was passed, we update
415                  * both the local and the external one, and set it to
416                  * the maximum of both */
417
418                 if (*seqnum + 1 > r)
419                         r = *seqnum + 1;
420
421                 *seqnum = r;
422         }
423
424         f->header->tail_entry_seqnum = htole64(r);
425
426         if (f->header->head_entry_seqnum == 0)
427                 f->header->head_entry_seqnum = htole64(r);
428
429         return r;
430 }
431
432 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
433         int r;
434         uint64_t p;
435         Object *tail, *o;
436         void *t;
437
438         assert(f);
439         assert(type > 0 && type < _OBJECT_TYPE_MAX);
440         assert(size >= sizeof(ObjectHeader));
441         assert(offset);
442         assert(ret);
443
444         p = le64toh(f->header->tail_object_offset);
445         if (p == 0)
446                 p = le64toh(f->header->header_size);
447         else {
448                 r = journal_file_move_to_object(f, -1, p, &tail);
449                 if (r < 0)
450                         return r;
451
452                 p += ALIGN64(le64toh(tail->object.size));
453         }
454
455         r = journal_file_allocate(f, p, size);
456         if (r < 0)
457                 return r;
458
459         r = journal_file_move_to(f, type, p, size, &t);
460         if (r < 0)
461                 return r;
462
463         o = (Object*) t;
464
465         zero(o->object);
466         o->object.type = type;
467         o->object.size = htole64(size);
468
469         f->header->tail_object_offset = htole64(p);
470         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
471
472         *ret = o;
473         *offset = p;
474
475         return 0;
476 }
477
478 static int journal_file_setup_data_hash_table(JournalFile *f) {
479         uint64_t s, p;
480         Object *o;
481         int r;
482
483         assert(f);
484
485         /* We estimate that we need 1 hash table entry per 768 of
486            journal file and we want to make sure we never get beyond
487            75% fill level. Calculate the hash table size for the
488            maximum file size based on these metrics. */
489
490         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
491         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
492                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
493
494         log_info("Reserving %llu entries in hash table.", (unsigned long long) (s / sizeof(HashItem)));
495
496         r = journal_file_append_object(f,
497                                        OBJECT_DATA_HASH_TABLE,
498                                        offsetof(Object, hash_table.items) + s,
499                                        &o, &p);
500         if (r < 0)
501                 return r;
502
503         memset(o->hash_table.items, 0, s);
504
505         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
506         f->header->data_hash_table_size = htole64(s);
507
508         return 0;
509 }
510
511 static int journal_file_setup_field_hash_table(JournalFile *f) {
512         uint64_t s, p;
513         Object *o;
514         int r;
515
516         assert(f);
517
518         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
519         r = journal_file_append_object(f,
520                                        OBJECT_FIELD_HASH_TABLE,
521                                        offsetof(Object, hash_table.items) + s,
522                                        &o, &p);
523         if (r < 0)
524                 return r;
525
526         memset(o->hash_table.items, 0, s);
527
528         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
529         f->header->field_hash_table_size = htole64(s);
530
531         return 0;
532 }
533
534 static int journal_file_map_data_hash_table(JournalFile *f) {
535         uint64_t s, p;
536         void *t;
537         int r;
538
539         assert(f);
540
541         p = le64toh(f->header->data_hash_table_offset);
542         s = le64toh(f->header->data_hash_table_size);
543
544         r = journal_file_move_to(f,
545                                  OBJECT_DATA_HASH_TABLE,
546                                  p, s,
547                                  &t);
548         if (r < 0)
549                 return r;
550
551         f->data_hash_table = t;
552         return 0;
553 }
554
555 static int journal_file_map_field_hash_table(JournalFile *f) {
556         uint64_t s, p;
557         void *t;
558         int r;
559
560         assert(f);
561
562         p = le64toh(f->header->field_hash_table_offset);
563         s = le64toh(f->header->field_hash_table_size);
564
565         r = journal_file_move_to(f,
566                                  OBJECT_FIELD_HASH_TABLE,
567                                  p, s,
568                                  &t);
569         if (r < 0)
570                 return r;
571
572         f->field_hash_table = t;
573         return 0;
574 }
575
576 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
577         uint64_t p, h;
578         int r;
579
580         assert(f);
581         assert(o);
582         assert(offset > 0);
583         assert(o->object.type == OBJECT_DATA);
584
585         /* This might alter the window we are looking at */
586
587         o->data.next_hash_offset = o->data.next_field_offset = 0;
588         o->data.entry_offset = o->data.entry_array_offset = 0;
589         o->data.n_entries = 0;
590
591         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
592         p = le64toh(f->data_hash_table[h].tail_hash_offset);
593         if (p == 0) {
594                 /* Only entry in the hash table is easy */
595                 f->data_hash_table[h].head_hash_offset = htole64(offset);
596         } else {
597                 /* Move back to the previous data object, to patch in
598                  * pointer */
599
600                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
601                 if (r < 0)
602                         return r;
603
604                 o->data.next_hash_offset = htole64(offset);
605         }
606
607         f->data_hash_table[h].tail_hash_offset = htole64(offset);
608
609         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
610                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
611
612         return 0;
613 }
614
615 int journal_file_find_data_object_with_hash(
616                 JournalFile *f,
617                 const void *data, uint64_t size, uint64_t hash,
618                 Object **ret, uint64_t *offset) {
619
620         uint64_t p, osize, h;
621         int r;
622
623         assert(f);
624         assert(data || size == 0);
625
626         osize = offsetof(Object, data.payload) + size;
627
628         if (f->header->data_hash_table_size == 0)
629                 return -EBADMSG;
630
631         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
632         p = le64toh(f->data_hash_table[h].head_hash_offset);
633
634         while (p > 0) {
635                 Object *o;
636
637                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
638                 if (r < 0)
639                         return r;
640
641                 if (le64toh(o->data.hash) != hash)
642                         goto next;
643
644                 if (o->object.flags & OBJECT_COMPRESSED) {
645 #ifdef HAVE_XZ
646                         uint64_t l, rsize;
647
648                         l = le64toh(o->object.size);
649                         if (l <= offsetof(Object, data.payload))
650                                 return -EBADMSG;
651
652                         l -= offsetof(Object, data.payload);
653
654                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
655                                 return -EBADMSG;
656
657                         if (rsize == size &&
658                             memcmp(f->compress_buffer, data, size) == 0) {
659
660                                 if (ret)
661                                         *ret = o;
662
663                                 if (offset)
664                                         *offset = p;
665
666                                 return 1;
667                         }
668 #else
669                         return -EPROTONOSUPPORT;
670 #endif
671
672                 } else if (le64toh(o->object.size) == osize &&
673                            memcmp(o->data.payload, data, size) == 0) {
674
675                         if (ret)
676                                 *ret = o;
677
678                         if (offset)
679                                 *offset = p;
680
681                         return 1;
682                 }
683
684         next:
685                 p = le64toh(o->data.next_hash_offset);
686         }
687
688         return 0;
689 }
690
691 int journal_file_find_data_object(
692                 JournalFile *f,
693                 const void *data, uint64_t size,
694                 Object **ret, uint64_t *offset) {
695
696         uint64_t hash;
697
698         assert(f);
699         assert(data || size == 0);
700
701         hash = hash64(data, size);
702
703         return journal_file_find_data_object_with_hash(f,
704                                                        data, size, hash,
705                                                        ret, offset);
706 }
707
708 static int journal_file_append_data(
709                 JournalFile *f,
710                 const void *data, uint64_t size,
711                 Object **ret, uint64_t *offset) {
712
713         uint64_t hash, p;
714         uint64_t osize;
715         Object *o;
716         int r;
717         bool compressed = false;
718
719         assert(f);
720         assert(data || size == 0);
721
722         hash = hash64(data, size);
723
724         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
725         if (r < 0)
726                 return r;
727         else if (r > 0) {
728
729                 if (ret)
730                         *ret = o;
731
732                 if (offset)
733                         *offset = p;
734
735                 return 0;
736         }
737
738         osize = offsetof(Object, data.payload) + size;
739         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
740         if (r < 0)
741                 return r;
742
743         o->data.hash = htole64(hash);
744
745 #ifdef HAVE_XZ
746         if (f->compress &&
747             size >= COMPRESSION_SIZE_THRESHOLD) {
748                 uint64_t rsize;
749
750                 compressed = compress_blob(data, size, o->data.payload, &rsize);
751
752                 if (compressed) {
753                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
754                         o->object.flags |= OBJECT_COMPRESSED;
755
756                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
757                 }
758         }
759 #endif
760
761         if (!compressed && size > 0)
762                 memcpy(o->data.payload, data, size);
763
764         r = journal_file_link_data(f, o, p, hash);
765         if (r < 0)
766                 return r;
767
768         r = journal_file_hmac_put_object(f, OBJECT_DATA, p);
769         if (r < 0)
770                 return r;
771
772         /* The linking might have altered the window, so let's
773          * refresh our pointer */
774         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
775         if (r < 0)
776                 return r;
777
778         if (ret)
779                 *ret = o;
780
781         if (offset)
782                 *offset = p;
783
784         return 0;
785 }
786
787 uint64_t journal_file_entry_n_items(Object *o) {
788         assert(o);
789         assert(o->object.type == OBJECT_ENTRY);
790
791         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
792 }
793
794 uint64_t journal_file_entry_array_n_items(Object *o) {
795         assert(o);
796         assert(o->object.type == OBJECT_ENTRY_ARRAY);
797
798         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
799 }
800
801 uint64_t journal_file_hash_table_n_items(Object *o) {
802         assert(o);
803         assert(o->object.type == OBJECT_DATA_HASH_TABLE ||
804                o->object.type == OBJECT_FIELD_HASH_TABLE);
805
806         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
807 }
808
809 static int link_entry_into_array(JournalFile *f,
810                                  le64_t *first,
811                                  le64_t *idx,
812                                  uint64_t p) {
813         int r;
814         uint64_t n = 0, ap = 0, q, i, a, hidx;
815         Object *o;
816
817         assert(f);
818         assert(first);
819         assert(idx);
820         assert(p > 0);
821
822         a = le64toh(*first);
823         i = hidx = le64toh(*idx);
824         while (a > 0) {
825
826                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
827                 if (r < 0)
828                         return r;
829
830                 n = journal_file_entry_array_n_items(o);
831                 if (i < n) {
832                         o->entry_array.items[i] = htole64(p);
833                         *idx = htole64(hidx + 1);
834                         return 0;
835                 }
836
837                 i -= n;
838                 ap = a;
839                 a = le64toh(o->entry_array.next_entry_array_offset);
840         }
841
842         if (hidx > n)
843                 n = (hidx+1) * 2;
844         else
845                 n = n * 2;
846
847         if (n < 4)
848                 n = 4;
849
850         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
851                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
852                                        &o, &q);
853         if (r < 0)
854                 return r;
855
856         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, q);
857         if (r < 0)
858                 return r;
859
860         o->entry_array.items[i] = htole64(p);
861
862         if (ap == 0)
863                 *first = htole64(q);
864         else {
865                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
866                 if (r < 0)
867                         return r;
868
869                 o->entry_array.next_entry_array_offset = htole64(q);
870         }
871
872         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
873                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
874
875         *idx = htole64(hidx + 1);
876
877         return 0;
878 }
879
880 static int link_entry_into_array_plus_one(JournalFile *f,
881                                           le64_t *extra,
882                                           le64_t *first,
883                                           le64_t *idx,
884                                           uint64_t p) {
885
886         int r;
887
888         assert(f);
889         assert(extra);
890         assert(first);
891         assert(idx);
892         assert(p > 0);
893
894         if (*idx == 0)
895                 *extra = htole64(p);
896         else {
897                 le64_t i;
898
899                 i = htole64(le64toh(*idx) - 1);
900                 r = link_entry_into_array(f, first, &i, p);
901                 if (r < 0)
902                         return r;
903         }
904
905         *idx = htole64(le64toh(*idx) + 1);
906         return 0;
907 }
908
909 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
910         uint64_t p;
911         int r;
912         assert(f);
913         assert(o);
914         assert(offset > 0);
915
916         p = le64toh(o->entry.items[i].object_offset);
917         if (p == 0)
918                 return -EINVAL;
919
920         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
921         if (r < 0)
922                 return r;
923
924         return link_entry_into_array_plus_one(f,
925                                               &o->data.entry_offset,
926                                               &o->data.entry_array_offset,
927                                               &o->data.n_entries,
928                                               offset);
929 }
930
931 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
932         uint64_t n, i;
933         int r;
934
935         assert(f);
936         assert(o);
937         assert(offset > 0);
938         assert(o->object.type == OBJECT_ENTRY);
939
940         __sync_synchronize();
941
942         /* Link up the entry itself */
943         r = link_entry_into_array(f,
944                                   &f->header->entry_array_offset,
945                                   &f->header->n_entries,
946                                   offset);
947         if (r < 0)
948                 return r;
949
950         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
951
952         if (f->header->head_entry_realtime == 0)
953                 f->header->head_entry_realtime = o->entry.realtime;
954
955         f->header->tail_entry_realtime = o->entry.realtime;
956         f->header->tail_entry_monotonic = o->entry.monotonic;
957
958         f->tail_entry_monotonic_valid = true;
959
960         /* Link up the items */
961         n = journal_file_entry_n_items(o);
962         for (i = 0; i < n; i++) {
963                 r = journal_file_link_entry_item(f, o, offset, i);
964                 if (r < 0)
965                         return r;
966         }
967
968         return 0;
969 }
970
971 static int journal_file_append_entry_internal(
972                 JournalFile *f,
973                 const dual_timestamp *ts,
974                 uint64_t xor_hash,
975                 const EntryItem items[], unsigned n_items,
976                 uint64_t *seqnum,
977                 Object **ret, uint64_t *offset) {
978         uint64_t np;
979         uint64_t osize;
980         Object *o;
981         int r;
982
983         assert(f);
984         assert(items || n_items == 0);
985         assert(ts);
986
987         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
988
989         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
990         if (r < 0)
991                 return r;
992
993         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
994         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
995         o->entry.realtime = htole64(ts->realtime);
996         o->entry.monotonic = htole64(ts->monotonic);
997         o->entry.xor_hash = htole64(xor_hash);
998         o->entry.boot_id = f->header->boot_id;
999
1000         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, np);
1001         if (r < 0)
1002                 return r;
1003
1004         r = journal_file_link_entry(f, o, np);
1005         if (r < 0)
1006                 return r;
1007
1008         if (ret)
1009                 *ret = o;
1010
1011         if (offset)
1012                 *offset = np;
1013
1014         return 0;
1015 }
1016
1017 void journal_file_post_change(JournalFile *f) {
1018         assert(f);
1019
1020         /* inotify() does not receive IN_MODIFY events from file
1021          * accesses done via mmap(). After each access we hence
1022          * trigger IN_MODIFY by truncating the journal file to its
1023          * current size which triggers IN_MODIFY. */
1024
1025         __sync_synchronize();
1026
1027         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1028                 log_error("Failed to to truncate file to its own size: %m");
1029 }
1030
1031 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1032         unsigned i;
1033         EntryItem *items;
1034         int r;
1035         uint64_t xor_hash = 0;
1036         struct dual_timestamp _ts;
1037
1038         assert(f);
1039         assert(iovec || n_iovec == 0);
1040
1041         if (!f->writable)
1042                 return -EPERM;
1043
1044         if (!ts) {
1045                 dual_timestamp_get(&_ts);
1046                 ts = &_ts;
1047         }
1048
1049         if (f->tail_entry_monotonic_valid &&
1050             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1051                 return -EINVAL;
1052
1053         r = journal_file_maybe_append_tag(f, ts->realtime);
1054         if (r < 0)
1055                 return r;
1056
1057         /* alloca() can't take 0, hence let's allocate at least one */
1058         items = alloca(sizeof(EntryItem) * MAX(1, n_iovec));
1059
1060         for (i = 0; i < n_iovec; i++) {
1061                 uint64_t p;
1062                 Object *o;
1063
1064                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1065                 if (r < 0)
1066                         return r;
1067
1068                 xor_hash ^= le64toh(o->data.hash);
1069                 items[i].object_offset = htole64(p);
1070                 items[i].hash = o->data.hash;
1071         }
1072
1073         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1074
1075         journal_file_post_change(f);
1076
1077         return r;
1078 }
1079
1080 static int generic_array_get(JournalFile *f,
1081                              uint64_t first,
1082                              uint64_t i,
1083                              Object **ret, uint64_t *offset) {
1084
1085         Object *o;
1086         uint64_t p = 0, a;
1087         int r;
1088
1089         assert(f);
1090
1091         a = first;
1092         while (a > 0) {
1093                 uint64_t n;
1094
1095                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1096                 if (r < 0)
1097                         return r;
1098
1099                 n = journal_file_entry_array_n_items(o);
1100                 if (i < n) {
1101                         p = le64toh(o->entry_array.items[i]);
1102                         break;
1103                 }
1104
1105                 i -= n;
1106                 a = le64toh(o->entry_array.next_entry_array_offset);
1107         }
1108
1109         if (a <= 0 || p <= 0)
1110                 return 0;
1111
1112         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1113         if (r < 0)
1114                 return r;
1115
1116         if (ret)
1117                 *ret = o;
1118
1119         if (offset)
1120                 *offset = p;
1121
1122         return 1;
1123 }
1124
1125 static int generic_array_get_plus_one(JournalFile *f,
1126                                       uint64_t extra,
1127                                       uint64_t first,
1128                                       uint64_t i,
1129                                       Object **ret, uint64_t *offset) {
1130
1131         Object *o;
1132
1133         assert(f);
1134
1135         if (i == 0) {
1136                 int r;
1137
1138                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1139                 if (r < 0)
1140                         return r;
1141
1142                 if (ret)
1143                         *ret = o;
1144
1145                 if (offset)
1146                         *offset = extra;
1147
1148                 return 1;
1149         }
1150
1151         return generic_array_get(f, first, i-1, ret, offset);
1152 }
1153
/* Results of the test_object() callbacks used when bisecting */
enum {
        TEST_FOUND = 0,  /* the tested item matches the needle */
        TEST_LEFT  = 1,  /* the tested item is smaller than the needle */
        TEST_RIGHT = 2   /* the tested item is larger than the needle */
};
1159
1160 static int generic_array_bisect(JournalFile *f,
1161                                 uint64_t first,
1162                                 uint64_t n,
1163                                 uint64_t needle,
1164                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1165                                 direction_t direction,
1166                                 Object **ret,
1167                                 uint64_t *offset,
1168                                 uint64_t *idx) {
1169
1170         uint64_t a, p, t = 0, i = 0, last_p = 0;
1171         bool subtract_one = false;
1172         Object *o, *array = NULL;
1173         int r;
1174
1175         assert(f);
1176         assert(test_object);
1177
1178         a = first;
1179         while (a > 0) {
1180                 uint64_t left, right, k, lp;
1181
1182                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1183                 if (r < 0)
1184                         return r;
1185
1186                 k = journal_file_entry_array_n_items(array);
1187                 right = MIN(k, n);
1188                 if (right <= 0)
1189                         return 0;
1190
1191                 i = right - 1;
1192                 lp = p = le64toh(array->entry_array.items[i]);
1193                 if (p <= 0)
1194                         return -EBADMSG;
1195
1196                 r = test_object(f, p, needle);
1197                 if (r < 0)
1198                         return r;
1199
1200                 if (r == TEST_FOUND)
1201                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1202
1203                 if (r == TEST_RIGHT) {
1204                         left = 0;
1205                         right -= 1;
1206                         for (;;) {
1207                                 if (left == right) {
1208                                         if (direction == DIRECTION_UP)
1209                                                 subtract_one = true;
1210
1211                                         i = left;
1212                                         goto found;
1213                                 }
1214
1215                                 assert(left < right);
1216
1217                                 i = (left + right) / 2;
1218                                 p = le64toh(array->entry_array.items[i]);
1219                                 if (p <= 0)
1220                                         return -EBADMSG;
1221
1222                                 r = test_object(f, p, needle);
1223                                 if (r < 0)
1224                                         return r;
1225
1226                                 if (r == TEST_FOUND)
1227                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1228
1229                                 if (r == TEST_RIGHT)
1230                                         right = i;
1231                                 else
1232                                         left = i + 1;
1233                         }
1234                 }
1235
1236                 if (k > n) {
1237                         if (direction == DIRECTION_UP) {
1238                                 i = n;
1239                                 subtract_one = true;
1240                                 goto found;
1241                         }
1242
1243                         return 0;
1244                 }
1245
1246                 last_p = lp;
1247
1248                 n -= k;
1249                 t += k;
1250                 a = le64toh(array->entry_array.next_entry_array_offset);
1251         }
1252
1253         return 0;
1254
1255 found:
1256         if (subtract_one && t == 0 && i == 0)
1257                 return 0;
1258
1259         if (subtract_one && i == 0)
1260                 p = last_p;
1261         else if (subtract_one)
1262                 p = le64toh(array->entry_array.items[i-1]);
1263         else
1264                 p = le64toh(array->entry_array.items[i]);
1265
1266         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1267         if (r < 0)
1268                 return r;
1269
1270         if (ret)
1271                 *ret = o;
1272
1273         if (offset)
1274                 *offset = p;
1275
1276         if (idx)
1277                 *idx = t + i + (subtract_one ? -1 : 0);
1278
1279         return 1;
1280 }
1281
1282 static int generic_array_bisect_plus_one(JournalFile *f,
1283                                          uint64_t extra,
1284                                          uint64_t first,
1285                                          uint64_t n,
1286                                          uint64_t needle,
1287                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1288                                          direction_t direction,
1289                                          Object **ret,
1290                                          uint64_t *offset,
1291                                          uint64_t *idx) {
1292
1293         int r;
1294         bool step_back = false;
1295         Object *o;
1296
1297         assert(f);
1298         assert(test_object);
1299
1300         if (n <= 0)
1301                 return 0;
1302
1303         /* This bisects the array in object 'first', but first checks
1304          * an extra  */
1305         r = test_object(f, extra, needle);
1306         if (r < 0)
1307                 return r;
1308
1309         if (r == TEST_FOUND)
1310                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1311
1312         /* if we are looking with DIRECTION_UP then we need to first
1313            see if in the actual array there is a matching entry, and
1314            return the last one of that. But if there isn't any we need
1315            to return this one. Hence remember this, and return it
1316            below. */
1317         if (r == TEST_LEFT)
1318                 step_back = direction == DIRECTION_UP;
1319
1320         if (r == TEST_RIGHT) {
1321                 if (direction == DIRECTION_DOWN)
1322                         goto found;
1323                 else
1324                         return 0;
1325         }
1326
1327         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1328
1329         if (r == 0 && step_back)
1330                 goto found;
1331
1332         if (r > 0 && idx)
1333                 (*idx) ++;
1334
1335         return r;
1336
1337 found:
1338         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1339         if (r < 0)
1340                 return r;
1341
1342         if (ret)
1343                 *ret = o;
1344
1345         if (offset)
1346                 *offset = extra;
1347
1348         if (idx)
1349                 *idx = 0;
1350
1351         return 1;
1352 }
1353
1354 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1355         assert(f);
1356         assert(p > 0);
1357
1358         if (p == needle)
1359                 return TEST_FOUND;
1360         else if (p < needle)
1361                 return TEST_LEFT;
1362         else
1363                 return TEST_RIGHT;
1364 }
1365
1366 int journal_file_move_to_entry_by_offset(
1367                 JournalFile *f,
1368                 uint64_t p,
1369                 direction_t direction,
1370                 Object **ret,
1371                 uint64_t *offset) {
1372
1373         return generic_array_bisect(f,
1374                                     le64toh(f->header->entry_array_offset),
1375                                     le64toh(f->header->n_entries),
1376                                     p,
1377                                     test_object_offset,
1378                                     direction,
1379                                     ret, offset, NULL);
1380 }
1381
1382
1383 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1384         Object *o;
1385         int r;
1386
1387         assert(f);
1388         assert(p > 0);
1389
1390         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1391         if (r < 0)
1392                 return r;
1393
1394         if (le64toh(o->entry.seqnum) == needle)
1395                 return TEST_FOUND;
1396         else if (le64toh(o->entry.seqnum) < needle)
1397                 return TEST_LEFT;
1398         else
1399                 return TEST_RIGHT;
1400 }
1401
1402 int journal_file_move_to_entry_by_seqnum(
1403                 JournalFile *f,
1404                 uint64_t seqnum,
1405                 direction_t direction,
1406                 Object **ret,
1407                 uint64_t *offset) {
1408
1409         return generic_array_bisect(f,
1410                                     le64toh(f->header->entry_array_offset),
1411                                     le64toh(f->header->n_entries),
1412                                     seqnum,
1413                                     test_object_seqnum,
1414                                     direction,
1415                                     ret, offset, NULL);
1416 }
1417
1418 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1419         Object *o;
1420         int r;
1421
1422         assert(f);
1423         assert(p > 0);
1424
1425         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1426         if (r < 0)
1427                 return r;
1428
1429         if (le64toh(o->entry.realtime) == needle)
1430                 return TEST_FOUND;
1431         else if (le64toh(o->entry.realtime) < needle)
1432                 return TEST_LEFT;
1433         else
1434                 return TEST_RIGHT;
1435 }
1436
1437 int journal_file_move_to_entry_by_realtime(
1438                 JournalFile *f,
1439                 uint64_t realtime,
1440                 direction_t direction,
1441                 Object **ret,
1442                 uint64_t *offset) {
1443
1444         return generic_array_bisect(f,
1445                                     le64toh(f->header->entry_array_offset),
1446                                     le64toh(f->header->n_entries),
1447                                     realtime,
1448                                     test_object_realtime,
1449                                     direction,
1450                                     ret, offset, NULL);
1451 }
1452
1453 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1454         Object *o;
1455         int r;
1456
1457         assert(f);
1458         assert(p > 0);
1459
1460         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1461         if (r < 0)
1462                 return r;
1463
1464         if (le64toh(o->entry.monotonic) == needle)
1465                 return TEST_FOUND;
1466         else if (le64toh(o->entry.monotonic) < needle)
1467                 return TEST_LEFT;
1468         else
1469                 return TEST_RIGHT;
1470 }
1471
1472 int journal_file_move_to_entry_by_monotonic(
1473                 JournalFile *f,
1474                 sd_id128_t boot_id,
1475                 uint64_t monotonic,
1476                 direction_t direction,
1477                 Object **ret,
1478                 uint64_t *offset) {
1479
1480         char t[9+32+1] = "_BOOT_ID=";
1481         Object *o;
1482         int r;
1483
1484         assert(f);
1485
1486         sd_id128_to_string(boot_id, t + 9);
1487         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1488         if (r < 0)
1489                 return r;
1490         if (r == 0)
1491                 return -ENOENT;
1492
1493         return generic_array_bisect_plus_one(f,
1494                                              le64toh(o->data.entry_offset),
1495                                              le64toh(o->data.entry_array_offset),
1496                                              le64toh(o->data.n_entries),
1497                                              monotonic,
1498                                              test_object_monotonic,
1499                                              direction,
1500                                              ret, offset, NULL);
1501 }
1502
1503 int journal_file_next_entry(
1504                 JournalFile *f,
1505                 Object *o, uint64_t p,
1506                 direction_t direction,
1507                 Object **ret, uint64_t *offset) {
1508
1509         uint64_t i, n;
1510         int r;
1511
1512         assert(f);
1513         assert(p > 0 || !o);
1514
1515         n = le64toh(f->header->n_entries);
1516         if (n <= 0)
1517                 return 0;
1518
1519         if (!o)
1520                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1521         else {
1522                 if (o->object.type != OBJECT_ENTRY)
1523                         return -EINVAL;
1524
1525                 r = generic_array_bisect(f,
1526                                          le64toh(f->header->entry_array_offset),
1527                                          le64toh(f->header->n_entries),
1528                                          p,
1529                                          test_object_offset,
1530                                          DIRECTION_DOWN,
1531                                          NULL, NULL,
1532                                          &i);
1533                 if (r <= 0)
1534                         return r;
1535
1536                 if (direction == DIRECTION_DOWN) {
1537                         if (i >= n - 1)
1538                                 return 0;
1539
1540                         i++;
1541                 } else {
1542                         if (i <= 0)
1543                                 return 0;
1544
1545                         i--;
1546                 }
1547         }
1548
1549         /* And jump to it */
1550         return generic_array_get(f,
1551                                  le64toh(f->header->entry_array_offset),
1552                                  i,
1553                                  ret, offset);
1554 }
1555
1556 int journal_file_skip_entry(
1557                 JournalFile *f,
1558                 Object *o, uint64_t p,
1559                 int64_t skip,
1560                 Object **ret, uint64_t *offset) {
1561
1562         uint64_t i, n;
1563         int r;
1564
1565         assert(f);
1566         assert(o);
1567         assert(p > 0);
1568
1569         if (o->object.type != OBJECT_ENTRY)
1570                 return -EINVAL;
1571
1572         r = generic_array_bisect(f,
1573                                  le64toh(f->header->entry_array_offset),
1574                                  le64toh(f->header->n_entries),
1575                                  p,
1576                                  test_object_offset,
1577                                  DIRECTION_DOWN,
1578                                  NULL, NULL,
1579                                  &i);
1580         if (r <= 0)
1581                 return r;
1582
1583         /* Calculate new index */
1584         if (skip < 0) {
1585                 if ((uint64_t) -skip >= i)
1586                         i = 0;
1587                 else
1588                         i = i - (uint64_t) -skip;
1589         } else
1590                 i  += (uint64_t) skip;
1591
1592         n = le64toh(f->header->n_entries);
1593         if (n <= 0)
1594                 return -EBADMSG;
1595
1596         if (i >= n)
1597                 i = n-1;
1598
1599         return generic_array_get(f,
1600                                  le64toh(f->header->entry_array_offset),
1601                                  i,
1602                                  ret, offset);
1603 }
1604
1605 int journal_file_next_entry_for_data(
1606                 JournalFile *f,
1607                 Object *o, uint64_t p,
1608                 uint64_t data_offset,
1609                 direction_t direction,
1610                 Object **ret, uint64_t *offset) {
1611
1612         uint64_t n, i;
1613         int r;
1614         Object *d;
1615
1616         assert(f);
1617         assert(p > 0 || !o);
1618
1619         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1620         if (r < 0)
1621                 return r;
1622
1623         n = le64toh(d->data.n_entries);
1624         if (n <= 0)
1625                 return n;
1626
1627         if (!o)
1628                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1629         else {
1630                 if (o->object.type != OBJECT_ENTRY)
1631                         return -EINVAL;
1632
1633                 r = generic_array_bisect_plus_one(f,
1634                                                   le64toh(d->data.entry_offset),
1635                                                   le64toh(d->data.entry_array_offset),
1636                                                   le64toh(d->data.n_entries),
1637                                                   p,
1638                                                   test_object_offset,
1639                                                   DIRECTION_DOWN,
1640                                                   NULL, NULL,
1641                                                   &i);
1642
1643                 if (r <= 0)
1644                         return r;
1645
1646                 if (direction == DIRECTION_DOWN) {
1647                         if (i >= n - 1)
1648                                 return 0;
1649
1650                         i++;
1651                 } else {
1652                         if (i <= 0)
1653                                 return 0;
1654
1655                         i--;
1656                 }
1657
1658         }
1659
1660         return generic_array_get_plus_one(f,
1661                                           le64toh(d->data.entry_offset),
1662                                           le64toh(d->data.entry_array_offset),
1663                                           i,
1664                                           ret, offset);
1665 }
1666
1667 int journal_file_move_to_entry_by_offset_for_data(
1668                 JournalFile *f,
1669                 uint64_t data_offset,
1670                 uint64_t p,
1671                 direction_t direction,
1672                 Object **ret, uint64_t *offset) {
1673
1674         int r;
1675         Object *d;
1676
1677         assert(f);
1678
1679         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1680         if (r < 0)
1681                 return r;
1682
1683         return generic_array_bisect_plus_one(f,
1684                                              le64toh(d->data.entry_offset),
1685                                              le64toh(d->data.entry_array_offset),
1686                                              le64toh(d->data.n_entries),
1687                                              p,
1688                                              test_object_offset,
1689                                              direction,
1690                                              ret, offset, NULL);
1691 }
1692
1693 int journal_file_move_to_entry_by_monotonic_for_data(
1694                 JournalFile *f,
1695                 uint64_t data_offset,
1696                 sd_id128_t boot_id,
1697                 uint64_t monotonic,
1698                 direction_t direction,
1699                 Object **ret, uint64_t *offset) {
1700
1701         char t[9+32+1] = "_BOOT_ID=";
1702         Object *o, *d;
1703         int r;
1704         uint64_t b, z;
1705
1706         assert(f);
1707
1708         /* First, seek by time */
1709         sd_id128_to_string(boot_id, t + 9);
1710         r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
1711         if (r < 0)
1712                 return r;
1713         if (r == 0)
1714                 return -ENOENT;
1715
1716         r = generic_array_bisect_plus_one(f,
1717                                           le64toh(o->data.entry_offset),
1718                                           le64toh(o->data.entry_array_offset),
1719                                           le64toh(o->data.n_entries),
1720                                           monotonic,
1721                                           test_object_monotonic,
1722                                           direction,
1723                                           NULL, &z, NULL);
1724         if (r <= 0)
1725                 return r;
1726
1727         /* And now, continue seeking until we find an entry that
1728          * exists in both bisection arrays */
1729
1730         for (;;) {
1731                 Object *qo;
1732                 uint64_t p, q;
1733
1734                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1735                 if (r < 0)
1736                         return r;
1737
1738                 r = generic_array_bisect_plus_one(f,
1739                                                   le64toh(d->data.entry_offset),
1740                                                   le64toh(d->data.entry_array_offset),
1741                                                   le64toh(d->data.n_entries),
1742                                                   z,
1743                                                   test_object_offset,
1744                                                   direction,
1745                                                   NULL, &p, NULL);
1746                 if (r <= 0)
1747                         return r;
1748
1749                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
1750                 if (r < 0)
1751                         return r;
1752
1753                 r = generic_array_bisect_plus_one(f,
1754                                                   le64toh(o->data.entry_offset),
1755                                                   le64toh(o->data.entry_array_offset),
1756                                                   le64toh(o->data.n_entries),
1757                                                   p,
1758                                                   test_object_offset,
1759                                                   direction,
1760                                                   &qo, &q, NULL);
1761
1762                 if (r <= 0)
1763                         return r;
1764
1765                 if (p == q) {
1766                         if (ret)
1767                                 *ret = qo;
1768                         if (offset)
1769                                 *offset = q;
1770
1771                         return 1;
1772                 }
1773
1774                 z = q;
1775         }
1776
1777         return 0;
1778 }
1779
1780 int journal_file_move_to_entry_by_seqnum_for_data(
1781                 JournalFile *f,
1782                 uint64_t data_offset,
1783                 uint64_t seqnum,
1784                 direction_t direction,
1785                 Object **ret, uint64_t *offset) {
1786
1787         Object *d;
1788         int r;
1789
1790         assert(f);
1791
1792         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1793         if (r < 0)
1794                 return r;
1795
1796         return generic_array_bisect_plus_one(f,
1797                                              le64toh(d->data.entry_offset),
1798                                              le64toh(d->data.entry_array_offset),
1799                                              le64toh(d->data.n_entries),
1800                                              seqnum,
1801                                              test_object_seqnum,
1802                                              direction,
1803                                              ret, offset, NULL);
1804 }
1805
1806 int journal_file_move_to_entry_by_realtime_for_data(
1807                 JournalFile *f,
1808                 uint64_t data_offset,
1809                 uint64_t realtime,
1810                 direction_t direction,
1811                 Object **ret, uint64_t *offset) {
1812
1813         Object *d;
1814         int r;
1815
1816         assert(f);
1817
1818         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1819         if (r < 0)
1820                 return r;
1821
1822         return generic_array_bisect_plus_one(f,
1823                                              le64toh(d->data.entry_offset),
1824                                              le64toh(d->data.entry_array_offset),
1825                                              le64toh(d->data.n_entries),
1826                                              realtime,
1827                                              test_object_realtime,
1828                                              direction,
1829                                              ret, offset, NULL);
1830 }
1831
1832 void journal_file_dump(JournalFile *f) {
1833         Object *o;
1834         int r;
1835         uint64_t p;
1836
1837         assert(f);
1838
1839         journal_file_print_header(f);
1840
1841         p = le64toh(f->header->header_size);
1842         while (p != 0) {
1843                 r = journal_file_move_to_object(f, -1, p, &o);
1844                 if (r < 0)
1845                         goto fail;
1846
1847                 switch (o->object.type) {
1848
1849                 case OBJECT_UNUSED:
1850                         printf("Type: OBJECT_UNUSED\n");
1851                         break;
1852
1853                 case OBJECT_DATA:
1854                         printf("Type: OBJECT_DATA\n");
1855                         break;
1856
1857                 case OBJECT_ENTRY:
1858                         printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1859                                (unsigned long long) le64toh(o->entry.seqnum),
1860                                (unsigned long long) le64toh(o->entry.monotonic),
1861                                (unsigned long long) le64toh(o->entry.realtime));
1862                         break;
1863
1864                 case OBJECT_FIELD_HASH_TABLE:
1865                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1866                         break;
1867
1868                 case OBJECT_DATA_HASH_TABLE:
1869                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
1870                         break;
1871
1872                 case OBJECT_ENTRY_ARRAY:
1873                         printf("Type: OBJECT_ENTRY_ARRAY\n");
1874                         break;
1875
1876                 case OBJECT_TAG:
1877                         printf("Type: OBJECT_TAG %llu\n",
1878                                (unsigned long long) le64toh(o->tag.seqnum));
1879                         break;
1880                 }
1881
1882                 if (o->object.flags & OBJECT_COMPRESSED)
1883                         printf("Flags: COMPRESSED\n");
1884
1885                 if (p == le64toh(f->header->tail_object_offset))
1886                         p = 0;
1887                 else
1888                         p = p + ALIGN64(le64toh(o->object.size));
1889         }
1890
1891         return;
1892 fail:
1893         log_error("File corrupt");
1894 }
1895
1896 void journal_file_print_header(JournalFile *f) {
1897         char a[33], b[33], c[33];
1898         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
1899
1900         assert(f);
1901
1902         printf("File Path: %s\n"
1903                "File ID: %s\n"
1904                "Machine ID: %s\n"
1905                "Boot ID: %s\n"
1906                "Sequential Number ID: %s\n"
1907                "State: %s\n"
1908                "Compatible Flags:%s%s\n"
1909                "Incompatible Flags:%s%s\n"
1910                "Header size: %llu\n"
1911                "Arena size: %llu\n"
1912                "Data Hash Table Size: %llu\n"
1913                "Field Hash Table Size: %llu\n"
1914                "Rotate Suggested: %s\n"
1915                "Head Sequential Number: %llu\n"
1916                "Tail Sequential Number: %llu\n"
1917                "Head Realtime Timestamp: %s\n"
1918                "Tail Realtime Timestamp: %s\n"
1919                "Objects: %llu\n"
1920                "Entry Objects: %llu\n",
1921                f->path,
1922                sd_id128_to_string(f->header->file_id, a),
1923                sd_id128_to_string(f->header->machine_id, b),
1924                sd_id128_to_string(f->header->boot_id, c),
1925                sd_id128_to_string(f->header->seqnum_id, c),
1926                f->header->state == STATE_OFFLINE ? "OFFLINE" :
1927                f->header->state == STATE_ONLINE ? "ONLINE" :
1928                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
1929                (f->header->compatible_flags & HEADER_COMPATIBLE_SEALED) ? " SEALED" : "",
1930                (f->header->compatible_flags & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
1931                (f->header->incompatible_flags & HEADER_INCOMPATIBLE_COMPRESSED) ? " COMPRESSED" : "",
1932                (f->header->incompatible_flags & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
1933                (unsigned long long) le64toh(f->header->header_size),
1934                (unsigned long long) le64toh(f->header->arena_size),
1935                (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
1936                (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
1937                yes_no(journal_file_rotate_suggested(f)),
1938                (unsigned long long) le64toh(f->header->head_entry_seqnum),
1939                (unsigned long long) le64toh(f->header->tail_entry_seqnum),
1940                format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
1941                format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
1942                (unsigned long long) le64toh(f->header->n_objects),
1943                (unsigned long long) le64toh(f->header->n_entries));
1944
1945         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1946                 printf("Data Objects: %llu\n"
1947                        "Data Hash Table Fill: %.1f%%\n",
1948                        (unsigned long long) le64toh(f->header->n_data),
1949                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
1950
1951         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1952                 printf("Field Objects: %llu\n"
1953                        "Field Hash Table Fill: %.1f%%\n",
1954                        (unsigned long long) le64toh(f->header->n_fields),
1955                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
1956
1957         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
1958                 printf("Tag Objects: %llu\n",
1959                        (unsigned long long) le64toh(f->header->n_tags));
1960         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1961                 printf("Entry Array Objects: %llu\n",
1962                        (unsigned long long) le64toh(f->header->n_entry_arrays));
1963 }
1964
1965 int journal_file_open(
1966                 const char *fname,
1967                 int flags,
1968                 mode_t mode,
1969                 bool compress,
1970                 bool seal,
1971                 JournalMetrics *metrics,
1972                 MMapCache *mmap_cache,
1973                 JournalFile *template,
1974                 JournalFile **ret) {
1975
1976         JournalFile *f;
1977         int r;
1978         bool newly_created = false;
1979
1980         assert(fname);
1981
1982         if ((flags & O_ACCMODE) != O_RDONLY &&
1983             (flags & O_ACCMODE) != O_RDWR)
1984                 return -EINVAL;
1985
1986         if (!endswith(fname, ".journal") &&
1987             !endswith(fname, ".journal~"))
1988                 return -EINVAL;
1989
1990         f = new0(JournalFile, 1);
1991         if (!f)
1992                 return -ENOMEM;
1993
1994         f->fd = -1;
1995         f->mode = mode;
1996
1997         f->flags = flags;
1998         f->prot = prot_from_flags(flags);
1999         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2000         f->compress = compress;
2001         f->seal = seal;
2002
2003         if (mmap_cache)
2004                 f->mmap = mmap_cache_ref(mmap_cache);
2005         else {
2006                 f->mmap = mmap_cache_new();
2007                 if (!f->mmap) {
2008                         r = -ENOMEM;
2009                         goto fail;
2010                 }
2011         }
2012
2013         f->path = strdup(fname);
2014         if (!f->path) {
2015                 r = -ENOMEM;
2016                 goto fail;
2017         }
2018
2019         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2020         if (f->fd < 0) {
2021                 r = -errno;
2022                 goto fail;
2023         }
2024
2025         if (fstat(f->fd, &f->last_stat) < 0) {
2026                 r = -errno;
2027                 goto fail;
2028         }
2029
2030         if (f->last_stat.st_size == 0 && f->writable) {
2031                 newly_created = true;
2032
2033                 /* Try to load the FSPRG state, and if we can't, then
2034                  * just don't do sealing */
2035                 r = journal_file_fss_load(f);
2036                 if (r < 0)
2037                         f->seal = false;
2038
2039                 r = journal_file_init_header(f, template);
2040                 if (r < 0)
2041                         goto fail;
2042
2043                 if (fstat(f->fd, &f->last_stat) < 0) {
2044                         r = -errno;
2045                         goto fail;
2046                 }
2047         }
2048
2049         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2050                 r = -EIO;
2051                 goto fail;
2052         }
2053
2054         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2055         if (f->header == MAP_FAILED) {
2056                 f->header = NULL;
2057                 r = -errno;
2058                 goto fail;
2059         }
2060
2061         if (!newly_created) {
2062                 r = journal_file_verify_header(f);
2063                 if (r < 0)
2064                         goto fail;
2065         }
2066
2067         if (!newly_created && f->writable) {
2068                 r = journal_file_fss_load(f);
2069                 if (r < 0)
2070                         goto fail;
2071         }
2072
2073         if (f->writable) {
2074                 if (metrics) {
2075                         journal_default_metrics(metrics, f->fd);
2076                         f->metrics = *metrics;
2077                 } else if (template)
2078                         f->metrics = template->metrics;
2079
2080                 r = journal_file_refresh_header(f);
2081                 if (r < 0)
2082                         goto fail;
2083         }
2084
2085         r = journal_file_hmac_setup(f);
2086         if (r < 0)
2087                 goto fail;
2088
2089         if (newly_created) {
2090                 r = journal_file_setup_field_hash_table(f);
2091                 if (r < 0)
2092                         goto fail;
2093
2094                 r = journal_file_setup_data_hash_table(f);
2095                 if (r < 0)
2096                         goto fail;
2097
2098                 r = journal_file_append_first_tag(f);
2099                 if (r < 0)
2100                         goto fail;
2101         }
2102
2103         r = journal_file_map_field_hash_table(f);
2104         if (r < 0)
2105                 goto fail;
2106
2107         r = journal_file_map_data_hash_table(f);
2108         if (r < 0)
2109                 goto fail;
2110
2111         if (ret)
2112                 *ret = f;
2113
2114         return 0;
2115
2116 fail:
2117         journal_file_close(f);
2118
2119         return r;
2120 }
2121
2122 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2123         char *p;
2124         size_t l;
2125         JournalFile *old_file, *new_file = NULL;
2126         int r;
2127
2128         assert(f);
2129         assert(*f);
2130
2131         old_file = *f;
2132
2133         if (!old_file->writable)
2134                 return -EINVAL;
2135
2136         if (!endswith(old_file->path, ".journal"))
2137                 return -EINVAL;
2138
2139         l = strlen(old_file->path);
2140
2141         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
2142         if (!p)
2143                 return -ENOMEM;
2144
2145         memcpy(p, old_file->path, l - 8);
2146         p[l-8] = '@';
2147         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2148         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2149                  "-%016llx-%016llx.journal",
2150                  (unsigned long long) le64toh((*f)->header->tail_entry_seqnum),
2151                  (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
2152
2153         r = rename(old_file->path, p);
2154         free(p);
2155
2156         if (r < 0)
2157                 return -errno;
2158
2159         old_file->header->state = STATE_ARCHIVED;
2160
2161         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2162         journal_file_close(old_file);
2163
2164         *f = new_file;
2165         return r;
2166 }
2167
2168 int journal_file_open_reliably(
2169                 const char *fname,
2170                 int flags,
2171                 mode_t mode,
2172                 bool compress,
2173                 bool seal,
2174                 JournalMetrics *metrics,
2175                 MMapCache *mmap_cache,
2176                 JournalFile *template,
2177                 JournalFile **ret) {
2178
2179         int r;
2180         size_t l;
2181         char *p;
2182
2183         r = journal_file_open(fname, flags, mode, compress, seal,
2184                               metrics, mmap_cache, template, ret);
2185         if (r != -EBADMSG && /* corrupted */
2186             r != -ENODATA && /* truncated */
2187             r != -EHOSTDOWN && /* other machine */
2188             r != -EPROTONOSUPPORT && /* incompatible feature */
2189             r != -EBUSY && /* unclean shutdown */
2190             r != -ESHUTDOWN /* already archived */)
2191                 return r;
2192
2193         if ((flags & O_ACCMODE) == O_RDONLY)
2194                 return r;
2195
2196         if (!(flags & O_CREAT))
2197                 return r;
2198
2199         if (!endswith(fname, ".journal"))
2200                 return r;
2201
2202         /* The file is corrupted. Rotate it away and try it again (but only once) */
2203
2204         l = strlen(fname);
2205         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2206                      (int) (l-8), fname,
2207                      (unsigned long long) now(CLOCK_REALTIME),
2208                      random_ull()) < 0)
2209                 return -ENOMEM;
2210
2211         r = rename(fname, p);
2212         free(p);
2213         if (r < 0)
2214                 return -errno;
2215
2216         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2217
2218         return journal_file_open(fname, flags, mode, compress, seal,
2219                                  metrics, mmap_cache, template, ret);
2220 }
2221
2222
2223 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2224         uint64_t i, n;
2225         uint64_t q, xor_hash = 0;
2226         int r;
2227         EntryItem *items;
2228         dual_timestamp ts;
2229
2230         assert(from);
2231         assert(to);
2232         assert(o);
2233         assert(p);
2234
2235         if (!to->writable)
2236                 return -EPERM;
2237
2238         ts.monotonic = le64toh(o->entry.monotonic);
2239         ts.realtime = le64toh(o->entry.realtime);
2240
2241         if (to->tail_entry_monotonic_valid &&
2242             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2243                 return -EINVAL;
2244
2245         n = journal_file_entry_n_items(o);
2246         items = alloca(sizeof(EntryItem) * n);
2247
2248         for (i = 0; i < n; i++) {
2249                 uint64_t l, h;
2250                 le64_t le_hash;
2251                 size_t t;
2252                 void *data;
2253                 Object *u;
2254
2255                 q = le64toh(o->entry.items[i].object_offset);
2256                 le_hash = o->entry.items[i].hash;
2257
2258                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2259                 if (r < 0)
2260                         return r;
2261
2262                 if (le_hash != o->data.hash)
2263                         return -EBADMSG;
2264
2265                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2266                 t = (size_t) l;
2267
2268                 /* We hit the limit on 32bit machines */
2269                 if ((uint64_t) t != l)
2270                         return -E2BIG;
2271
2272                 if (o->object.flags & OBJECT_COMPRESSED) {
2273 #ifdef HAVE_XZ
2274                         uint64_t rsize;
2275
2276                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2277                                 return -EBADMSG;
2278
2279                         data = from->compress_buffer;
2280                         l = rsize;
2281 #else
2282                         return -EPROTONOSUPPORT;
2283 #endif
2284                 } else
2285                         data = o->data.payload;
2286
2287                 r = journal_file_append_data(to, data, l, &u, &h);
2288                 if (r < 0)
2289                         return r;
2290
2291                 xor_hash ^= le64toh(u->data.hash);
2292                 items[i].object_offset = htole64(h);
2293                 items[i].hash = u->data.hash;
2294
2295                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2296                 if (r < 0)
2297                         return r;
2298         }
2299
2300         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2301 }
2302
2303 void journal_default_metrics(JournalMetrics *m, int fd) {
2304         uint64_t fs_size = 0;
2305         struct statvfs ss;
2306         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2307
2308         assert(m);
2309         assert(fd >= 0);
2310
2311         if (fstatvfs(fd, &ss) >= 0)
2312                 fs_size = ss.f_frsize * ss.f_blocks;
2313
2314         if (m->max_use == (uint64_t) -1) {
2315
2316                 if (fs_size > 0) {
2317                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2318
2319                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2320                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2321
2322                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2323                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2324                 } else
2325                         m->max_use = DEFAULT_MAX_USE_LOWER;
2326         } else {
2327                 m->max_use = PAGE_ALIGN(m->max_use);
2328
2329                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2330                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2331         }
2332
2333         if (m->max_size == (uint64_t) -1) {
2334                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2335
2336                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2337                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2338         } else
2339                 m->max_size = PAGE_ALIGN(m->max_size);
2340
2341         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2342                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2343
2344         if (m->max_size*2 > m->max_use)
2345                 m->max_use = m->max_size*2;
2346
2347         if (m->min_size == (uint64_t) -1)
2348                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2349         else {
2350                 m->min_size = PAGE_ALIGN(m->min_size);
2351
2352                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2353                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2354
2355                 if (m->min_size > m->max_size)
2356                         m->max_size = m->min_size;
2357         }
2358
2359         if (m->keep_free == (uint64_t) -1) {
2360
2361                 if (fs_size > 0) {
2362                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2363
2364                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2365                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2366
2367                 } else
2368                         m->keep_free = DEFAULT_KEEP_FREE;
2369         }
2370
2371         log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2372                  format_bytes(a, sizeof(a), m->max_use),
2373                  format_bytes(b, sizeof(b), m->max_size),
2374                  format_bytes(c, sizeof(c), m->min_size),
2375                  format_bytes(d, sizeof(d), m->keep_free));
2376 }
2377
2378 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2379         assert(f);
2380         assert(from || to);
2381
2382         if (from) {
2383                 if (f->header->head_entry_realtime == 0)
2384                         return -ENOENT;
2385
2386                 *from = le64toh(f->header->head_entry_realtime);
2387         }
2388
2389         if (to) {
2390                 if (f->header->tail_entry_realtime == 0)
2391                         return -ENOENT;
2392
2393                 *to = le64toh(f->header->tail_entry_realtime);
2394         }
2395
2396         return 1;
2397 }
2398
2399 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2400         char t[9+32+1] = "_BOOT_ID=";
2401         Object *o;
2402         uint64_t p;
2403         int r;
2404
2405         assert(f);
2406         assert(from || to);
2407
2408         sd_id128_to_string(boot_id, t + 9);
2409
2410         r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
2411         if (r <= 0)
2412                 return r;
2413
2414         if (le64toh(o->data.n_entries) <= 0)
2415                 return 0;
2416
2417         if (from) {
2418                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2419                 if (r < 0)
2420                         return r;
2421
2422                 *from = le64toh(o->entry.monotonic);
2423         }
2424
2425         if (to) {
2426                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2427                 if (r < 0)
2428                         return r;
2429
2430                 r = generic_array_get_plus_one(f,
2431                                                le64toh(o->data.entry_offset),
2432                                                le64toh(o->data.entry_array_offset),
2433                                                le64toh(o->data.n_entries)-1,
2434                                                &o, NULL);
2435                 if (r <= 0)
2436                         return r;
2437
2438                 *to = le64toh(o->entry.monotonic);
2439         }
2440
2441         return 1;
2442 }
2443
2444 bool journal_file_rotate_suggested(JournalFile *f) {
2445         assert(f);
2446
2447         /* If we gained new header fields we gained new features,
2448          * hence suggest a rotation */
2449         if (le64toh(f->header->header_size) < sizeof(Header)) {
2450                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2451                 return true;
2452         }
2453
2454         /* Let's check if the hash tables grew over a certain fill
2455          * level (75%, borrowing this value from Java's hash table
2456          * implementation), and if so suggest a rotation. To calculate
2457          * the fill level we need the n_data field, which only exists
2458          * in newer versions. */
2459
2460         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2461                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2462                         log_debug("Data hash table of %s has a fill level at %.1f (%llu of %llu items, %llu file size, %llu bytes per hash table item), suggesting rotation.",
2463                                   f->path,
2464                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2465                                   (unsigned long long) le64toh(f->header->n_data),
2466                                   (unsigned long long) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)),
2467                                   (unsigned long long) (f->last_stat.st_size),
2468                                   (unsigned long long) (f->last_stat.st_size / le64toh(f->header->n_data)));
2469                         return true;
2470                 }
2471
2472         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2473                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2474                         log_debug("Field hash table of %s has a fill level at %.1f (%llu of %llu items), suggesting rotation.",
2475                                   f->path,
2476                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2477                                   (unsigned long long) le64toh(f->header->n_fields),
2478                                   (unsigned long long) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)));
2479                         return true;
2480                 }
2481
2482         return false;
2483 }