chiark / gitweb /
journal: be more careful when keeping around mmaps we still need
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "journal-authenticate.h"
33 #include "lookup3.h"
34 #include "compress.h"
35 #include "fsprg.h"
36
37 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
38 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
39
40 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
41
42 /* This is the minimum journal file size */
43 #define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL)                  /* 64 KiB */
44
45 /* These are the lower and upper bounds if we deduce the max_use value
46  * from the file system size */
47 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
48 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
49
50 /* This is the upper bound if we deduce max_size from max_use */
51 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
52
53 /* This is the upper bound if we deduce the keep_free value from the
54  * file system size */
55 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
56
57 /* This is the keep_free value when we can't determine the file
58  * system size */
59 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MB */
60
61 /* n_data was the first entry we added after the initial file format design */
62 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
63
64 void journal_file_close(JournalFile *f) {
65         assert(f);
66
67 #ifdef HAVE_GCRYPT
68         /* Write the final tag */
69         if (f->seal && f->writable)
70                 journal_file_append_tag(f);
71 #endif
72
73         /* Sync everything to disk, before we mark the file offline */
74         if (f->mmap && f->fd >= 0)
75                 mmap_cache_close_fd(f->mmap, f->fd);
76
77         if (f->writable && f->fd >= 0)
78                 fdatasync(f->fd);
79
80         if (f->header) {
81                 /* Mark the file offline. Don't override the archived state if it already is set */
82                 if (f->writable && f->header->state == STATE_ONLINE)
83                         f->header->state = STATE_OFFLINE;
84
85                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
86         }
87
88         if (f->fd >= 0)
89                 close_nointr_nofail(f->fd);
90
91         free(f->path);
92
93         if (f->mmap)
94                 mmap_cache_unref(f->mmap);
95
96 #ifdef HAVE_XZ
97         free(f->compress_buffer);
98 #endif
99
100 #ifdef HAVE_GCRYPT
101         if (f->fss_file)
102                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
103         else if (f->fsprg_state)
104                 free(f->fsprg_state);
105
106         free(f->fsprg_seed);
107
108         if (f->hmac)
109                 gcry_md_close(f->hmac);
110 #endif
111
112         free(f);
113 }
114
115 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
116         Header h;
117         ssize_t k;
118         int r;
119
120         assert(f);
121
122         zero(h);
123         memcpy(h.signature, HEADER_SIGNATURE, 8);
124         h.header_size = htole64(ALIGN64(sizeof(h)));
125
126         h.incompatible_flags =
127                 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
128
129         h.compatible_flags =
130                 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
131
132         r = sd_id128_randomize(&h.file_id);
133         if (r < 0)
134                 return r;
135
136         if (template) {
137                 h.seqnum_id = template->header->seqnum_id;
138                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
139         } else
140                 h.seqnum_id = h.file_id;
141
142         k = pwrite(f->fd, &h, sizeof(h), 0);
143         if (k < 0)
144                 return -errno;
145
146         if (k != sizeof(h))
147                 return -EIO;
148
149         return 0;
150 }
151
152 static int journal_file_refresh_header(JournalFile *f) {
153         int r;
154         sd_id128_t boot_id;
155
156         assert(f);
157
158         r = sd_id128_get_machine(&f->header->machine_id);
159         if (r < 0)
160                 return r;
161
162         r = sd_id128_get_boot(&boot_id);
163         if (r < 0)
164                 return r;
165
166         if (sd_id128_equal(boot_id, f->header->boot_id))
167                 f->tail_entry_monotonic_valid = true;
168
169         f->header->boot_id = boot_id;
170
171         f->header->state = STATE_ONLINE;
172
173         /* Sync the online state to disk */
174         msync(f->header, PAGE_ALIGN(sizeof(Header)), MS_SYNC);
175         fdatasync(f->fd);
176
177         return 0;
178 }
179
180 static int journal_file_verify_header(JournalFile *f) {
181         assert(f);
182
183         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
184                 return -EBADMSG;
185
186         /* In both read and write mode we refuse to open files with
187          * incompatible flags we don't know */
188 #ifdef HAVE_XZ
189         if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
190                 return -EPROTONOSUPPORT;
191 #else
192         if (f->header->incompatible_flags != 0)
193                 return -EPROTONOSUPPORT;
194 #endif
195
196         /* When open for writing we refuse to open files with
197          * compatible flags, too */
198         if (f->writable) {
199 #ifdef HAVE_GCRYPT
200                 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
201                         return -EPROTONOSUPPORT;
202 #else
203                 if (f->header->compatible_flags != 0)
204                         return -EPROTONOSUPPORT;
205 #endif
206         }
207
208         if (f->header->state >= _STATE_MAX)
209                 return -EBADMSG;
210
211         /* The first addition was n_data, so check that we are at least this large */
212         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
213                 return -EBADMSG;
214
215         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
216                 return -EBADMSG;
217
218         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
219                 return -ENODATA;
220
221         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
222                 return -ENODATA;
223
224         if (!VALID64(f->header->data_hash_table_offset) ||
225             !VALID64(f->header->field_hash_table_offset) ||
226             !VALID64(f->header->tail_object_offset) ||
227             !VALID64(f->header->entry_array_offset))
228                 return -ENODATA;
229
230         if (f->writable) {
231                 uint8_t state;
232                 sd_id128_t machine_id;
233                 int r;
234
235                 r = sd_id128_get_machine(&machine_id);
236                 if (r < 0)
237                         return r;
238
239                 if (!sd_id128_equal(machine_id, f->header->machine_id))
240                         return -EHOSTDOWN;
241
242                 state = f->header->state;
243
244                 if (state == STATE_ONLINE) {
245                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
246                         return -EBUSY;
247                 } else if (state == STATE_ARCHIVED)
248                         return -ESHUTDOWN;
249                 else if (state != STATE_OFFLINE) {
250                         log_debug("Journal file %s has unknown state %u.", f->path, state);
251                         return -EBUSY;
252                 }
253         }
254
255         f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
256
257         if (f->writable)
258                 f->seal = JOURNAL_HEADER_SEALED(f->header);
259
260         return 0;
261 }
262
263 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
264         uint64_t old_size, new_size;
265         int r;
266
267         assert(f);
268
269         /* We assume that this file is not sparse, and we know that
270          * for sure, since we always call posix_fallocate()
271          * ourselves */
272
273         old_size =
274                 le64toh(f->header->header_size) +
275                 le64toh(f->header->arena_size);
276
277         new_size = PAGE_ALIGN(offset + size);
278         if (new_size < le64toh(f->header->header_size))
279                 new_size = le64toh(f->header->header_size);
280
281         if (new_size <= old_size)
282                 return 0;
283
284         if (f->metrics.max_size > 0 &&
285             new_size > f->metrics.max_size)
286                 return -E2BIG;
287
288         if (new_size > f->metrics.min_size &&
289             f->metrics.keep_free > 0) {
290                 struct statvfs svfs;
291
292                 if (fstatvfs(f->fd, &svfs) >= 0) {
293                         uint64_t available;
294
295                         available = svfs.f_bfree * svfs.f_bsize;
296
297                         if (available >= f->metrics.keep_free)
298                                 available -= f->metrics.keep_free;
299                         else
300                                 available = 0;
301
302                         if (new_size - old_size > available)
303                                 return -E2BIG;
304                 }
305         }
306
307         /* Note that the glibc fallocate() fallback is very
308            inefficient, hence we try to minimize the allocation area
309            as we can. */
310         r = posix_fallocate(f->fd, old_size, new_size - old_size);
311         if (r != 0)
312                 return -r;
313
314         if (fstat(f->fd, &f->last_stat) < 0)
315                 return -errno;
316
317         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
318
319         return 0;
320 }
321
322 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
323         assert(f);
324         assert(ret);
325
326         /* Avoid SIGBUS on invalid accesses */
327         if (offset + size > (uint64_t) f->last_stat.st_size) {
328                 /* Hmm, out of range? Let's refresh the fstat() data
329                  * first, before we trust that check. */
330
331                 if (fstat(f->fd, &f->last_stat) < 0 ||
332                     offset + size > (uint64_t) f->last_stat.st_size)
333                         return -EADDRNOTAVAIL;
334         }
335
336         return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
337 }
338
339 static uint64_t minimum_header_size(Object *o) {
340
341         static uint64_t table[] = {
342                 [OBJECT_DATA] = sizeof(DataObject),
343                 [OBJECT_FIELD] = sizeof(FieldObject),
344                 [OBJECT_ENTRY] = sizeof(EntryObject),
345                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
346                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
347                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
348                 [OBJECT_TAG] = sizeof(TagObject),
349         };
350
351         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
352                 return sizeof(ObjectHeader);
353
354         return table[o->object.type];
355 }
356
357 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
358         int r;
359         void *t;
360         Object *o;
361         uint64_t s;
362         unsigned context;
363
364         assert(f);
365         assert(ret);
366
367         /* Objects may only be located at multiple of 64 bit */
368         if (!VALID64(offset))
369                 return -EFAULT;
370
371         /* One context for each type, plus one catch-all for the rest */
372         context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
373
374         r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
375         if (r < 0)
376                 return r;
377
378         o = (Object*) t;
379         s = le64toh(o->object.size);
380
381         if (s < sizeof(ObjectHeader))
382                 return -EBADMSG;
383
384         if (o->object.type <= OBJECT_UNUSED)
385                 return -EBADMSG;
386
387         if (s < minimum_header_size(o))
388                 return -EBADMSG;
389
390         if (type >= 0 && o->object.type != type)
391                 return -EBADMSG;
392
393         if (s > sizeof(ObjectHeader)) {
394                 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
395                 if (r < 0)
396                         return r;
397
398                 o = (Object*) t;
399         }
400
401         *ret = o;
402         return 0;
403 }
404
405 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
406         uint64_t r;
407
408         assert(f);
409
410         r = le64toh(f->header->tail_entry_seqnum) + 1;
411
412         if (seqnum) {
413                 /* If an external seqnum counter was passed, we update
414                  * both the local and the external one, and set it to
415                  * the maximum of both */
416
417                 if (*seqnum + 1 > r)
418                         r = *seqnum + 1;
419
420                 *seqnum = r;
421         }
422
423         f->header->tail_entry_seqnum = htole64(r);
424
425         if (f->header->head_entry_seqnum == 0)
426                 f->header->head_entry_seqnum = htole64(r);
427
428         return r;
429 }
430
431 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
432         int r;
433         uint64_t p;
434         Object *tail, *o;
435         void *t;
436
437         assert(f);
438         assert(type > 0 && type < _OBJECT_TYPE_MAX);
439         assert(size >= sizeof(ObjectHeader));
440         assert(offset);
441         assert(ret);
442
443         p = le64toh(f->header->tail_object_offset);
444         if (p == 0)
445                 p = le64toh(f->header->header_size);
446         else {
447                 r = journal_file_move_to_object(f, -1, p, &tail);
448                 if (r < 0)
449                         return r;
450
451                 p += ALIGN64(le64toh(tail->object.size));
452         }
453
454         r = journal_file_allocate(f, p, size);
455         if (r < 0)
456                 return r;
457
458         r = journal_file_move_to(f, type, false, p, size, &t);
459         if (r < 0)
460                 return r;
461
462         o = (Object*) t;
463
464         zero(o->object);
465         o->object.type = type;
466         o->object.size = htole64(size);
467
468         f->header->tail_object_offset = htole64(p);
469         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
470
471         *ret = o;
472         *offset = p;
473
474         return 0;
475 }
476
477 static int journal_file_setup_data_hash_table(JournalFile *f) {
478         uint64_t s, p;
479         Object *o;
480         int r;
481
482         assert(f);
483
484         /* We estimate that we need 1 hash table entry per 768 of
485            journal file and we want to make sure we never get beyond
486            75% fill level. Calculate the hash table size for the
487            maximum file size based on these metrics. */
488
489         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
490         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
491                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
492
493         log_info("Reserving %llu entries in hash table.", (unsigned long long) (s / sizeof(HashItem)));
494
495         r = journal_file_append_object(f,
496                                        OBJECT_DATA_HASH_TABLE,
497                                        offsetof(Object, hash_table.items) + s,
498                                        &o, &p);
499         if (r < 0)
500                 return r;
501
502         memset(o->hash_table.items, 0, s);
503
504         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
505         f->header->data_hash_table_size = htole64(s);
506
507         return 0;
508 }
509
510 static int journal_file_setup_field_hash_table(JournalFile *f) {
511         uint64_t s, p;
512         Object *o;
513         int r;
514
515         assert(f);
516
517         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
518         r = journal_file_append_object(f,
519                                        OBJECT_FIELD_HASH_TABLE,
520                                        offsetof(Object, hash_table.items) + s,
521                                        &o, &p);
522         if (r < 0)
523                 return r;
524
525         memset(o->hash_table.items, 0, s);
526
527         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
528         f->header->field_hash_table_size = htole64(s);
529
530         return 0;
531 }
532
533 static int journal_file_map_data_hash_table(JournalFile *f) {
534         uint64_t s, p;
535         void *t;
536         int r;
537
538         assert(f);
539
540         p = le64toh(f->header->data_hash_table_offset);
541         s = le64toh(f->header->data_hash_table_size);
542
543         r = journal_file_move_to(f,
544                                  OBJECT_DATA_HASH_TABLE,
545                                  true,
546                                  p, s,
547                                  &t);
548         if (r < 0)
549                 return r;
550
551         f->data_hash_table = t;
552         return 0;
553 }
554
555 static int journal_file_map_field_hash_table(JournalFile *f) {
556         uint64_t s, p;
557         void *t;
558         int r;
559
560         assert(f);
561
562         p = le64toh(f->header->field_hash_table_offset);
563         s = le64toh(f->header->field_hash_table_size);
564
565         r = journal_file_move_to(f,
566                                  OBJECT_FIELD_HASH_TABLE,
567                                  true,
568                                  p, s,
569                                  &t);
570         if (r < 0)
571                 return r;
572
573         f->field_hash_table = t;
574         return 0;
575 }
576
577 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
578         uint64_t p, h;
579         int r;
580
581         assert(f);
582         assert(o);
583         assert(offset > 0);
584         assert(o->object.type == OBJECT_DATA);
585
586         /* This might alter the window we are looking at */
587
588         o->data.next_hash_offset = o->data.next_field_offset = 0;
589         o->data.entry_offset = o->data.entry_array_offset = 0;
590         o->data.n_entries = 0;
591
592         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
593         p = le64toh(f->data_hash_table[h].tail_hash_offset);
594         if (p == 0) {
595                 /* Only entry in the hash table is easy */
596                 f->data_hash_table[h].head_hash_offset = htole64(offset);
597         } else {
598                 /* Move back to the previous data object, to patch in
599                  * pointer */
600
601                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
602                 if (r < 0)
603                         return r;
604
605                 o->data.next_hash_offset = htole64(offset);
606         }
607
608         f->data_hash_table[h].tail_hash_offset = htole64(offset);
609
610         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
611                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
612
613         return 0;
614 }
615
616 int journal_file_find_data_object_with_hash(
617                 JournalFile *f,
618                 const void *data, uint64_t size, uint64_t hash,
619                 Object **ret, uint64_t *offset) {
620
621         uint64_t p, osize, h;
622         int r;
623
624         assert(f);
625         assert(data || size == 0);
626
627         osize = offsetof(Object, data.payload) + size;
628
629         if (f->header->data_hash_table_size == 0)
630                 return -EBADMSG;
631
632         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
633         p = le64toh(f->data_hash_table[h].head_hash_offset);
634
635         while (p > 0) {
636                 Object *o;
637
638                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
639                 if (r < 0)
640                         return r;
641
642                 if (le64toh(o->data.hash) != hash)
643                         goto next;
644
645                 if (o->object.flags & OBJECT_COMPRESSED) {
646 #ifdef HAVE_XZ
647                         uint64_t l, rsize;
648
649                         l = le64toh(o->object.size);
650                         if (l <= offsetof(Object, data.payload))
651                                 return -EBADMSG;
652
653                         l -= offsetof(Object, data.payload);
654
655                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
656                                 return -EBADMSG;
657
658                         if (rsize == size &&
659                             memcmp(f->compress_buffer, data, size) == 0) {
660
661                                 if (ret)
662                                         *ret = o;
663
664                                 if (offset)
665                                         *offset = p;
666
667                                 return 1;
668                         }
669 #else
670                         return -EPROTONOSUPPORT;
671 #endif
672
673                 } else if (le64toh(o->object.size) == osize &&
674                            memcmp(o->data.payload, data, size) == 0) {
675
676                         if (ret)
677                                 *ret = o;
678
679                         if (offset)
680                                 *offset = p;
681
682                         return 1;
683                 }
684
685         next:
686                 p = le64toh(o->data.next_hash_offset);
687         }
688
689         return 0;
690 }
691
692 int journal_file_find_data_object(
693                 JournalFile *f,
694                 const void *data, uint64_t size,
695                 Object **ret, uint64_t *offset) {
696
697         uint64_t hash;
698
699         assert(f);
700         assert(data || size == 0);
701
702         hash = hash64(data, size);
703
704         return journal_file_find_data_object_with_hash(f,
705                                                        data, size, hash,
706                                                        ret, offset);
707 }
708
709 static int journal_file_append_data(
710                 JournalFile *f,
711                 const void *data, uint64_t size,
712                 Object **ret, uint64_t *offset) {
713
714         uint64_t hash, p;
715         uint64_t osize;
716         Object *o;
717         int r;
718         bool compressed = false;
719
720         assert(f);
721         assert(data || size == 0);
722
723         hash = hash64(data, size);
724
725         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
726         if (r < 0)
727                 return r;
728         else if (r > 0) {
729
730                 if (ret)
731                         *ret = o;
732
733                 if (offset)
734                         *offset = p;
735
736                 return 0;
737         }
738
739         osize = offsetof(Object, data.payload) + size;
740         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
741         if (r < 0)
742                 return r;
743
744         o->data.hash = htole64(hash);
745
746 #ifdef HAVE_XZ
747         if (f->compress &&
748             size >= COMPRESSION_SIZE_THRESHOLD) {
749                 uint64_t rsize;
750
751                 compressed = compress_blob(data, size, o->data.payload, &rsize);
752
753                 if (compressed) {
754                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
755                         o->object.flags |= OBJECT_COMPRESSED;
756
757                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
758                 }
759         }
760 #endif
761
762         if (!compressed && size > 0)
763                 memcpy(o->data.payload, data, size);
764
765         r = journal_file_link_data(f, o, p, hash);
766         if (r < 0)
767                 return r;
768
769 #ifdef HAVE_GCRYPT
770         r = journal_file_hmac_put_object(f, OBJECT_DATA, p);
771         if (r < 0)
772                 return r;
773 #endif
774
775         /* The linking might have altered the window, so let's
776          * refresh our pointer */
777         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
778         if (r < 0)
779                 return r;
780
781         if (ret)
782                 *ret = o;
783
784         if (offset)
785                 *offset = p;
786
787         return 0;
788 }
789
790 uint64_t journal_file_entry_n_items(Object *o) {
791         assert(o);
792         assert(o->object.type == OBJECT_ENTRY);
793
794         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
795 }
796
797 uint64_t journal_file_entry_array_n_items(Object *o) {
798         assert(o);
799         assert(o->object.type == OBJECT_ENTRY_ARRAY);
800
801         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
802 }
803
804 uint64_t journal_file_hash_table_n_items(Object *o) {
805         assert(o);
806         assert(o->object.type == OBJECT_DATA_HASH_TABLE ||
807                o->object.type == OBJECT_FIELD_HASH_TABLE);
808
809         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
810 }
811
812 static int link_entry_into_array(JournalFile *f,
813                                  le64_t *first,
814                                  le64_t *idx,
815                                  uint64_t p) {
816         int r;
817         uint64_t n = 0, ap = 0, q, i, a, hidx;
818         Object *o;
819
820         assert(f);
821         assert(first);
822         assert(idx);
823         assert(p > 0);
824
825         a = le64toh(*first);
826         i = hidx = le64toh(*idx);
827         while (a > 0) {
828
829                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
830                 if (r < 0)
831                         return r;
832
833                 n = journal_file_entry_array_n_items(o);
834                 if (i < n) {
835                         o->entry_array.items[i] = htole64(p);
836                         *idx = htole64(hidx + 1);
837                         return 0;
838                 }
839
840                 i -= n;
841                 ap = a;
842                 a = le64toh(o->entry_array.next_entry_array_offset);
843         }
844
845         if (hidx > n)
846                 n = (hidx+1) * 2;
847         else
848                 n = n * 2;
849
850         if (n < 4)
851                 n = 4;
852
853         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
854                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
855                                        &o, &q);
856         if (r < 0)
857                 return r;
858
859 #ifdef HAVE_GCRYPT
860         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, q);
861         if (r < 0)
862                 return r;
863 #endif
864
865         o->entry_array.items[i] = htole64(p);
866
867         if (ap == 0)
868                 *first = htole64(q);
869         else {
870                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
871                 if (r < 0)
872                         return r;
873
874                 o->entry_array.next_entry_array_offset = htole64(q);
875         }
876
877         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
878                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
879
880         *idx = htole64(hidx + 1);
881
882         return 0;
883 }
884
885 static int link_entry_into_array_plus_one(JournalFile *f,
886                                           le64_t *extra,
887                                           le64_t *first,
888                                           le64_t *idx,
889                                           uint64_t p) {
890
891         int r;
892
893         assert(f);
894         assert(extra);
895         assert(first);
896         assert(idx);
897         assert(p > 0);
898
899         if (*idx == 0)
900                 *extra = htole64(p);
901         else {
902                 le64_t i;
903
904                 i = htole64(le64toh(*idx) - 1);
905                 r = link_entry_into_array(f, first, &i, p);
906                 if (r < 0)
907                         return r;
908         }
909
910         *idx = htole64(le64toh(*idx) + 1);
911         return 0;
912 }
913
914 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
915         uint64_t p;
916         int r;
917         assert(f);
918         assert(o);
919         assert(offset > 0);
920
921         p = le64toh(o->entry.items[i].object_offset);
922         if (p == 0)
923                 return -EINVAL;
924
925         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
926         if (r < 0)
927                 return r;
928
929         return link_entry_into_array_plus_one(f,
930                                               &o->data.entry_offset,
931                                               &o->data.entry_array_offset,
932                                               &o->data.n_entries,
933                                               offset);
934 }
935
936 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
937         uint64_t n, i;
938         int r;
939
940         assert(f);
941         assert(o);
942         assert(offset > 0);
943         assert(o->object.type == OBJECT_ENTRY);
944
945         __sync_synchronize();
946
947         /* Link up the entry itself */
948         r = link_entry_into_array(f,
949                                   &f->header->entry_array_offset,
950                                   &f->header->n_entries,
951                                   offset);
952         if (r < 0)
953                 return r;
954
955         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
956
957         if (f->header->head_entry_realtime == 0)
958                 f->header->head_entry_realtime = o->entry.realtime;
959
960         f->header->tail_entry_realtime = o->entry.realtime;
961         f->header->tail_entry_monotonic = o->entry.monotonic;
962
963         f->tail_entry_monotonic_valid = true;
964
965         /* Link up the items */
966         n = journal_file_entry_n_items(o);
967         for (i = 0; i < n; i++) {
968                 r = journal_file_link_entry_item(f, o, offset, i);
969                 if (r < 0)
970                         return r;
971         }
972
973         return 0;
974 }
975
976 static int journal_file_append_entry_internal(
977                 JournalFile *f,
978                 const dual_timestamp *ts,
979                 uint64_t xor_hash,
980                 const EntryItem items[], unsigned n_items,
981                 uint64_t *seqnum,
982                 Object **ret, uint64_t *offset) {
983         uint64_t np;
984         uint64_t osize;
985         Object *o;
986         int r;
987
988         assert(f);
989         assert(items || n_items == 0);
990         assert(ts);
991
992         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
993
994         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
995         if (r < 0)
996                 return r;
997
998         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
999         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1000         o->entry.realtime = htole64(ts->realtime);
1001         o->entry.monotonic = htole64(ts->monotonic);
1002         o->entry.xor_hash = htole64(xor_hash);
1003         o->entry.boot_id = f->header->boot_id;
1004
1005 #ifdef HAVE_GCRYPT
1006         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, np);
1007         if (r < 0)
1008                 return r;
1009 #endif
1010
1011         r = journal_file_link_entry(f, o, np);
1012         if (r < 0)
1013                 return r;
1014
1015         if (ret)
1016                 *ret = o;
1017
1018         if (offset)
1019                 *offset = np;
1020
1021         return 0;
1022 }
1023
1024 void journal_file_post_change(JournalFile *f) {
1025         assert(f);
1026
1027         /* inotify() does not receive IN_MODIFY events from file
1028          * accesses done via mmap(). After each access we hence
1029          * trigger IN_MODIFY by truncating the journal file to its
1030          * current size which triggers IN_MODIFY. */
1031
1032         __sync_synchronize();
1033
1034         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1035                 log_error("Failed to to truncate file to its own size: %m");
1036 }
1037
1038 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1039         unsigned i;
1040         EntryItem *items;
1041         int r;
1042         uint64_t xor_hash = 0;
1043         struct dual_timestamp _ts;
1044
1045         assert(f);
1046         assert(iovec || n_iovec == 0);
1047
1048         if (!f->writable)
1049                 return -EPERM;
1050
1051         if (!ts) {
1052                 dual_timestamp_get(&_ts);
1053                 ts = &_ts;
1054         }
1055
1056         if (f->tail_entry_monotonic_valid &&
1057             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1058                 return -EINVAL;
1059
1060 #ifdef HAVE_GCRYPT
1061         r = journal_file_maybe_append_tag(f, ts->realtime);
1062         if (r < 0)
1063                 return r;
1064 #endif
1065
1066         /* alloca() can't take 0, hence let's allocate at least one */
1067         items = alloca(sizeof(EntryItem) * MAX(1, n_iovec));
1068
1069         for (i = 0; i < n_iovec; i++) {
1070                 uint64_t p;
1071                 Object *o;
1072
1073                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1074                 if (r < 0)
1075                         return r;
1076
1077                 xor_hash ^= le64toh(o->data.hash);
1078                 items[i].object_offset = htole64(p);
1079                 items[i].hash = o->data.hash;
1080         }
1081
1082         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1083
1084         journal_file_post_change(f);
1085
1086         return r;
1087 }
1088
1089 static int generic_array_get(JournalFile *f,
1090                              uint64_t first,
1091                              uint64_t i,
1092                              Object **ret, uint64_t *offset) {
1093
1094         Object *o;
1095         uint64_t p = 0, a;
1096         int r;
1097
1098         assert(f);
1099
1100         a = first;
1101         while (a > 0) {
1102                 uint64_t n;
1103
1104                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1105                 if (r < 0)
1106                         return r;
1107
1108                 n = journal_file_entry_array_n_items(o);
1109                 if (i < n) {
1110                         p = le64toh(o->entry_array.items[i]);
1111                         break;
1112                 }
1113
1114                 i -= n;
1115                 a = le64toh(o->entry_array.next_entry_array_offset);
1116         }
1117
1118         if (a <= 0 || p <= 0)
1119                 return 0;
1120
1121         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1122         if (r < 0)
1123                 return r;
1124
1125         if (ret)
1126                 *ret = o;
1127
1128         if (offset)
1129                 *offset = p;
1130
1131         return 1;
1132 }
1133
1134 static int generic_array_get_plus_one(JournalFile *f,
1135                                       uint64_t extra,
1136                                       uint64_t first,
1137                                       uint64_t i,
1138                                       Object **ret, uint64_t *offset) {
1139
1140         Object *o;
1141
1142         assert(f);
1143
1144         if (i == 0) {
1145                 int r;
1146
1147                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1148                 if (r < 0)
1149                         return r;
1150
1151                 if (ret)
1152                         *ret = o;
1153
1154                 if (offset)
1155                         *offset = extra;
1156
1157                 return 1;
1158         }
1159
1160         return generic_array_get(f, first, i-1, ret, offset);
1161 }
1162
/* Results of the test_object() callbacks used for bisection */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle */
        TEST_LEFT  = 1, /* the tested object sorts before the needle */
        TEST_RIGHT = 2  /* the tested object sorts after the needle */
};
1168
1169 static int generic_array_bisect(JournalFile *f,
1170                                 uint64_t first,
1171                                 uint64_t n,
1172                                 uint64_t needle,
1173                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1174                                 direction_t direction,
1175                                 Object **ret,
1176                                 uint64_t *offset,
1177                                 uint64_t *idx) {
1178
1179         uint64_t a, p, t = 0, i = 0, last_p = 0;
1180         bool subtract_one = false;
1181         Object *o, *array = NULL;
1182         int r;
1183
1184         assert(f);
1185         assert(test_object);
1186
1187         a = first;
1188         while (a > 0) {
1189                 uint64_t left, right, k, lp;
1190
1191                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1192                 if (r < 0)
1193                         return r;
1194
1195                 k = journal_file_entry_array_n_items(array);
1196                 right = MIN(k, n);
1197                 if (right <= 0)
1198                         return 0;
1199
1200                 i = right - 1;
1201                 lp = p = le64toh(array->entry_array.items[i]);
1202                 if (p <= 0)
1203                         return -EBADMSG;
1204
1205                 r = test_object(f, p, needle);
1206                 if (r < 0)
1207                         return r;
1208
1209                 if (r == TEST_FOUND)
1210                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1211
1212                 if (r == TEST_RIGHT) {
1213                         left = 0;
1214                         right -= 1;
1215                         for (;;) {
1216                                 if (left == right) {
1217                                         if (direction == DIRECTION_UP)
1218                                                 subtract_one = true;
1219
1220                                         i = left;
1221                                         goto found;
1222                                 }
1223
1224                                 assert(left < right);
1225
1226                                 i = (left + right) / 2;
1227                                 p = le64toh(array->entry_array.items[i]);
1228                                 if (p <= 0)
1229                                         return -EBADMSG;
1230
1231                                 r = test_object(f, p, needle);
1232                                 if (r < 0)
1233                                         return r;
1234
1235                                 if (r == TEST_FOUND)
1236                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1237
1238                                 if (r == TEST_RIGHT)
1239                                         right = i;
1240                                 else
1241                                         left = i + 1;
1242                         }
1243                 }
1244
1245                 if (k > n) {
1246                         if (direction == DIRECTION_UP) {
1247                                 i = n;
1248                                 subtract_one = true;
1249                                 goto found;
1250                         }
1251
1252                         return 0;
1253                 }
1254
1255                 last_p = lp;
1256
1257                 n -= k;
1258                 t += k;
1259                 a = le64toh(array->entry_array.next_entry_array_offset);
1260         }
1261
1262         return 0;
1263
1264 found:
1265         if (subtract_one && t == 0 && i == 0)
1266                 return 0;
1267
1268         if (subtract_one && i == 0)
1269                 p = last_p;
1270         else if (subtract_one)
1271                 p = le64toh(array->entry_array.items[i-1]);
1272         else
1273                 p = le64toh(array->entry_array.items[i]);
1274
1275         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1276         if (r < 0)
1277                 return r;
1278
1279         if (ret)
1280                 *ret = o;
1281
1282         if (offset)
1283                 *offset = p;
1284
1285         if (idx)
1286                 *idx = t + i + (subtract_one ? -1 : 0);
1287
1288         return 1;
1289 }
1290
1291 static int generic_array_bisect_plus_one(JournalFile *f,
1292                                          uint64_t extra,
1293                                          uint64_t first,
1294                                          uint64_t n,
1295                                          uint64_t needle,
1296                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1297                                          direction_t direction,
1298                                          Object **ret,
1299                                          uint64_t *offset,
1300                                          uint64_t *idx) {
1301
1302         int r;
1303         bool step_back = false;
1304         Object *o;
1305
1306         assert(f);
1307         assert(test_object);
1308
1309         if (n <= 0)
1310                 return 0;
1311
1312         /* This bisects the array in object 'first', but first checks
1313          * an extra  */
1314         r = test_object(f, extra, needle);
1315         if (r < 0)
1316                 return r;
1317
1318         if (r == TEST_FOUND)
1319                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1320
1321         /* if we are looking with DIRECTION_UP then we need to first
1322            see if in the actual array there is a matching entry, and
1323            return the last one of that. But if there isn't any we need
1324            to return this one. Hence remember this, and return it
1325            below. */
1326         if (r == TEST_LEFT)
1327                 step_back = direction == DIRECTION_UP;
1328
1329         if (r == TEST_RIGHT) {
1330                 if (direction == DIRECTION_DOWN)
1331                         goto found;
1332                 else
1333                         return 0;
1334         }
1335
1336         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1337
1338         if (r == 0 && step_back)
1339                 goto found;
1340
1341         if (r > 0 && idx)
1342                 (*idx) ++;
1343
1344         return r;
1345
1346 found:
1347         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1348         if (r < 0)
1349                 return r;
1350
1351         if (ret)
1352                 *ret = o;
1353
1354         if (offset)
1355                 *offset = extra;
1356
1357         if (idx)
1358                 *idx = 0;
1359
1360         return 1;
1361 }
1362
1363 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1364         assert(f);
1365         assert(p > 0);
1366
1367         if (p == needle)
1368                 return TEST_FOUND;
1369         else if (p < needle)
1370                 return TEST_LEFT;
1371         else
1372                 return TEST_RIGHT;
1373 }
1374
1375 int journal_file_move_to_entry_by_offset(
1376                 JournalFile *f,
1377                 uint64_t p,
1378                 direction_t direction,
1379                 Object **ret,
1380                 uint64_t *offset) {
1381
1382         return generic_array_bisect(f,
1383                                     le64toh(f->header->entry_array_offset),
1384                                     le64toh(f->header->n_entries),
1385                                     p,
1386                                     test_object_offset,
1387                                     direction,
1388                                     ret, offset, NULL);
1389 }
1390
1391
1392 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1393         Object *o;
1394         int r;
1395
1396         assert(f);
1397         assert(p > 0);
1398
1399         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1400         if (r < 0)
1401                 return r;
1402
1403         if (le64toh(o->entry.seqnum) == needle)
1404                 return TEST_FOUND;
1405         else if (le64toh(o->entry.seqnum) < needle)
1406                 return TEST_LEFT;
1407         else
1408                 return TEST_RIGHT;
1409 }
1410
1411 int journal_file_move_to_entry_by_seqnum(
1412                 JournalFile *f,
1413                 uint64_t seqnum,
1414                 direction_t direction,
1415                 Object **ret,
1416                 uint64_t *offset) {
1417
1418         return generic_array_bisect(f,
1419                                     le64toh(f->header->entry_array_offset),
1420                                     le64toh(f->header->n_entries),
1421                                     seqnum,
1422                                     test_object_seqnum,
1423                                     direction,
1424                                     ret, offset, NULL);
1425 }
1426
1427 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1428         Object *o;
1429         int r;
1430
1431         assert(f);
1432         assert(p > 0);
1433
1434         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1435         if (r < 0)
1436                 return r;
1437
1438         if (le64toh(o->entry.realtime) == needle)
1439                 return TEST_FOUND;
1440         else if (le64toh(o->entry.realtime) < needle)
1441                 return TEST_LEFT;
1442         else
1443                 return TEST_RIGHT;
1444 }
1445
1446 int journal_file_move_to_entry_by_realtime(
1447                 JournalFile *f,
1448                 uint64_t realtime,
1449                 direction_t direction,
1450                 Object **ret,
1451                 uint64_t *offset) {
1452
1453         return generic_array_bisect(f,
1454                                     le64toh(f->header->entry_array_offset),
1455                                     le64toh(f->header->n_entries),
1456                                     realtime,
1457                                     test_object_realtime,
1458                                     direction,
1459                                     ret, offset, NULL);
1460 }
1461
1462 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1463         Object *o;
1464         int r;
1465
1466         assert(f);
1467         assert(p > 0);
1468
1469         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1470         if (r < 0)
1471                 return r;
1472
1473         if (le64toh(o->entry.monotonic) == needle)
1474                 return TEST_FOUND;
1475         else if (le64toh(o->entry.monotonic) < needle)
1476                 return TEST_LEFT;
1477         else
1478                 return TEST_RIGHT;
1479 }
1480
1481 int journal_file_move_to_entry_by_monotonic(
1482                 JournalFile *f,
1483                 sd_id128_t boot_id,
1484                 uint64_t monotonic,
1485                 direction_t direction,
1486                 Object **ret,
1487                 uint64_t *offset) {
1488
1489         char t[9+32+1] = "_BOOT_ID=";
1490         Object *o;
1491         int r;
1492
1493         assert(f);
1494
1495         sd_id128_to_string(boot_id, t + 9);
1496         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1497         if (r < 0)
1498                 return r;
1499         if (r == 0)
1500                 return -ENOENT;
1501
1502         return generic_array_bisect_plus_one(f,
1503                                              le64toh(o->data.entry_offset),
1504                                              le64toh(o->data.entry_array_offset),
1505                                              le64toh(o->data.n_entries),
1506                                              monotonic,
1507                                              test_object_monotonic,
1508                                              direction,
1509                                              ret, offset, NULL);
1510 }
1511
1512 int journal_file_next_entry(
1513                 JournalFile *f,
1514                 Object *o, uint64_t p,
1515                 direction_t direction,
1516                 Object **ret, uint64_t *offset) {
1517
1518         uint64_t i, n;
1519         int r;
1520
1521         assert(f);
1522         assert(p > 0 || !o);
1523
1524         n = le64toh(f->header->n_entries);
1525         if (n <= 0)
1526                 return 0;
1527
1528         if (!o)
1529                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1530         else {
1531                 if (o->object.type != OBJECT_ENTRY)
1532                         return -EINVAL;
1533
1534                 r = generic_array_bisect(f,
1535                                          le64toh(f->header->entry_array_offset),
1536                                          le64toh(f->header->n_entries),
1537                                          p,
1538                                          test_object_offset,
1539                                          DIRECTION_DOWN,
1540                                          NULL, NULL,
1541                                          &i);
1542                 if (r <= 0)
1543                         return r;
1544
1545                 if (direction == DIRECTION_DOWN) {
1546                         if (i >= n - 1)
1547                                 return 0;
1548
1549                         i++;
1550                 } else {
1551                         if (i <= 0)
1552                                 return 0;
1553
1554                         i--;
1555                 }
1556         }
1557
1558         /* And jump to it */
1559         return generic_array_get(f,
1560                                  le64toh(f->header->entry_array_offset),
1561                                  i,
1562                                  ret, offset);
1563 }
1564
1565 int journal_file_skip_entry(
1566                 JournalFile *f,
1567                 Object *o, uint64_t p,
1568                 int64_t skip,
1569                 Object **ret, uint64_t *offset) {
1570
1571         uint64_t i, n;
1572         int r;
1573
1574         assert(f);
1575         assert(o);
1576         assert(p > 0);
1577
1578         if (o->object.type != OBJECT_ENTRY)
1579                 return -EINVAL;
1580
1581         r = generic_array_bisect(f,
1582                                  le64toh(f->header->entry_array_offset),
1583                                  le64toh(f->header->n_entries),
1584                                  p,
1585                                  test_object_offset,
1586                                  DIRECTION_DOWN,
1587                                  NULL, NULL,
1588                                  &i);
1589         if (r <= 0)
1590                 return r;
1591
1592         /* Calculate new index */
1593         if (skip < 0) {
1594                 if ((uint64_t) -skip >= i)
1595                         i = 0;
1596                 else
1597                         i = i - (uint64_t) -skip;
1598         } else
1599                 i  += (uint64_t) skip;
1600
1601         n = le64toh(f->header->n_entries);
1602         if (n <= 0)
1603                 return -EBADMSG;
1604
1605         if (i >= n)
1606                 i = n-1;
1607
1608         return generic_array_get(f,
1609                                  le64toh(f->header->entry_array_offset),
1610                                  i,
1611                                  ret, offset);
1612 }
1613
1614 int journal_file_next_entry_for_data(
1615                 JournalFile *f,
1616                 Object *o, uint64_t p,
1617                 uint64_t data_offset,
1618                 direction_t direction,
1619                 Object **ret, uint64_t *offset) {
1620
1621         uint64_t n, i;
1622         int r;
1623         Object *d;
1624
1625         assert(f);
1626         assert(p > 0 || !o);
1627
1628         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1629         if (r < 0)
1630                 return r;
1631
1632         n = le64toh(d->data.n_entries);
1633         if (n <= 0)
1634                 return n;
1635
1636         if (!o)
1637                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1638         else {
1639                 if (o->object.type != OBJECT_ENTRY)
1640                         return -EINVAL;
1641
1642                 r = generic_array_bisect_plus_one(f,
1643                                                   le64toh(d->data.entry_offset),
1644                                                   le64toh(d->data.entry_array_offset),
1645                                                   le64toh(d->data.n_entries),
1646                                                   p,
1647                                                   test_object_offset,
1648                                                   DIRECTION_DOWN,
1649                                                   NULL, NULL,
1650                                                   &i);
1651
1652                 if (r <= 0)
1653                         return r;
1654
1655                 if (direction == DIRECTION_DOWN) {
1656                         if (i >= n - 1)
1657                                 return 0;
1658
1659                         i++;
1660                 } else {
1661                         if (i <= 0)
1662                                 return 0;
1663
1664                         i--;
1665                 }
1666
1667         }
1668
1669         return generic_array_get_plus_one(f,
1670                                           le64toh(d->data.entry_offset),
1671                                           le64toh(d->data.entry_array_offset),
1672                                           i,
1673                                           ret, offset);
1674 }
1675
1676 int journal_file_move_to_entry_by_offset_for_data(
1677                 JournalFile *f,
1678                 uint64_t data_offset,
1679                 uint64_t p,
1680                 direction_t direction,
1681                 Object **ret, uint64_t *offset) {
1682
1683         int r;
1684         Object *d;
1685
1686         assert(f);
1687
1688         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1689         if (r < 0)
1690                 return r;
1691
1692         return generic_array_bisect_plus_one(f,
1693                                              le64toh(d->data.entry_offset),
1694                                              le64toh(d->data.entry_array_offset),
1695                                              le64toh(d->data.n_entries),
1696                                              p,
1697                                              test_object_offset,
1698                                              direction,
1699                                              ret, offset, NULL);
1700 }
1701
1702 int journal_file_move_to_entry_by_monotonic_for_data(
1703                 JournalFile *f,
1704                 uint64_t data_offset,
1705                 sd_id128_t boot_id,
1706                 uint64_t monotonic,
1707                 direction_t direction,
1708                 Object **ret, uint64_t *offset) {
1709
1710         char t[9+32+1] = "_BOOT_ID=";
1711         Object *o, *d;
1712         int r;
1713         uint64_t b, z;
1714
1715         assert(f);
1716
1717         /* First, seek by time */
1718         sd_id128_to_string(boot_id, t + 9);
1719         r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
1720         if (r < 0)
1721                 return r;
1722         if (r == 0)
1723                 return -ENOENT;
1724
1725         r = generic_array_bisect_plus_one(f,
1726                                           le64toh(o->data.entry_offset),
1727                                           le64toh(o->data.entry_array_offset),
1728                                           le64toh(o->data.n_entries),
1729                                           monotonic,
1730                                           test_object_monotonic,
1731                                           direction,
1732                                           NULL, &z, NULL);
1733         if (r <= 0)
1734                 return r;
1735
1736         /* And now, continue seeking until we find an entry that
1737          * exists in both bisection arrays */
1738
1739         for (;;) {
1740                 Object *qo;
1741                 uint64_t p, q;
1742
1743                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1744                 if (r < 0)
1745                         return r;
1746
1747                 r = generic_array_bisect_plus_one(f,
1748                                                   le64toh(d->data.entry_offset),
1749                                                   le64toh(d->data.entry_array_offset),
1750                                                   le64toh(d->data.n_entries),
1751                                                   z,
1752                                                   test_object_offset,
1753                                                   direction,
1754                                                   NULL, &p, NULL);
1755                 if (r <= 0)
1756                         return r;
1757
1758                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
1759                 if (r < 0)
1760                         return r;
1761
1762                 r = generic_array_bisect_plus_one(f,
1763                                                   le64toh(o->data.entry_offset),
1764                                                   le64toh(o->data.entry_array_offset),
1765                                                   le64toh(o->data.n_entries),
1766                                                   p,
1767                                                   test_object_offset,
1768                                                   direction,
1769                                                   &qo, &q, NULL);
1770
1771                 if (r <= 0)
1772                         return r;
1773
1774                 if (p == q) {
1775                         if (ret)
1776                                 *ret = qo;
1777                         if (offset)
1778                                 *offset = q;
1779
1780                         return 1;
1781                 }
1782
1783                 z = q;
1784         }
1785
1786         return 0;
1787 }
1788
1789 int journal_file_move_to_entry_by_seqnum_for_data(
1790                 JournalFile *f,
1791                 uint64_t data_offset,
1792                 uint64_t seqnum,
1793                 direction_t direction,
1794                 Object **ret, uint64_t *offset) {
1795
1796         Object *d;
1797         int r;
1798
1799         assert(f);
1800
1801         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1802         if (r < 0)
1803                 return r;
1804
1805         return generic_array_bisect_plus_one(f,
1806                                              le64toh(d->data.entry_offset),
1807                                              le64toh(d->data.entry_array_offset),
1808                                              le64toh(d->data.n_entries),
1809                                              seqnum,
1810                                              test_object_seqnum,
1811                                              direction,
1812                                              ret, offset, NULL);
1813 }
1814
1815 int journal_file_move_to_entry_by_realtime_for_data(
1816                 JournalFile *f,
1817                 uint64_t data_offset,
1818                 uint64_t realtime,
1819                 direction_t direction,
1820                 Object **ret, uint64_t *offset) {
1821
1822         Object *d;
1823         int r;
1824
1825         assert(f);
1826
1827         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1828         if (r < 0)
1829                 return r;
1830
1831         return generic_array_bisect_plus_one(f,
1832                                              le64toh(d->data.entry_offset),
1833                                              le64toh(d->data.entry_array_offset),
1834                                              le64toh(d->data.n_entries),
1835                                              realtime,
1836                                              test_object_realtime,
1837                                              direction,
1838                                              ret, offset, NULL);
1839 }
1840
1841 void journal_file_dump(JournalFile *f) {
1842         Object *o;
1843         int r;
1844         uint64_t p;
1845
1846         assert(f);
1847
1848         journal_file_print_header(f);
1849
1850         p = le64toh(f->header->header_size);
1851         while (p != 0) {
1852                 r = journal_file_move_to_object(f, -1, p, &o);
1853                 if (r < 0)
1854                         goto fail;
1855
1856                 switch (o->object.type) {
1857
1858                 case OBJECT_UNUSED:
1859                         printf("Type: OBJECT_UNUSED\n");
1860                         break;
1861
1862                 case OBJECT_DATA:
1863                         printf("Type: OBJECT_DATA\n");
1864                         break;
1865
1866                 case OBJECT_ENTRY:
1867                         printf("Type: OBJECT_ENTRY seqnum=%llu monotonic=%llu realtime=%llu\n",
1868                                (unsigned long long) le64toh(o->entry.seqnum),
1869                                (unsigned long long) le64toh(o->entry.monotonic),
1870                                (unsigned long long) le64toh(o->entry.realtime));
1871                         break;
1872
1873                 case OBJECT_FIELD_HASH_TABLE:
1874                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1875                         break;
1876
1877                 case OBJECT_DATA_HASH_TABLE:
1878                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
1879                         break;
1880
1881                 case OBJECT_ENTRY_ARRAY:
1882                         printf("Type: OBJECT_ENTRY_ARRAY\n");
1883                         break;
1884
1885                 case OBJECT_TAG:
1886                         printf("Type: OBJECT_TAG seqnum=%llu epoch=%llu\n",
1887                                (unsigned long long) le64toh(o->tag.seqnum),
1888                                (unsigned long long) le64toh(o->tag.epoch));
1889                         break;
1890                 }
1891
1892                 if (o->object.flags & OBJECT_COMPRESSED)
1893                         printf("Flags: COMPRESSED\n");
1894
1895                 if (p == le64toh(f->header->tail_object_offset))
1896                         p = 0;
1897                 else
1898                         p = p + ALIGN64(le64toh(o->object.size));
1899         }
1900
1901         return;
1902 fail:
1903         log_error("File corrupt");
1904 }
1905
1906 void journal_file_print_header(JournalFile *f) {
1907         char a[33], b[33], c[33];
1908         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
1909
1910         assert(f);
1911
1912         printf("File Path: %s\n"
1913                "File ID: %s\n"
1914                "Machine ID: %s\n"
1915                "Boot ID: %s\n"
1916                "Sequential Number ID: %s\n"
1917                "State: %s\n"
1918                "Compatible Flags:%s%s\n"
1919                "Incompatible Flags:%s%s\n"
1920                "Header size: %llu\n"
1921                "Arena size: %llu\n"
1922                "Data Hash Table Size: %llu\n"
1923                "Field Hash Table Size: %llu\n"
1924                "Rotate Suggested: %s\n"
1925                "Head Sequential Number: %llu\n"
1926                "Tail Sequential Number: %llu\n"
1927                "Head Realtime Timestamp: %s\n"
1928                "Tail Realtime Timestamp: %s\n"
1929                "Objects: %llu\n"
1930                "Entry Objects: %llu\n",
1931                f->path,
1932                sd_id128_to_string(f->header->file_id, a),
1933                sd_id128_to_string(f->header->machine_id, b),
1934                sd_id128_to_string(f->header->boot_id, c),
1935                sd_id128_to_string(f->header->seqnum_id, c),
1936                f->header->state == STATE_OFFLINE ? "OFFLINE" :
1937                f->header->state == STATE_ONLINE ? "ONLINE" :
1938                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
1939                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
1940                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
1941                JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
1942                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
1943                (unsigned long long) le64toh(f->header->header_size),
1944                (unsigned long long) le64toh(f->header->arena_size),
1945                (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
1946                (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
1947                yes_no(journal_file_rotate_suggested(f)),
1948                (unsigned long long) le64toh(f->header->head_entry_seqnum),
1949                (unsigned long long) le64toh(f->header->tail_entry_seqnum),
1950                format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
1951                format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
1952                (unsigned long long) le64toh(f->header->n_objects),
1953                (unsigned long long) le64toh(f->header->n_entries));
1954
1955         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1956                 printf("Data Objects: %llu\n"
1957                        "Data Hash Table Fill: %.1f%%\n",
1958                        (unsigned long long) le64toh(f->header->n_data),
1959                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
1960
1961         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1962                 printf("Field Objects: %llu\n"
1963                        "Field Hash Table Fill: %.1f%%\n",
1964                        (unsigned long long) le64toh(f->header->n_fields),
1965                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
1966
1967         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
1968                 printf("Tag Objects: %llu\n",
1969                        (unsigned long long) le64toh(f->header->n_tags));
1970         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1971                 printf("Entry Array Objects: %llu\n",
1972                        (unsigned long long) le64toh(f->header->n_entry_arrays));
1973 }
1974
/* Opens (and, for an empty writable file, initializes) a journal file.
 *
 * fname must end in ".journal" or ".journal~" and flags must request
 * O_RDONLY or O_RDWR access, otherwise -EINVAL is returned. If
 * mmap_cache is non-NULL a reference to it is taken, otherwise a new
 * cache is allocated. metrics and template are only consulted when the
 * file is opened writable.
 *
 * Returns 0 on success and stores the new object in *ret if ret is
 * non-NULL. On any failure the partially set up object is passed to
 * journal_file_close() and a negative errno-style code is returned. */
int journal_file_open(
                const char *fname,
                int flags,
                mode_t mode,
                bool compress,
                bool seal,
                JournalMetrics *metrics,
                MMapCache *mmap_cache,
                JournalFile *template,
                JournalFile **ret) {

        JournalFile *f;
        int r;
        bool newly_created = false;

        assert(fname);

        /* Only plain read-only or read-write access is supported */
        if ((flags & O_ACCMODE) != O_RDONLY &&
            (flags & O_ACCMODE) != O_RDWR)
                return -EINVAL;

        if (!endswith(fname, ".journal") &&
            !endswith(fname, ".journal~"))
                return -EINVAL;

        f = new0(JournalFile, 1);
        if (!f)
                return -ENOMEM;

        /* No descriptor yet; set before the first possible jump to
         * fail (presumably so journal_file_close() can tell -- TODO
         * confirm against journal_file_close()) */
        f->fd = -1;
        f->mode = mode;

        f->flags = flags;
        f->prot = prot_from_flags(flags);
        f->writable = (flags & O_ACCMODE) != O_RDONLY;
        f->compress = compress;
        f->seal = seal;

        /* Share the caller's mmap cache if one was passed in,
         * otherwise allocate a private one */
        if (mmap_cache)
                f->mmap = mmap_cache_ref(mmap_cache);
        else {
                f->mmap = mmap_cache_new();
                if (!f->mmap) {
                        r = -ENOMEM;
                        goto fail;
                }
        }

        f->path = strdup(fname);
        if (!f->path) {
                r = -ENOMEM;
                goto fail;
        }

        f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
        if (f->fd < 0) {
                r = -errno;
                goto fail;
        }

        if (fstat(f->fd, &f->last_stat) < 0) {
                r = -errno;
                goto fail;
        }

        /* An empty file opened for writing is treated as a new
         * journal: write a fresh header and re-read the file size */
        if (f->last_stat.st_size == 0 && f->writable) {
                newly_created = true;

#ifdef HAVE_GCRYPT
                /* Try to load the FSPRG state, and if we can't, then
                 * just don't do sealing */
                r = journal_file_fss_load(f);
                if (r < 0)
                        f->seal = false;
#endif

                r = journal_file_init_header(f, template);
                if (r < 0)
                        goto fail;

                if (fstat(f->fd, &f->last_stat) < 0) {
                        r = -errno;
                        goto fail;
                }
        }

        /* Too short to hold even the smallest header */
        if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
                r = -EIO;
                goto fail;
        }

        f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
        if (f->header == MAP_FAILED) {
                /* Reset to NULL so the fail path does not see
                 * MAP_FAILED as a header pointer */
                f->header = NULL;
                r = -errno;
                goto fail;
        }

        /* A header we just wrote ourselves needs no verification */
        if (!newly_created) {
                r = journal_file_verify_header(f);
                if (r < 0)
                        goto fail;
        }

#ifdef HAVE_GCRYPT
        /* For pre-existing writable files a failure to load the
         * sealing state is fatal, unlike in the newly created case
         * above */
        if (!newly_created && f->writable) {
                r = journal_file_fss_load(f);
                if (r < 0)
                        goto fail;
        }
#endif

        if (f->writable) {
                /* Explicit metrics take precedence over the
                 * template's; note that journal_default_metrics()
                 * updates the caller's structure in place before it
                 * is copied */
                if (metrics) {
                        journal_default_metrics(metrics, f->fd);
                        f->metrics = *metrics;
                } else if (template)
                        f->metrics = template->metrics;

                r = journal_file_refresh_header(f);
                if (r < 0)
                        goto fail;
        }

#ifdef HAVE_GCRYPT
        r = journal_file_hmac_setup(f);
        if (r < 0)
                goto fail;
#endif

        /* New files get their hash tables (and, with gcrypt, a first
         * tag) created before the tables are mapped below */
        if (newly_created) {
                r = journal_file_setup_field_hash_table(f);
                if (r < 0)
                        goto fail;

                r = journal_file_setup_data_hash_table(f);
                if (r < 0)
                        goto fail;

#ifdef HAVE_GCRYPT
                r = journal_file_append_first_tag(f);
                if (r < 0)
                        goto fail;
#endif
        }

        r = journal_file_map_field_hash_table(f);
        if (r < 0)
                goto fail;

        r = journal_file_map_data_hash_table(f);
        if (r < 0)
                goto fail;

        if (ret)
                *ret = f;

        return 0;

fail:
        /* Single cleanup path for every error after allocation of f */
        journal_file_close(f);

        return r;
}
2139
2140 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2141         char *p;
2142         size_t l;
2143         JournalFile *old_file, *new_file = NULL;
2144         int r;
2145
2146         assert(f);
2147         assert(*f);
2148
2149         old_file = *f;
2150
2151         if (!old_file->writable)
2152                 return -EINVAL;
2153
2154         if (!endswith(old_file->path, ".journal"))
2155                 return -EINVAL;
2156
2157         l = strlen(old_file->path);
2158
2159         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
2160         if (!p)
2161                 return -ENOMEM;
2162
2163         memcpy(p, old_file->path, l - 8);
2164         p[l-8] = '@';
2165         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2166         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2167                  "-%016llx-%016llx.journal",
2168                  (unsigned long long) le64toh((*f)->header->tail_entry_seqnum),
2169                  (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
2170
2171         r = rename(old_file->path, p);
2172         free(p);
2173
2174         if (r < 0)
2175                 return -errno;
2176
2177         old_file->header->state = STATE_ARCHIVED;
2178
2179         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2180         journal_file_close(old_file);
2181
2182         *f = new_file;
2183         return r;
2184 }
2185
2186 int journal_file_open_reliably(
2187                 const char *fname,
2188                 int flags,
2189                 mode_t mode,
2190                 bool compress,
2191                 bool seal,
2192                 JournalMetrics *metrics,
2193                 MMapCache *mmap_cache,
2194                 JournalFile *template,
2195                 JournalFile **ret) {
2196
2197         int r;
2198         size_t l;
2199         char *p;
2200
2201         r = journal_file_open(fname, flags, mode, compress, seal,
2202                               metrics, mmap_cache, template, ret);
2203         if (r != -EBADMSG && /* corrupted */
2204             r != -ENODATA && /* truncated */
2205             r != -EHOSTDOWN && /* other machine */
2206             r != -EPROTONOSUPPORT && /* incompatible feature */
2207             r != -EBUSY && /* unclean shutdown */
2208             r != -ESHUTDOWN /* already archived */)
2209                 return r;
2210
2211         if ((flags & O_ACCMODE) == O_RDONLY)
2212                 return r;
2213
2214         if (!(flags & O_CREAT))
2215                 return r;
2216
2217         if (!endswith(fname, ".journal"))
2218                 return r;
2219
2220         /* The file is corrupted. Rotate it away and try it again (but only once) */
2221
2222         l = strlen(fname);
2223         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2224                      (int) (l-8), fname,
2225                      (unsigned long long) now(CLOCK_REALTIME),
2226                      random_ull()) < 0)
2227                 return -ENOMEM;
2228
2229         r = rename(fname, p);
2230         free(p);
2231         if (r < 0)
2232                 return -errno;
2233
2234         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2235
2236         return journal_file_open(fname, flags, mode, compress, seal,
2237                                  metrics, mmap_cache, template, ret);
2238 }
2239
2240
2241 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2242         uint64_t i, n;
2243         uint64_t q, xor_hash = 0;
2244         int r;
2245         EntryItem *items;
2246         dual_timestamp ts;
2247
2248         assert(from);
2249         assert(to);
2250         assert(o);
2251         assert(p);
2252
2253         if (!to->writable)
2254                 return -EPERM;
2255
2256         ts.monotonic = le64toh(o->entry.monotonic);
2257         ts.realtime = le64toh(o->entry.realtime);
2258
2259         if (to->tail_entry_monotonic_valid &&
2260             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2261                 return -EINVAL;
2262
2263         n = journal_file_entry_n_items(o);
2264         items = alloca(sizeof(EntryItem) * n);
2265
2266         for (i = 0; i < n; i++) {
2267                 uint64_t l, h;
2268                 le64_t le_hash;
2269                 size_t t;
2270                 void *data;
2271                 Object *u;
2272
2273                 q = le64toh(o->entry.items[i].object_offset);
2274                 le_hash = o->entry.items[i].hash;
2275
2276                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2277                 if (r < 0)
2278                         return r;
2279
2280                 if (le_hash != o->data.hash)
2281                         return -EBADMSG;
2282
2283                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2284                 t = (size_t) l;
2285
2286                 /* We hit the limit on 32bit machines */
2287                 if ((uint64_t) t != l)
2288                         return -E2BIG;
2289
2290                 if (o->object.flags & OBJECT_COMPRESSED) {
2291 #ifdef HAVE_XZ
2292                         uint64_t rsize;
2293
2294                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2295                                 return -EBADMSG;
2296
2297                         data = from->compress_buffer;
2298                         l = rsize;
2299 #else
2300                         return -EPROTONOSUPPORT;
2301 #endif
2302                 } else
2303                         data = o->data.payload;
2304
2305                 r = journal_file_append_data(to, data, l, &u, &h);
2306                 if (r < 0)
2307                         return r;
2308
2309                 xor_hash ^= le64toh(u->data.hash);
2310                 items[i].object_offset = htole64(h);
2311                 items[i].hash = u->data.hash;
2312
2313                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2314                 if (r < 0)
2315                         return r;
2316         }
2317
2318         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2319 }
2320
2321 void journal_default_metrics(JournalMetrics *m, int fd) {
2322         uint64_t fs_size = 0;
2323         struct statvfs ss;
2324         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2325
2326         assert(m);
2327         assert(fd >= 0);
2328
2329         if (fstatvfs(fd, &ss) >= 0)
2330                 fs_size = ss.f_frsize * ss.f_blocks;
2331
2332         if (m->max_use == (uint64_t) -1) {
2333
2334                 if (fs_size > 0) {
2335                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2336
2337                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2338                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2339
2340                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2341                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2342                 } else
2343                         m->max_use = DEFAULT_MAX_USE_LOWER;
2344         } else {
2345                 m->max_use = PAGE_ALIGN(m->max_use);
2346
2347                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2348                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2349         }
2350
2351         if (m->max_size == (uint64_t) -1) {
2352                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2353
2354                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2355                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2356         } else
2357                 m->max_size = PAGE_ALIGN(m->max_size);
2358
2359         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2360                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2361
2362         if (m->max_size*2 > m->max_use)
2363                 m->max_use = m->max_size*2;
2364
2365         if (m->min_size == (uint64_t) -1)
2366                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2367         else {
2368                 m->min_size = PAGE_ALIGN(m->min_size);
2369
2370                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2371                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2372
2373                 if (m->min_size > m->max_size)
2374                         m->max_size = m->min_size;
2375         }
2376
2377         if (m->keep_free == (uint64_t) -1) {
2378
2379                 if (fs_size > 0) {
2380                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2381
2382                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2383                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2384
2385                 } else
2386                         m->keep_free = DEFAULT_KEEP_FREE;
2387         }
2388
2389         log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2390                  format_bytes(a, sizeof(a), m->max_use),
2391                  format_bytes(b, sizeof(b), m->max_size),
2392                  format_bytes(c, sizeof(c), m->min_size),
2393                  format_bytes(d, sizeof(d), m->keep_free));
2394 }
2395
2396 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2397         assert(f);
2398         assert(from || to);
2399
2400         if (from) {
2401                 if (f->header->head_entry_realtime == 0)
2402                         return -ENOENT;
2403
2404                 *from = le64toh(f->header->head_entry_realtime);
2405         }
2406
2407         if (to) {
2408                 if (f->header->tail_entry_realtime == 0)
2409                         return -ENOENT;
2410
2411                 *to = le64toh(f->header->tail_entry_realtime);
2412         }
2413
2414         return 1;
2415 }
2416
2417 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2418         char t[9+32+1] = "_BOOT_ID=";
2419         Object *o;
2420         uint64_t p;
2421         int r;
2422
2423         assert(f);
2424         assert(from || to);
2425
2426         sd_id128_to_string(boot_id, t + 9);
2427
2428         r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
2429         if (r <= 0)
2430                 return r;
2431
2432         if (le64toh(o->data.n_entries) <= 0)
2433                 return 0;
2434
2435         if (from) {
2436                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2437                 if (r < 0)
2438                         return r;
2439
2440                 *from = le64toh(o->entry.monotonic);
2441         }
2442
2443         if (to) {
2444                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2445                 if (r < 0)
2446                         return r;
2447
2448                 r = generic_array_get_plus_one(f,
2449                                                le64toh(o->data.entry_offset),
2450                                                le64toh(o->data.entry_array_offset),
2451                                                le64toh(o->data.n_entries)-1,
2452                                                &o, NULL);
2453                 if (r <= 0)
2454                         return r;
2455
2456                 *to = le64toh(o->entry.monotonic);
2457         }
2458
2459         return 1;
2460 }
2461
2462 bool journal_file_rotate_suggested(JournalFile *f) {
2463         assert(f);
2464
2465         /* If we gained new header fields we gained new features,
2466          * hence suggest a rotation */
2467         if (le64toh(f->header->header_size) < sizeof(Header)) {
2468                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2469                 return true;
2470         }
2471
2472         /* Let's check if the hash tables grew over a certain fill
2473          * level (75%, borrowing this value from Java's hash table
2474          * implementation), and if so suggest a rotation. To calculate
2475          * the fill level we need the n_data field, which only exists
2476          * in newer versions. */
2477
2478         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2479                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2480                         log_debug("Data hash table of %s has a fill level at %.1f (%llu of %llu items, %llu file size, %llu bytes per hash table item), suggesting rotation.",
2481                                   f->path,
2482                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2483                                   (unsigned long long) le64toh(f->header->n_data),
2484                                   (unsigned long long) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)),
2485                                   (unsigned long long) (f->last_stat.st_size),
2486                                   (unsigned long long) (f->last_stat.st_size / le64toh(f->header->n_data)));
2487                         return true;
2488                 }
2489
2490         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2491                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2492                         log_debug("Field hash table of %s has a fill level at %.1f (%llu of %llu items), suggesting rotation.",
2493                                   f->path,
2494                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2495                                   (unsigned long long) le64toh(f->header->n_fields),
2496                                   (unsigned long long) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)));
2497                         return true;
2498                 }
2499
2500         return false;
2501 }