chiark / gitweb /
journal: add superficial structure verifier
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "lookup3.h"
33 #include "compress.h"
34 #include "fsprg.h"
35
/* Rounds x up to the next multiple of 8 */
#define ALIGN64(x) (((x) + 7ULL) & ~7ULL)

/* Hash table sizes in bytes: the smallest data hash table we ever
 * create, and the size of every field hash table we create */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Payloads shorter than this many bytes are stored uncompressed */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* Smallest size a journal file may have */
#define JOURNAL_FILE_SIZE_MIN (64ULL << 10)                    /* 64 KiB */

/* Clamp range applied to max_use when it is derived from the size of
 * the file system */
#define DEFAULT_MAX_USE_LOWER (1ULL << 20)                     /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL << 30)                     /* 4 GiB */

/* Cap applied to max_size when it is derived from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL << 20)                  /* 128 MiB */

/* Cap applied to keep_free when it is derived from the size of the
 * file system */
#define DEFAULT_KEEP_FREE_UPPER (4ULL << 30)                   /* 4 GiB */

/* keep_free fallback used when the file system size is unknown */
#define DEFAULT_KEEP_FREE (1ULL << 20)                         /* 1 MiB */

/* n_data was the first field appended after the initial file format
 * design, hence everything in front of it is mandatory */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* True if the on-disk header of h is large enough to include field */
#define JOURNAL_HEADER_CONTAINS(h, field) \
        (le64toh((h)->header_size) >= offsetof(Header, field) + sizeof((h)->field))
67
68 static int journal_file_maybe_append_tag(JournalFile *f, uint64_t realtime);
69 static int journal_file_hmac_put_object(JournalFile *f, int type, uint64_t p);
70
71 void journal_file_close(JournalFile *f) {
72         assert(f);
73
74         /* Write the final tag */
75         if (f->authenticate)
76                 journal_file_append_tag(f);
77
78         /* Sync everything to disk, before we mark the file offline */
79         if (f->mmap && f->fd >= 0)
80                 mmap_cache_close_fd(f->mmap, f->fd);
81
82         if (f->writable && f->fd >= 0)
83                 fdatasync(f->fd);
84
85         if (f->header) {
86                 /* Mark the file offline. Don't override the archived state if it already is set */
87                 if (f->writable && f->header->state == STATE_ONLINE)
88                         f->header->state = STATE_OFFLINE;
89
90                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
91         }
92
93         if (f->fd >= 0)
94                 close_nointr_nofail(f->fd);
95
96         free(f->path);
97
98         if (f->mmap)
99                 mmap_cache_unref(f->mmap);
100
101 #ifdef HAVE_XZ
102         free(f->compress_buffer);
103 #endif
104
105 #ifdef HAVE_GCRYPT
106         if (f->fsprg_header)
107                 munmap(f->fsprg_header, PAGE_ALIGN(f->fsprg_size));
108
109         if (f->hmac)
110                 gcry_md_close(f->hmac);
111 #endif
112
113         free(f);
114 }
115
116 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
117         Header h;
118         ssize_t k;
119         int r;
120
121         assert(f);
122
123         zero(h);
124         memcpy(h.signature, HEADER_SIGNATURE, 8);
125         h.header_size = htole64(ALIGN64(sizeof(h)));
126
127         h.incompatible_flags =
128                 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
129
130         h.compatible_flags =
131                 htole32(f->authenticate ? HEADER_COMPATIBLE_AUTHENTICATED : 0);
132
133         r = sd_id128_randomize(&h.file_id);
134         if (r < 0)
135                 return r;
136
137         if (template) {
138                 h.seqnum_id = template->header->seqnum_id;
139                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
140         } else
141                 h.seqnum_id = h.file_id;
142
143         k = pwrite(f->fd, &h, sizeof(h), 0);
144         if (k < 0)
145                 return -errno;
146
147         if (k != sizeof(h))
148                 return -EIO;
149
150         return 0;
151 }
152
153 static int journal_file_refresh_header(JournalFile *f) {
154         int r;
155         sd_id128_t boot_id;
156
157         assert(f);
158
159         r = sd_id128_get_machine(&f->header->machine_id);
160         if (r < 0)
161                 return r;
162
163         r = sd_id128_get_boot(&boot_id);
164         if (r < 0)
165                 return r;
166
167         if (sd_id128_equal(boot_id, f->header->boot_id))
168                 f->tail_entry_monotonic_valid = true;
169
170         f->header->boot_id = boot_id;
171
172         f->header->state = STATE_ONLINE;
173
174         /* Sync the online state to disk */
175         msync(f->header, PAGE_ALIGN(sizeof(Header)), MS_SYNC);
176         fdatasync(f->fd);
177
178         return 0;
179 }
180
181 static int journal_file_verify_header(JournalFile *f) {
182         assert(f);
183
184         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
185                 return -EBADMSG;
186
187         /* In both read and write mode we refuse to open files with
188          * incompatible flags we don't know */
189 #ifdef HAVE_XZ
190         if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
191                 return -EPROTONOSUPPORT;
192 #else
193         if (f->header->incompatible_flags != 0)
194                 return -EPROTONOSUPPORT;
195 #endif
196
197         /* When open for writing we refuse to open files with
198          * compatible flags, too */
199         if (f->writable) {
200 #ifdef HAVE_GCRYPT
201                 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_AUTHENTICATED) != 0)
202                         return -EPROTONOSUPPORT;
203 #else
204                 if (f->header->compatible_flags != 0)
205                         return -EPROTONOSUPPORT;
206 #endif
207         }
208
209         /* The first addition was n_data, so check that we are at least this large */
210         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
211                 return -EBADMSG;
212
213         if ((le32toh(f->header->compatible_flags) & HEADER_COMPATIBLE_AUTHENTICATED) &&
214                 !JOURNAL_HEADER_CONTAINS(f->header, n_tags))
215                 return -EBADMSG;
216
217         if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
218                 return -ENODATA;
219
220         if (f->writable) {
221                 uint8_t state;
222                 sd_id128_t machine_id;
223                 int r;
224
225                 r = sd_id128_get_machine(&machine_id);
226                 if (r < 0)
227                         return r;
228
229                 if (!sd_id128_equal(machine_id, f->header->machine_id))
230                         return -EHOSTDOWN;
231
232                 state = f->header->state;
233
234                 if (state == STATE_ONLINE) {
235                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
236                         return -EBUSY;
237                 } else if (state == STATE_ARCHIVED)
238                         return -ESHUTDOWN;
239                 else if (state != STATE_OFFLINE) {
240                         log_debug("Journal file %s has unknown state %u.", f->path, state);
241                         return -EBUSY;
242                 }
243         }
244
245         f->compress = !!(le32toh(f->header->incompatible_flags) & HEADER_INCOMPATIBLE_COMPRESSED);
246         f->authenticate = !!(le32toh(f->header->compatible_flags) & HEADER_COMPATIBLE_AUTHENTICATED);
247
248         return 0;
249 }
250
251 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
252         uint64_t old_size, new_size;
253         int r;
254
255         assert(f);
256
257         /* We assume that this file is not sparse, and we know that
258          * for sure, since we always call posix_fallocate()
259          * ourselves */
260
261         old_size =
262                 le64toh(f->header->header_size) +
263                 le64toh(f->header->arena_size);
264
265         new_size = PAGE_ALIGN(offset + size);
266         if (new_size < le64toh(f->header->header_size))
267                 new_size = le64toh(f->header->header_size);
268
269         if (new_size <= old_size)
270                 return 0;
271
272         if (f->metrics.max_size > 0 &&
273             new_size > f->metrics.max_size)
274                 return -E2BIG;
275
276         if (new_size > f->metrics.min_size &&
277             f->metrics.keep_free > 0) {
278                 struct statvfs svfs;
279
280                 if (fstatvfs(f->fd, &svfs) >= 0) {
281                         uint64_t available;
282
283                         available = svfs.f_bfree * svfs.f_bsize;
284
285                         if (available >= f->metrics.keep_free)
286                                 available -= f->metrics.keep_free;
287                         else
288                                 available = 0;
289
290                         if (new_size - old_size > available)
291                                 return -E2BIG;
292                 }
293         }
294
295         /* Note that the glibc fallocate() fallback is very
296            inefficient, hence we try to minimize the allocation area
297            as we can. */
298         r = posix_fallocate(f->fd, old_size, new_size - old_size);
299         if (r != 0)
300                 return -r;
301
302         mmap_cache_close_fd_range(f->mmap, f->fd, old_size);
303
304         if (fstat(f->fd, &f->last_stat) < 0)
305                 return -errno;
306
307         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
308
309         return 0;
310 }
311
312 static int journal_file_move_to(JournalFile *f, int context, uint64_t offset, uint64_t size, void **ret) {
313         assert(f);
314         assert(ret);
315
316         /* Avoid SIGBUS on invalid accesses */
317         if (offset + size > (uint64_t) f->last_stat.st_size) {
318                 /* Hmm, out of range? Let's refresh the fstat() data
319                  * first, before we trust that check. */
320
321                 if (fstat(f->fd, &f->last_stat) < 0 ||
322                     offset + size > (uint64_t) f->last_stat.st_size)
323                         return -EADDRNOTAVAIL;
324         }
325
326         return mmap_cache_get(f->mmap, f->fd, f->prot, context, offset, size, ret);
327 }
328
329 static bool verify_hash(Object *o) {
330         uint64_t h1, h2;
331
332         assert(o);
333
334         if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
335                 h1 = le64toh(o->data.hash);
336                 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
337         } else if (o->object.type == OBJECT_FIELD) {
338                 h1 = le64toh(o->field.hash);
339                 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
340         } else
341                 return true;
342
343         return h1 == h2;
344 }
345
346 static uint64_t minimum_header_size(Object *o) {
347
348         static uint64_t table[] = {
349                 [OBJECT_DATA] = sizeof(DataObject),
350                 [OBJECT_FIELD] = sizeof(FieldObject),
351                 [OBJECT_ENTRY] = sizeof(EntryObject),
352                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
353                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
354                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
355                 [OBJECT_TAG] = sizeof(TagObject),
356         };
357
358         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
359                 return sizeof(ObjectHeader);
360
361         return table[o->object.type];
362 }
363
364 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
365         int r;
366         void *t;
367         Object *o;
368         uint64_t s;
369         unsigned context;
370
371         assert(f);
372         assert(ret);
373
374         /* One context for each type, plus one catch-all for the rest */
375         context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
376
377         r = journal_file_move_to(f, context, offset, sizeof(ObjectHeader), &t);
378         if (r < 0)
379                 return r;
380
381         o = (Object*) t;
382         s = le64toh(o->object.size);
383
384         if (s < sizeof(ObjectHeader))
385                 return -EBADMSG;
386
387         if (o->object.type <= OBJECT_UNUSED)
388                 return -EBADMSG;
389
390         if (s < minimum_header_size(o))
391                 return -EBADMSG;
392
393         if (type >= 0 && o->object.type != type)
394                 return -EBADMSG;
395
396         if (s > sizeof(ObjectHeader)) {
397                 r = journal_file_move_to(f, o->object.type, offset, s, &t);
398                 if (r < 0)
399                         return r;
400
401                 o = (Object*) t;
402         }
403
404         if (!verify_hash(o))
405                 return -EBADMSG;
406
407         *ret = o;
408         return 0;
409 }
410
411 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
412         uint64_t r;
413
414         assert(f);
415
416         r = le64toh(f->header->tail_entry_seqnum) + 1;
417
418         if (seqnum) {
419                 /* If an external seqnum counter was passed, we update
420                  * both the local and the external one, and set it to
421                  * the maximum of both */
422
423                 if (*seqnum + 1 > r)
424                         r = *seqnum + 1;
425
426                 *seqnum = r;
427         }
428
429         f->header->tail_entry_seqnum = htole64(r);
430
431         if (f->header->head_entry_seqnum == 0)
432                 f->header->head_entry_seqnum = htole64(r);
433
434         return r;
435 }
436
437 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
438         int r;
439         uint64_t p;
440         Object *tail, *o;
441         void *t;
442
443         assert(f);
444         assert(type > 0 && type < _OBJECT_TYPE_MAX);
445         assert(size >= sizeof(ObjectHeader));
446         assert(offset);
447         assert(ret);
448
449         p = le64toh(f->header->tail_object_offset);
450         if (p == 0)
451                 p = le64toh(f->header->header_size);
452         else {
453                 r = journal_file_move_to_object(f, -1, p, &tail);
454                 if (r < 0)
455                         return r;
456
457                 p += ALIGN64(le64toh(tail->object.size));
458         }
459
460         r = journal_file_allocate(f, p, size);
461         if (r < 0)
462                 return r;
463
464         r = journal_file_move_to(f, type, p, size, &t);
465         if (r < 0)
466                 return r;
467
468         o = (Object*) t;
469
470         zero(o->object);
471         o->object.type = type;
472         o->object.size = htole64(size);
473
474         f->header->tail_object_offset = htole64(p);
475         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
476
477         *ret = o;
478         *offset = p;
479
480         return 0;
481 }
482
483 static int journal_file_setup_data_hash_table(JournalFile *f) {
484         uint64_t s, p;
485         Object *o;
486         int r;
487
488         assert(f);
489
490         /* We estimate that we need 1 hash table entry per 768 of
491            journal file and we want to make sure we never get beyond
492            75% fill level. Calculate the hash table size for the
493            maximum file size based on these metrics. */
494
495         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
496         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
497                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
498
499         log_info("Reserving %llu entries in hash table.", (unsigned long long) (s / sizeof(HashItem)));
500
501         r = journal_file_append_object(f,
502                                        OBJECT_DATA_HASH_TABLE,
503                                        offsetof(Object, hash_table.items) + s,
504                                        &o, &p);
505         if (r < 0)
506                 return r;
507
508         memset(o->hash_table.items, 0, s);
509
510         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
511         f->header->data_hash_table_size = htole64(s);
512
513         return 0;
514 }
515
516 static int journal_file_setup_field_hash_table(JournalFile *f) {
517         uint64_t s, p;
518         Object *o;
519         int r;
520
521         assert(f);
522
523         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
524         r = journal_file_append_object(f,
525                                        OBJECT_FIELD_HASH_TABLE,
526                                        offsetof(Object, hash_table.items) + s,
527                                        &o, &p);
528         if (r < 0)
529                 return r;
530
531         memset(o->hash_table.items, 0, s);
532
533         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
534         f->header->field_hash_table_size = htole64(s);
535
536         return 0;
537 }
538
539 static int journal_file_map_data_hash_table(JournalFile *f) {
540         uint64_t s, p;
541         void *t;
542         int r;
543
544         assert(f);
545
546         p = le64toh(f->header->data_hash_table_offset);
547         s = le64toh(f->header->data_hash_table_size);
548
549         r = journal_file_move_to(f,
550                                  OBJECT_DATA_HASH_TABLE,
551                                  p, s,
552                                  &t);
553         if (r < 0)
554                 return r;
555
556         f->data_hash_table = t;
557         return 0;
558 }
559
560 static int journal_file_map_field_hash_table(JournalFile *f) {
561         uint64_t s, p;
562         void *t;
563         int r;
564
565         assert(f);
566
567         p = le64toh(f->header->field_hash_table_offset);
568         s = le64toh(f->header->field_hash_table_size);
569
570         r = journal_file_move_to(f,
571                                  OBJECT_FIELD_HASH_TABLE,
572                                  p, s,
573                                  &t);
574         if (r < 0)
575                 return r;
576
577         f->field_hash_table = t;
578         return 0;
579 }
580
581 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
582         uint64_t p, h;
583         int r;
584
585         assert(f);
586         assert(o);
587         assert(offset > 0);
588         assert(o->object.type == OBJECT_DATA);
589
590         /* This might alter the window we are looking at */
591
592         o->data.next_hash_offset = o->data.next_field_offset = 0;
593         o->data.entry_offset = o->data.entry_array_offset = 0;
594         o->data.n_entries = 0;
595
596         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
597         p = le64toh(f->data_hash_table[h].tail_hash_offset);
598         if (p == 0) {
599                 /* Only entry in the hash table is easy */
600                 f->data_hash_table[h].head_hash_offset = htole64(offset);
601         } else {
602                 /* Move back to the previous data object, to patch in
603                  * pointer */
604
605                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
606                 if (r < 0)
607                         return r;
608
609                 o->data.next_hash_offset = htole64(offset);
610         }
611
612         f->data_hash_table[h].tail_hash_offset = htole64(offset);
613
614         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
615                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
616
617         return 0;
618 }
619
620 int journal_file_find_data_object_with_hash(
621                 JournalFile *f,
622                 const void *data, uint64_t size, uint64_t hash,
623                 Object **ret, uint64_t *offset) {
624
625         uint64_t p, osize, h;
626         int r;
627
628         assert(f);
629         assert(data || size == 0);
630
631         osize = offsetof(Object, data.payload) + size;
632
633         if (f->header->data_hash_table_size == 0)
634                 return -EBADMSG;
635
636         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
637         p = le64toh(f->data_hash_table[h].head_hash_offset);
638
639         while (p > 0) {
640                 Object *o;
641
642                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
643                 if (r < 0)
644                         return r;
645
646                 if (le64toh(o->data.hash) != hash)
647                         goto next;
648
649                 if (o->object.flags & OBJECT_COMPRESSED) {
650 #ifdef HAVE_XZ
651                         uint64_t l, rsize;
652
653                         l = le64toh(o->object.size);
654                         if (l <= offsetof(Object, data.payload))
655                                 return -EBADMSG;
656
657                         l -= offsetof(Object, data.payload);
658
659                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
660                                 return -EBADMSG;
661
662                         if (rsize == size &&
663                             memcmp(f->compress_buffer, data, size) == 0) {
664
665                                 if (ret)
666                                         *ret = o;
667
668                                 if (offset)
669                                         *offset = p;
670
671                                 return 1;
672                         }
673 #else
674                         return -EPROTONOSUPPORT;
675 #endif
676
677                 } else if (le64toh(o->object.size) == osize &&
678                            memcmp(o->data.payload, data, size) == 0) {
679
680                         if (ret)
681                                 *ret = o;
682
683                         if (offset)
684                                 *offset = p;
685
686                         return 1;
687                 }
688
689         next:
690                 p = le64toh(o->data.next_hash_offset);
691         }
692
693         return 0;
694 }
695
696 int journal_file_find_data_object(
697                 JournalFile *f,
698                 const void *data, uint64_t size,
699                 Object **ret, uint64_t *offset) {
700
701         uint64_t hash;
702
703         assert(f);
704         assert(data || size == 0);
705
706         hash = hash64(data, size);
707
708         return journal_file_find_data_object_with_hash(f,
709                                                        data, size, hash,
710                                                        ret, offset);
711 }
712
713 static int journal_file_append_data(
714                 JournalFile *f,
715                 const void *data, uint64_t size,
716                 Object **ret, uint64_t *offset) {
717
718         uint64_t hash, p;
719         uint64_t osize;
720         Object *o;
721         int r;
722         bool compressed = false;
723
724         assert(f);
725         assert(data || size == 0);
726
727         hash = hash64(data, size);
728
729         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
730         if (r < 0)
731                 return r;
732         else if (r > 0) {
733
734                 if (ret)
735                         *ret = o;
736
737                 if (offset)
738                         *offset = p;
739
740                 return 0;
741         }
742
743         osize = offsetof(Object, data.payload) + size;
744         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
745         if (r < 0)
746                 return r;
747
748         o->data.hash = htole64(hash);
749
750 #ifdef HAVE_XZ
751         if (f->compress &&
752             size >= COMPRESSION_SIZE_THRESHOLD) {
753                 uint64_t rsize;
754
755                 compressed = compress_blob(data, size, o->data.payload, &rsize);
756
757                 if (compressed) {
758                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
759                         o->object.flags |= OBJECT_COMPRESSED;
760
761                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
762                 }
763         }
764 #endif
765
766         if (!compressed && size > 0)
767                 memcpy(o->data.payload, data, size);
768
769         r = journal_file_link_data(f, o, p, hash);
770         if (r < 0)
771                 return r;
772
773         r = journal_file_hmac_put_object(f, OBJECT_DATA, p);
774         if (r < 0)
775                 return r;
776
777         /* The linking might have altered the window, so let's
778          * refresh our pointer */
779         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
780         if (r < 0)
781                 return r;
782
783         if (ret)
784                 *ret = o;
785
786         if (offset)
787                 *offset = p;
788
789         return 0;
790 }
791
792 uint64_t journal_file_entry_n_items(Object *o) {
793         assert(o);
794         assert(o->object.type == OBJECT_ENTRY);
795
796         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
797 }
798
799 static uint64_t journal_file_entry_array_n_items(Object *o) {
800         assert(o);
801         assert(o->object.type == OBJECT_ENTRY_ARRAY);
802
803         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
804 }
805
806 static int link_entry_into_array(JournalFile *f,
807                                  le64_t *first,
808                                  le64_t *idx,
809                                  uint64_t p) {
810         int r;
811         uint64_t n = 0, ap = 0, q, i, a, hidx;
812         Object *o;
813
814         assert(f);
815         assert(first);
816         assert(idx);
817         assert(p > 0);
818
819         a = le64toh(*first);
820         i = hidx = le64toh(*idx);
821         while (a > 0) {
822
823                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
824                 if (r < 0)
825                         return r;
826
827                 n = journal_file_entry_array_n_items(o);
828                 if (i < n) {
829                         o->entry_array.items[i] = htole64(p);
830                         *idx = htole64(hidx + 1);
831                         return 0;
832                 }
833
834                 i -= n;
835                 ap = a;
836                 a = le64toh(o->entry_array.next_entry_array_offset);
837         }
838
839         if (hidx > n)
840                 n = (hidx+1) * 2;
841         else
842                 n = n * 2;
843
844         if (n < 4)
845                 n = 4;
846
847         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
848                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
849                                        &o, &q);
850         if (r < 0)
851                 return r;
852
853         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, q);
854         if (r < 0)
855                 return r;
856
857         o->entry_array.items[i] = htole64(p);
858
859         if (ap == 0)
860                 *first = htole64(q);
861         else {
862                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
863                 if (r < 0)
864                         return r;
865
866                 o->entry_array.next_entry_array_offset = htole64(q);
867         }
868
869         *idx = htole64(hidx + 1);
870
871         return 0;
872 }
873
874 static int link_entry_into_array_plus_one(JournalFile *f,
875                                           le64_t *extra,
876                                           le64_t *first,
877                                           le64_t *idx,
878                                           uint64_t p) {
879
880         int r;
881
882         assert(f);
883         assert(extra);
884         assert(first);
885         assert(idx);
886         assert(p > 0);
887
888         if (*idx == 0)
889                 *extra = htole64(p);
890         else {
891                 le64_t i;
892
893                 i = htole64(le64toh(*idx) - 1);
894                 r = link_entry_into_array(f, first, &i, p);
895                 if (r < 0)
896                         return r;
897         }
898
899         *idx = htole64(le64toh(*idx) + 1);
900         return 0;
901 }
902
903 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
904         uint64_t p;
905         int r;
906         assert(f);
907         assert(o);
908         assert(offset > 0);
909
910         p = le64toh(o->entry.items[i].object_offset);
911         if (p == 0)
912                 return -EINVAL;
913
914         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
915         if (r < 0)
916                 return r;
917
918         return link_entry_into_array_plus_one(f,
919                                               &o->data.entry_offset,
920                                               &o->data.entry_array_offset,
921                                               &o->data.n_entries,
922                                               offset);
923 }
924
925 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
926         uint64_t n, i;
927         int r;
928
929         assert(f);
930         assert(o);
931         assert(offset > 0);
932         assert(o->object.type == OBJECT_ENTRY);
933
934         __sync_synchronize();
935
936         /* Link up the entry itself */
937         r = link_entry_into_array(f,
938                                   &f->header->entry_array_offset,
939                                   &f->header->n_entries,
940                                   offset);
941         if (r < 0)
942                 return r;
943
944         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
945
946         if (f->header->head_entry_realtime == 0)
947                 f->header->head_entry_realtime = o->entry.realtime;
948
949         f->header->tail_entry_realtime = o->entry.realtime;
950         f->header->tail_entry_monotonic = o->entry.monotonic;
951
952         f->tail_entry_monotonic_valid = true;
953
954         /* Link up the items */
955         n = journal_file_entry_n_items(o);
956         for (i = 0; i < n; i++) {
957                 r = journal_file_link_entry_item(f, o, offset, i);
958                 if (r < 0)
959                         return r;
960         }
961
962         return 0;
963 }
964
965 static int journal_file_append_entry_internal(
966                 JournalFile *f,
967                 const dual_timestamp *ts,
968                 uint64_t xor_hash,
969                 const EntryItem items[], unsigned n_items,
970                 uint64_t *seqnum,
971                 Object **ret, uint64_t *offset) {
972         uint64_t np;
973         uint64_t osize;
974         Object *o;
975         int r;
976
977         assert(f);
978         assert(items || n_items == 0);
979         assert(ts);
980
981         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
982
983         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
984         if (r < 0)
985                 return r;
986
987         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
988         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
989         o->entry.realtime = htole64(ts->realtime);
990         o->entry.monotonic = htole64(ts->monotonic);
991         o->entry.xor_hash = htole64(xor_hash);
992         o->entry.boot_id = f->header->boot_id;
993
994         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, np);
995         if (r < 0)
996                 return r;
997
998         r = journal_file_link_entry(f, o, np);
999         if (r < 0)
1000                 return r;
1001
1002         if (ret)
1003                 *ret = o;
1004
1005         if (offset)
1006                 *offset = np;
1007
1008         return 0;
1009 }
1010
1011 void journal_file_post_change(JournalFile *f) {
1012         assert(f);
1013
1014         /* inotify() does not receive IN_MODIFY events from file
1015          * accesses done via mmap(). After each access we hence
1016          * trigger IN_MODIFY by truncating the journal file to its
1017          * current size which triggers IN_MODIFY. */
1018
1019         __sync_synchronize();
1020
1021         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1022                 log_error("Failed to to truncate file to its own size: %m");
1023 }
1024
1025 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1026         unsigned i;
1027         EntryItem *items;
1028         int r;
1029         uint64_t xor_hash = 0;
1030         struct dual_timestamp _ts;
1031
1032         assert(f);
1033         assert(iovec || n_iovec == 0);
1034
1035         if (!f->writable)
1036                 return -EPERM;
1037
1038         if (!ts) {
1039                 dual_timestamp_get(&_ts);
1040                 ts = &_ts;
1041         }
1042
1043         if (f->tail_entry_monotonic_valid &&
1044             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1045                 return -EINVAL;
1046
1047         r = journal_file_maybe_append_tag(f, ts->realtime);
1048         if (r < 0)
1049                 return r;
1050
1051         /* alloca() can't take 0, hence let's allocate at least one */
1052         items = alloca(sizeof(EntryItem) * MAX(1, n_iovec));
1053
1054         for (i = 0; i < n_iovec; i++) {
1055                 uint64_t p;
1056                 Object *o;
1057
1058                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1059                 if (r < 0)
1060                         return r;
1061
1062                 xor_hash ^= le64toh(o->data.hash);
1063                 items[i].object_offset = htole64(p);
1064                 items[i].hash = o->data.hash;
1065         }
1066
1067         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1068
1069         journal_file_post_change(f);
1070
1071         return r;
1072 }
1073
1074 static int generic_array_get(JournalFile *f,
1075                              uint64_t first,
1076                              uint64_t i,
1077                              Object **ret, uint64_t *offset) {
1078
1079         Object *o;
1080         uint64_t p = 0, a;
1081         int r;
1082
1083         assert(f);
1084
1085         a = first;
1086         while (a > 0) {
1087                 uint64_t n;
1088
1089                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1090                 if (r < 0)
1091                         return r;
1092
1093                 n = journal_file_entry_array_n_items(o);
1094                 if (i < n) {
1095                         p = le64toh(o->entry_array.items[i]);
1096                         break;
1097                 }
1098
1099                 i -= n;
1100                 a = le64toh(o->entry_array.next_entry_array_offset);
1101         }
1102
1103         if (a <= 0 || p <= 0)
1104                 return 0;
1105
1106         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1107         if (r < 0)
1108                 return r;
1109
1110         if (ret)
1111                 *ret = o;
1112
1113         if (offset)
1114                 *offset = p;
1115
1116         return 1;
1117 }
1118
1119 static int generic_array_get_plus_one(JournalFile *f,
1120                                       uint64_t extra,
1121                                       uint64_t first,
1122                                       uint64_t i,
1123                                       Object **ret, uint64_t *offset) {
1124
1125         Object *o;
1126
1127         assert(f);
1128
1129         if (i == 0) {
1130                 int r;
1131
1132                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1133                 if (r < 0)
1134                         return r;
1135
1136                 if (ret)
1137                         *ret = o;
1138
1139                 if (offset)
1140                         *offset = extra;
1141
1142                 return 1;
1143         }
1144
1145         return generic_array_get(f, first, i-1, ret, offset);
1146 }
1147
/* Results of the test_object() callbacks used for bisection */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle */
        TEST_LEFT  = 1, /* the tested object sorts before the needle */
        TEST_RIGHT = 2  /* the tested object sorts after the needle */
};
1153
1154 static int generic_array_bisect(JournalFile *f,
1155                                 uint64_t first,
1156                                 uint64_t n,
1157                                 uint64_t needle,
1158                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1159                                 direction_t direction,
1160                                 Object **ret,
1161                                 uint64_t *offset,
1162                                 uint64_t *idx) {
1163
1164         uint64_t a, p, t = 0, i = 0, last_p = 0;
1165         bool subtract_one = false;
1166         Object *o, *array = NULL;
1167         int r;
1168
1169         assert(f);
1170         assert(test_object);
1171
1172         a = first;
1173         while (a > 0) {
1174                 uint64_t left, right, k, lp;
1175
1176                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1177                 if (r < 0)
1178                         return r;
1179
1180                 k = journal_file_entry_array_n_items(array);
1181                 right = MIN(k, n);
1182                 if (right <= 0)
1183                         return 0;
1184
1185                 i = right - 1;
1186                 lp = p = le64toh(array->entry_array.items[i]);
1187                 if (p <= 0)
1188                         return -EBADMSG;
1189
1190                 r = test_object(f, p, needle);
1191                 if (r < 0)
1192                         return r;
1193
1194                 if (r == TEST_FOUND)
1195                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1196
1197                 if (r == TEST_RIGHT) {
1198                         left = 0;
1199                         right -= 1;
1200                         for (;;) {
1201                                 if (left == right) {
1202                                         if (direction == DIRECTION_UP)
1203                                                 subtract_one = true;
1204
1205                                         i = left;
1206                                         goto found;
1207                                 }
1208
1209                                 assert(left < right);
1210
1211                                 i = (left + right) / 2;
1212                                 p = le64toh(array->entry_array.items[i]);
1213                                 if (p <= 0)
1214                                         return -EBADMSG;
1215
1216                                 r = test_object(f, p, needle);
1217                                 if (r < 0)
1218                                         return r;
1219
1220                                 if (r == TEST_FOUND)
1221                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1222
1223                                 if (r == TEST_RIGHT)
1224                                         right = i;
1225                                 else
1226                                         left = i + 1;
1227                         }
1228                 }
1229
1230                 if (k > n) {
1231                         if (direction == DIRECTION_UP) {
1232                                 i = n;
1233                                 subtract_one = true;
1234                                 goto found;
1235                         }
1236
1237                         return 0;
1238                 }
1239
1240                 last_p = lp;
1241
1242                 n -= k;
1243                 t += k;
1244                 a = le64toh(array->entry_array.next_entry_array_offset);
1245         }
1246
1247         return 0;
1248
1249 found:
1250         if (subtract_one && t == 0 && i == 0)
1251                 return 0;
1252
1253         if (subtract_one && i == 0)
1254                 p = last_p;
1255         else if (subtract_one)
1256                 p = le64toh(array->entry_array.items[i-1]);
1257         else
1258                 p = le64toh(array->entry_array.items[i]);
1259
1260         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1261         if (r < 0)
1262                 return r;
1263
1264         if (ret)
1265                 *ret = o;
1266
1267         if (offset)
1268                 *offset = p;
1269
1270         if (idx)
1271                 *idx = t + i + (subtract_one ? -1 : 0);
1272
1273         return 1;
1274 }
1275
1276 static int generic_array_bisect_plus_one(JournalFile *f,
1277                                          uint64_t extra,
1278                                          uint64_t first,
1279                                          uint64_t n,
1280                                          uint64_t needle,
1281                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1282                                          direction_t direction,
1283                                          Object **ret,
1284                                          uint64_t *offset,
1285                                          uint64_t *idx) {
1286
1287         int r;
1288         bool step_back = false;
1289         Object *o;
1290
1291         assert(f);
1292         assert(test_object);
1293
1294         if (n <= 0)
1295                 return 0;
1296
1297         /* This bisects the array in object 'first', but first checks
1298          * an extra  */
1299         r = test_object(f, extra, needle);
1300         if (r < 0)
1301                 return r;
1302
1303         if (r == TEST_FOUND)
1304                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1305
1306         /* if we are looking with DIRECTION_UP then we need to first
1307            see if in the actual array there is a matching entry, and
1308            return the last one of that. But if there isn't any we need
1309            to return this one. Hence remember this, and return it
1310            below. */
1311         if (r == TEST_LEFT)
1312                 step_back = direction == DIRECTION_UP;
1313
1314         if (r == TEST_RIGHT) {
1315                 if (direction == DIRECTION_DOWN)
1316                         goto found;
1317                 else
1318                         return 0;
1319         }
1320
1321         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1322
1323         if (r == 0 && step_back)
1324                 goto found;
1325
1326         if (r > 0 && idx)
1327                 (*idx) ++;
1328
1329         return r;
1330
1331 found:
1332         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1333         if (r < 0)
1334                 return r;
1335
1336         if (ret)
1337                 *ret = o;
1338
1339         if (offset)
1340                 *offset = extra;
1341
1342         if (idx)
1343                 *idx = 0;
1344
1345         return 1;
1346 }
1347
1348 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1349         assert(f);
1350         assert(p > 0);
1351
1352         if (p == needle)
1353                 return TEST_FOUND;
1354         else if (p < needle)
1355                 return TEST_LEFT;
1356         else
1357                 return TEST_RIGHT;
1358 }
1359
1360 int journal_file_move_to_entry_by_offset(
1361                 JournalFile *f,
1362                 uint64_t p,
1363                 direction_t direction,
1364                 Object **ret,
1365                 uint64_t *offset) {
1366
1367         return generic_array_bisect(f,
1368                                     le64toh(f->header->entry_array_offset),
1369                                     le64toh(f->header->n_entries),
1370                                     p,
1371                                     test_object_offset,
1372                                     direction,
1373                                     ret, offset, NULL);
1374 }
1375
1376
1377 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1378         Object *o;
1379         int r;
1380
1381         assert(f);
1382         assert(p > 0);
1383
1384         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1385         if (r < 0)
1386                 return r;
1387
1388         if (le64toh(o->entry.seqnum) == needle)
1389                 return TEST_FOUND;
1390         else if (le64toh(o->entry.seqnum) < needle)
1391                 return TEST_LEFT;
1392         else
1393                 return TEST_RIGHT;
1394 }
1395
1396 int journal_file_move_to_entry_by_seqnum(
1397                 JournalFile *f,
1398                 uint64_t seqnum,
1399                 direction_t direction,
1400                 Object **ret,
1401                 uint64_t *offset) {
1402
1403         return generic_array_bisect(f,
1404                                     le64toh(f->header->entry_array_offset),
1405                                     le64toh(f->header->n_entries),
1406                                     seqnum,
1407                                     test_object_seqnum,
1408                                     direction,
1409                                     ret, offset, NULL);
1410 }
1411
1412 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1413         Object *o;
1414         int r;
1415
1416         assert(f);
1417         assert(p > 0);
1418
1419         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1420         if (r < 0)
1421                 return r;
1422
1423         if (le64toh(o->entry.realtime) == needle)
1424                 return TEST_FOUND;
1425         else if (le64toh(o->entry.realtime) < needle)
1426                 return TEST_LEFT;
1427         else
1428                 return TEST_RIGHT;
1429 }
1430
1431 int journal_file_move_to_entry_by_realtime(
1432                 JournalFile *f,
1433                 uint64_t realtime,
1434                 direction_t direction,
1435                 Object **ret,
1436                 uint64_t *offset) {
1437
1438         return generic_array_bisect(f,
1439                                     le64toh(f->header->entry_array_offset),
1440                                     le64toh(f->header->n_entries),
1441                                     realtime,
1442                                     test_object_realtime,
1443                                     direction,
1444                                     ret, offset, NULL);
1445 }
1446
1447 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1448         Object *o;
1449         int r;
1450
1451         assert(f);
1452         assert(p > 0);
1453
1454         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1455         if (r < 0)
1456                 return r;
1457
1458         if (le64toh(o->entry.monotonic) == needle)
1459                 return TEST_FOUND;
1460         else if (le64toh(o->entry.monotonic) < needle)
1461                 return TEST_LEFT;
1462         else
1463                 return TEST_RIGHT;
1464 }
1465
1466 int journal_file_move_to_entry_by_monotonic(
1467                 JournalFile *f,
1468                 sd_id128_t boot_id,
1469                 uint64_t monotonic,
1470                 direction_t direction,
1471                 Object **ret,
1472                 uint64_t *offset) {
1473
1474         char t[9+32+1] = "_BOOT_ID=";
1475         Object *o;
1476         int r;
1477
1478         assert(f);
1479
1480         sd_id128_to_string(boot_id, t + 9);
1481         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1482         if (r < 0)
1483                 return r;
1484         if (r == 0)
1485                 return -ENOENT;
1486
1487         return generic_array_bisect_plus_one(f,
1488                                              le64toh(o->data.entry_offset),
1489                                              le64toh(o->data.entry_array_offset),
1490                                              le64toh(o->data.n_entries),
1491                                              monotonic,
1492                                              test_object_monotonic,
1493                                              direction,
1494                                              ret, offset, NULL);
1495 }
1496
1497 int journal_file_next_entry(
1498                 JournalFile *f,
1499                 Object *o, uint64_t p,
1500                 direction_t direction,
1501                 Object **ret, uint64_t *offset) {
1502
1503         uint64_t i, n;
1504         int r;
1505
1506         assert(f);
1507         assert(p > 0 || !o);
1508
1509         n = le64toh(f->header->n_entries);
1510         if (n <= 0)
1511                 return 0;
1512
1513         if (!o)
1514                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1515         else {
1516                 if (o->object.type != OBJECT_ENTRY)
1517                         return -EINVAL;
1518
1519                 r = generic_array_bisect(f,
1520                                          le64toh(f->header->entry_array_offset),
1521                                          le64toh(f->header->n_entries),
1522                                          p,
1523                                          test_object_offset,
1524                                          DIRECTION_DOWN,
1525                                          NULL, NULL,
1526                                          &i);
1527                 if (r <= 0)
1528                         return r;
1529
1530                 if (direction == DIRECTION_DOWN) {
1531                         if (i >= n - 1)
1532                                 return 0;
1533
1534                         i++;
1535                 } else {
1536                         if (i <= 0)
1537                                 return 0;
1538
1539                         i--;
1540                 }
1541         }
1542
1543         /* And jump to it */
1544         return generic_array_get(f,
1545                                  le64toh(f->header->entry_array_offset),
1546                                  i,
1547                                  ret, offset);
1548 }
1549
1550 int journal_file_skip_entry(
1551                 JournalFile *f,
1552                 Object *o, uint64_t p,
1553                 int64_t skip,
1554                 Object **ret, uint64_t *offset) {
1555
1556         uint64_t i, n;
1557         int r;
1558
1559         assert(f);
1560         assert(o);
1561         assert(p > 0);
1562
1563         if (o->object.type != OBJECT_ENTRY)
1564                 return -EINVAL;
1565
1566         r = generic_array_bisect(f,
1567                                  le64toh(f->header->entry_array_offset),
1568                                  le64toh(f->header->n_entries),
1569                                  p,
1570                                  test_object_offset,
1571                                  DIRECTION_DOWN,
1572                                  NULL, NULL,
1573                                  &i);
1574         if (r <= 0)
1575                 return r;
1576
1577         /* Calculate new index */
1578         if (skip < 0) {
1579                 if ((uint64_t) -skip >= i)
1580                         i = 0;
1581                 else
1582                         i = i - (uint64_t) -skip;
1583         } else
1584                 i  += (uint64_t) skip;
1585
1586         n = le64toh(f->header->n_entries);
1587         if (n <= 0)
1588                 return -EBADMSG;
1589
1590         if (i >= n)
1591                 i = n-1;
1592
1593         return generic_array_get(f,
1594                                  le64toh(f->header->entry_array_offset),
1595                                  i,
1596                                  ret, offset);
1597 }
1598
1599 int journal_file_next_entry_for_data(
1600                 JournalFile *f,
1601                 Object *o, uint64_t p,
1602                 uint64_t data_offset,
1603                 direction_t direction,
1604                 Object **ret, uint64_t *offset) {
1605
1606         uint64_t n, i;
1607         int r;
1608         Object *d;
1609
1610         assert(f);
1611         assert(p > 0 || !o);
1612
1613         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1614         if (r < 0)
1615                 return r;
1616
1617         n = le64toh(d->data.n_entries);
1618         if (n <= 0)
1619                 return n;
1620
1621         if (!o)
1622                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1623         else {
1624                 if (o->object.type != OBJECT_ENTRY)
1625                         return -EINVAL;
1626
1627                 r = generic_array_bisect_plus_one(f,
1628                                                   le64toh(d->data.entry_offset),
1629                                                   le64toh(d->data.entry_array_offset),
1630                                                   le64toh(d->data.n_entries),
1631                                                   p,
1632                                                   test_object_offset,
1633                                                   DIRECTION_DOWN,
1634                                                   NULL, NULL,
1635                                                   &i);
1636
1637                 if (r <= 0)
1638                         return r;
1639
1640                 if (direction == DIRECTION_DOWN) {
1641                         if (i >= n - 1)
1642                                 return 0;
1643
1644                         i++;
1645                 } else {
1646                         if (i <= 0)
1647                                 return 0;
1648
1649                         i--;
1650                 }
1651
1652         }
1653
1654         return generic_array_get_plus_one(f,
1655                                           le64toh(d->data.entry_offset),
1656                                           le64toh(d->data.entry_array_offset),
1657                                           i,
1658                                           ret, offset);
1659 }
1660
1661 int journal_file_move_to_entry_by_offset_for_data(
1662                 JournalFile *f,
1663                 uint64_t data_offset,
1664                 uint64_t p,
1665                 direction_t direction,
1666                 Object **ret, uint64_t *offset) {
1667
1668         int r;
1669         Object *d;
1670
1671         assert(f);
1672
1673         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1674         if (r < 0)
1675                 return r;
1676
1677         return generic_array_bisect_plus_one(f,
1678                                              le64toh(d->data.entry_offset),
1679                                              le64toh(d->data.entry_array_offset),
1680                                              le64toh(d->data.n_entries),
1681                                              p,
1682                                              test_object_offset,
1683                                              direction,
1684                                              ret, offset, NULL);
1685 }
1686
1687 int journal_file_move_to_entry_by_monotonic_for_data(
1688                 JournalFile *f,
1689                 uint64_t data_offset,
1690                 sd_id128_t boot_id,
1691                 uint64_t monotonic,
1692                 direction_t direction,
1693                 Object **ret, uint64_t *offset) {
1694
1695         char t[9+32+1] = "_BOOT_ID=";
1696         Object *o, *d;
1697         int r;
1698         uint64_t b, z;
1699
1700         assert(f);
1701
1702         /* First, seek by time */
1703         sd_id128_to_string(boot_id, t + 9);
1704         r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
1705         if (r < 0)
1706                 return r;
1707         if (r == 0)
1708                 return -ENOENT;
1709
1710         r = generic_array_bisect_plus_one(f,
1711                                           le64toh(o->data.entry_offset),
1712                                           le64toh(o->data.entry_array_offset),
1713                                           le64toh(o->data.n_entries),
1714                                           monotonic,
1715                                           test_object_monotonic,
1716                                           direction,
1717                                           NULL, &z, NULL);
1718         if (r <= 0)
1719                 return r;
1720
1721         /* And now, continue seeking until we find an entry that
1722          * exists in both bisection arrays */
1723
1724         for (;;) {
1725                 Object *qo;
1726                 uint64_t p, q;
1727
1728                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1729                 if (r < 0)
1730                         return r;
1731
1732                 r = generic_array_bisect_plus_one(f,
1733                                                   le64toh(d->data.entry_offset),
1734                                                   le64toh(d->data.entry_array_offset),
1735                                                   le64toh(d->data.n_entries),
1736                                                   z,
1737                                                   test_object_offset,
1738                                                   direction,
1739                                                   NULL, &p, NULL);
1740                 if (r <= 0)
1741                         return r;
1742
1743                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
1744                 if (r < 0)
1745                         return r;
1746
1747                 r = generic_array_bisect_plus_one(f,
1748                                                   le64toh(o->data.entry_offset),
1749                                                   le64toh(o->data.entry_array_offset),
1750                                                   le64toh(o->data.n_entries),
1751                                                   p,
1752                                                   test_object_offset,
1753                                                   direction,
1754                                                   &qo, &q, NULL);
1755
1756                 if (r <= 0)
1757                         return r;
1758
1759                 if (p == q) {
1760                         if (ret)
1761                                 *ret = qo;
1762                         if (offset)
1763                                 *offset = q;
1764
1765                         return 1;
1766                 }
1767
1768                 z = q;
1769         }
1770
1771         return 0;
1772 }
1773
1774 int journal_file_move_to_entry_by_seqnum_for_data(
1775                 JournalFile *f,
1776                 uint64_t data_offset,
1777                 uint64_t seqnum,
1778                 direction_t direction,
1779                 Object **ret, uint64_t *offset) {
1780
1781         Object *d;
1782         int r;
1783
1784         assert(f);
1785
1786         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1787         if (r < 0)
1788                 return r;
1789
1790         return generic_array_bisect_plus_one(f,
1791                                              le64toh(d->data.entry_offset),
1792                                              le64toh(d->data.entry_array_offset),
1793                                              le64toh(d->data.n_entries),
1794                                              seqnum,
1795                                              test_object_seqnum,
1796                                              direction,
1797                                              ret, offset, NULL);
1798 }
1799
1800 int journal_file_move_to_entry_by_realtime_for_data(
1801                 JournalFile *f,
1802                 uint64_t data_offset,
1803                 uint64_t realtime,
1804                 direction_t direction,
1805                 Object **ret, uint64_t *offset) {
1806
1807         Object *d;
1808         int r;
1809
1810         assert(f);
1811
1812         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1813         if (r < 0)
1814                 return r;
1815
1816         return generic_array_bisect_plus_one(f,
1817                                              le64toh(d->data.entry_offset),
1818                                              le64toh(d->data.entry_array_offset),
1819                                              le64toh(d->data.n_entries),
1820                                              realtime,
1821                                              test_object_realtime,
1822                                              direction,
1823                                              ret, offset, NULL);
1824 }
1825
1826 static void *fsprg_state(JournalFile *f) {
1827         uint64_t a, b;
1828         assert(f);
1829
1830         if (!f->authenticate)
1831                 return NULL;
1832
1833         a = le64toh(f->fsprg_header->header_size);
1834         b = le64toh(f->fsprg_header->state_size);
1835
1836         if (a + b > f->fsprg_size)
1837                 return NULL;
1838
1839         return (uint8_t*) f->fsprg_header + a;
1840 }
1841
1842 static uint64_t journal_file_tag_seqnum(JournalFile *f) {
1843         uint64_t r;
1844
1845         assert(f);
1846
1847         r = le64toh(f->header->n_tags) + 1;
1848         f->header->n_tags = htole64(r);
1849
1850         return r;
1851 }
1852
1853 int journal_file_append_tag(JournalFile *f) {
1854         Object *o;
1855         uint64_t p;
1856         int r;
1857
1858         assert(f);
1859
1860         if (!f->authenticate)
1861                 return 0;
1862
1863         if (!f->hmac_running)
1864                 return 0;
1865
1866         log_debug("Writing tag for epoch %llu\n", (unsigned long long) FSPRG_GetEpoch(fsprg_state(f)));
1867
1868         assert(f->hmac);
1869
1870         r = journal_file_append_object(f, OBJECT_TAG, sizeof(struct TagObject), &o, &p);
1871         if (r < 0)
1872                 return r;
1873
1874         o->tag.seqnum = htole64(journal_file_tag_seqnum(f));
1875
1876         /* Add the tag object itself, so that we can protect its
1877          * header. This will exclude the actual hash value in it */
1878         r = journal_file_hmac_put_object(f, OBJECT_TAG, p);
1879         if (r < 0)
1880                 return r;
1881
1882         /* Get the HMAC tag and store it in the object */
1883         memcpy(o->tag.tag, gcry_md_read(f->hmac, 0), TAG_LENGTH);
1884         f->hmac_running = false;
1885
1886         return 0;
1887 }
1888
1889 static int journal_file_hmac_start(JournalFile *f) {
1890         uint8_t key[256 / 8]; /* Let's pass 256 bit from FSPRG to HMAC */
1891
1892         assert(f);
1893
1894         if (!f->authenticate)
1895                 return 0;
1896
1897         if (f->hmac_running)
1898                 return 0;
1899
1900         /* Prepare HMAC for next cycle */
1901         gcry_md_reset(f->hmac);
1902         FSPRG_GetKey(fsprg_state(f), key, sizeof(key), 0);
1903         gcry_md_setkey(f->hmac, key, sizeof(key));
1904
1905         f->hmac_running = true;
1906
1907         return 0;
1908 }
1909
1910 static int journal_file_get_epoch(JournalFile *f, uint64_t realtime, uint64_t *epoch) {
1911         uint64_t t;
1912
1913         assert(f);
1914         assert(epoch);
1915         assert(f->authenticate);
1916
1917         if (le64toh(f->fsprg_header->fsprg_start_usec) == 0 ||
1918             le64toh(f->fsprg_header->fsprg_interval_usec) == 0)
1919                 return -ENOTSUP;
1920
1921         if (realtime < le64toh(f->fsprg_header->fsprg_start_usec))
1922                 return -ESTALE;
1923
1924         t = realtime - le64toh(f->fsprg_header->fsprg_start_usec);
1925         t = t / le64toh(f->fsprg_header->fsprg_interval_usec);
1926
1927         *epoch = t;
1928         return 0;
1929 }
1930
1931 static int journal_file_need_evolve(JournalFile *f, uint64_t realtime) {
1932         uint64_t goal, epoch;
1933         int r;
1934         assert(f);
1935
1936         if (!f->authenticate)
1937                 return 0;
1938
1939         r = journal_file_get_epoch(f, realtime, &goal);
1940         if (r < 0)
1941                 return r;
1942
1943         epoch = FSPRG_GetEpoch(fsprg_state(f));
1944         if (epoch > goal)
1945                 return -ESTALE;
1946
1947         return epoch != goal;
1948 }
1949
1950 static int journal_file_evolve(JournalFile *f, uint64_t realtime) {
1951         uint64_t goal, epoch;
1952         int r;
1953
1954         assert(f);
1955
1956         if (!f->authenticate)
1957                 return 0;
1958
1959         r = journal_file_get_epoch(f, realtime, &goal);
1960         if (r < 0)
1961                 return r;
1962
1963         epoch = FSPRG_GetEpoch(fsprg_state(f));
1964         if (epoch < goal)
1965                 log_debug("Evolving FSPRG key from epoch %llu to %llu.", (unsigned long long) epoch, (unsigned long long) goal);
1966
1967         for (;;) {
1968                 if (epoch > goal)
1969                         return -ESTALE;
1970                 if (epoch == goal)
1971                         return 0;
1972
1973                 FSPRG_Evolve(fsprg_state(f));
1974                 epoch = FSPRG_GetEpoch(fsprg_state(f));
1975         }
1976 }
1977
1978 static int journal_file_maybe_append_tag(JournalFile *f, uint64_t realtime) {
1979         int r;
1980
1981         assert(f);
1982
1983         if (!f->authenticate)
1984                 return 0;
1985
1986         r = journal_file_need_evolve(f, realtime);
1987         if (r <= 0)
1988                 return 0;
1989
1990         r = journal_file_append_tag(f);
1991         if (r < 0)
1992                 return r;
1993
1994         r = journal_file_evolve(f, realtime);
1995         if (r < 0)
1996                 return r;
1997
1998         r = journal_file_hmac_start(f);
1999         if (r < 0)
2000                 return r;
2001
2002         return 0;
2003 }
2004
2005 static int journal_file_hmac_put_object(JournalFile *f, int type, uint64_t p) {
2006         int r;
2007         Object *o;
2008
2009         assert(f);
2010
2011         if (!f->authenticate)
2012                 return 0;
2013
2014         r = journal_file_hmac_start(f);
2015         if (r < 0)
2016                 return r;
2017
2018         r = journal_file_move_to_object(f, type, p, &o);
2019         if (r < 0)
2020                 return r;
2021
2022         gcry_md_write(f->hmac, o, offsetof(ObjectHeader, payload));
2023
2024         switch (o->object.type) {
2025
2026         case OBJECT_DATA:
2027                 /* All but: hash and payload are mutable */
2028                 gcry_md_write(f->hmac, &o->data.hash, sizeof(o->data.hash));
2029                 gcry_md_write(f->hmac, o->data.payload, le64toh(o->object.size) - offsetof(DataObject, payload));
2030                 break;
2031
2032         case OBJECT_ENTRY:
2033                 /* All */
2034                 gcry_md_write(f->hmac, &o->entry.seqnum, le64toh(o->object.size) - offsetof(EntryObject, seqnum));
2035                 break;
2036
2037         case OBJECT_FIELD_HASH_TABLE:
2038         case OBJECT_DATA_HASH_TABLE:
2039         case OBJECT_ENTRY_ARRAY:
2040                 /* Nothing: everything is mutable */
2041                 break;
2042
2043         case OBJECT_TAG:
2044                 /* All but the tag itself */
2045                 gcry_md_write(f->hmac, &o->tag.seqnum, sizeof(o->tag.seqnum));
2046                 break;
2047         default:
2048                 return -EINVAL;
2049         }
2050
2051         return 0;
2052 }
2053
2054 static int journal_file_hmac_put_header(JournalFile *f) {
2055         int r;
2056
2057         assert(f);
2058
2059         if (!f->authenticate)
2060                 return 0;
2061
2062         r = journal_file_hmac_start(f);
2063         if (r < 0)
2064                 return r;
2065
2066         /* All but state+reserved, boot_id, arena_size,
2067          * tail_object_offset, n_objects, n_entries, tail_seqnum,
2068          * head_entry_realtime, tail_entry_realtime,
2069          * tail_entry_monotonic, n_data, n_fields, header_tag */
2070
2071         gcry_md_write(f->hmac, f->header->signature, offsetof(Header, state) - offsetof(Header, signature));
2072         gcry_md_write(f->hmac, &f->header->file_id, offsetof(Header, boot_id) - offsetof(Header, file_id));
2073         gcry_md_write(f->hmac, &f->header->seqnum_id, offsetof(Header, arena_size) - offsetof(Header, seqnum_id));
2074         gcry_md_write(f->hmac, &f->header->data_hash_table_offset, offsetof(Header, tail_object_offset) - offsetof(Header, data_hash_table_offset));
2075         gcry_md_write(f->hmac, &f->header->head_entry_seqnum, offsetof(Header, head_entry_realtime) - offsetof(Header, head_entry_seqnum));
2076
2077         return 0;
2078 }
2079
2080 static int journal_file_load_fsprg(JournalFile *f) {
2081         int r, fd = -1;
2082         char *p = NULL;
2083         struct stat st;
2084         FSPRGHeader *m = NULL;
2085         sd_id128_t machine;
2086
2087         assert(f);
2088
2089         if (!f->authenticate)
2090                 return 0;
2091
2092         r = sd_id128_get_machine(&machine);
2093         if (r < 0)
2094                 return r;
2095
2096         if (asprintf(&p, "/var/log/journal/" SD_ID128_FORMAT_STR "/fsprg",
2097                      SD_ID128_FORMAT_VAL(machine)) < 0)
2098                 return -ENOMEM;
2099
2100         fd = open(p, O_RDWR|O_CLOEXEC|O_NOCTTY, 0600);
2101         if (fd < 0) {
2102                 log_error("Failed to open %s: %m", p);
2103                 r = -errno;
2104                 goto finish;
2105         }
2106
2107         if (fstat(fd, &st) < 0) {
2108                 r = -errno;
2109                 goto finish;
2110         }
2111
2112         if (st.st_size < (off_t) sizeof(FSPRGHeader)) {
2113                 r = -ENODATA;
2114                 goto finish;
2115         }
2116
2117         m = mmap(NULL, PAGE_ALIGN(sizeof(FSPRGHeader)), PROT_READ, MAP_SHARED, fd, 0);
2118         if (m == MAP_FAILED) {
2119                 m = NULL;
2120                 r = -errno;
2121                 goto finish;
2122         }
2123
2124         if (memcmp(m->signature, FSPRG_HEADER_SIGNATURE, 8) != 0) {
2125                 r = -EBADMSG;
2126                 goto finish;
2127         }
2128
2129         if (m->incompatible_flags != 0) {
2130                 r = -EPROTONOSUPPORT;
2131                 goto finish;
2132         }
2133
2134         if (le64toh(m->header_size) < sizeof(FSPRGHeader)) {
2135                 r = -EBADMSG;
2136                 goto finish;
2137         }
2138
2139         if (le64toh(m->state_size) != FSPRG_stateinbytes(m->secpar)) {
2140                 r = -EBADMSG;
2141                 goto finish;
2142         }
2143
2144         f->fsprg_size = le64toh(m->header_size) + le64toh(m->state_size);
2145         if ((uint64_t) st.st_size < f->fsprg_size) {
2146                 r = -ENODATA;
2147                 goto finish;
2148         }
2149
2150         if (!sd_id128_equal(machine, m->machine_id)) {
2151                 r = -EHOSTDOWN;
2152                 goto finish;
2153         }
2154
2155         if (le64toh(m->fsprg_start_usec) <= 0 ||
2156             le64toh(m->fsprg_interval_usec) <= 0) {
2157                 r = -EBADMSG;
2158                 goto finish;
2159         }
2160
2161         f->fsprg_header = mmap(NULL, PAGE_ALIGN(f->fsprg_size), PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
2162         if (f->fsprg_header == MAP_FAILED) {
2163                 f->fsprg_header = NULL;
2164                 r = -errno;
2165                 goto finish;
2166         }
2167
2168         r = 0;
2169
2170 finish:
2171         if (m)
2172                 munmap(m, PAGE_ALIGN(sizeof(FSPRGHeader)));
2173
2174         if (fd >= 0)
2175                 close_nointr_nofail(fd);
2176
2177         free(p);
2178         return r;
2179 }
2180
2181 static int journal_file_setup_hmac(JournalFile *f) {
2182         gcry_error_t e;
2183
2184         if (!f->authenticate)
2185                 return 0;
2186
2187         e = gcry_md_open(&f->hmac, GCRY_MD_SHA256, GCRY_MD_FLAG_HMAC);
2188         if (e != 0)
2189                 return -ENOTSUP;
2190
2191         return 0;
2192 }
2193
2194 static int journal_file_append_first_tag(JournalFile *f) {
2195         int r;
2196         uint64_t p;
2197
2198         if (!f->authenticate)
2199                 return 0;
2200
2201         log_debug("Calculating first tag...");
2202
2203         r = journal_file_hmac_put_header(f);
2204         if (r < 0)
2205                 return r;
2206
2207         p = le64toh(f->header->field_hash_table_offset);
2208         if (p < offsetof(Object, hash_table.items))
2209                 return -EINVAL;
2210         p -= offsetof(Object, hash_table.items);
2211
2212         r = journal_file_hmac_put_object(f, OBJECT_FIELD_HASH_TABLE, p);
2213         if (r < 0)
2214                 return r;
2215
2216         p = le64toh(f->header->data_hash_table_offset);
2217         if (p < offsetof(Object, hash_table.items))
2218                 return -EINVAL;
2219         p -= offsetof(Object, hash_table.items);
2220
2221         r = journal_file_hmac_put_object(f, OBJECT_DATA_HASH_TABLE, p);
2222         if (r < 0)
2223                 return r;
2224
2225         r = journal_file_append_tag(f);
2226         if (r < 0)
2227                 return r;
2228
2229         return 0;
2230 }
2231
2232 static int journal_file_object_verify(JournalFile *f, Object *o) {
2233         assert(f);
2234         assert(o);
2235
2236         /* This does various superficial tests about the length an
2237          * possible field values. It does not follow any references to
2238          * other objects. */
2239
2240         switch (o->object.type) {
2241         case OBJECT_DATA:
2242                 if (le64toh(o->data.entry_offset) <= 0 ||
2243                     le64toh(o->data.n_entries) <= 0)
2244                         return -EBADMSG;
2245
2246                 if (le64toh(o->object.size) - offsetof(DataObject, payload) <= 0)
2247                         return -EBADMSG;
2248                 break;
2249
2250         case OBJECT_FIELD:
2251                 if (le64toh(o->object.size) - offsetof(FieldObject, payload) <= 0)
2252                         return -EBADMSG;
2253                 break;
2254
2255         case OBJECT_ENTRY:
2256                 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) % sizeof(EntryItem) != 0)
2257                         return -EBADMSG;
2258
2259                 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem) <= 0)
2260                         return -EBADMSG;
2261
2262                 if (le64toh(o->entry.seqnum) <= 0 ||
2263                     le64toh(o->entry.realtime) <= 0)
2264                         return -EBADMSG;
2265
2266                 break;
2267
2268         case OBJECT_DATA_HASH_TABLE:
2269         case OBJECT_FIELD_HASH_TABLE:
2270                 if ((le64toh(o->object.size) - offsetof(HashTableObject, items)) % sizeof(HashItem) != 0)
2271                         return -EBADMSG;
2272
2273                 break;
2274
2275         case OBJECT_ENTRY_ARRAY:
2276                 if ((le64toh(o->object.size) - offsetof(EntryArrayObject, items)) % sizeof(le64_t) != 0)
2277                         return -EBADMSG;
2278
2279                 break;
2280
2281         case OBJECT_TAG:
2282                 if (le64toh(o->object.size) != sizeof(TagObject))
2283                         return -EBADMSG;
2284                 break;
2285         }
2286
2287         return 0;
2288 }
2289
2290 static void draw_progress(uint64_t p, usec_t *last_usec) {
2291         unsigned n, i, j, k;
2292         usec_t z, x;
2293
2294         if (!isatty(STDOUT_FILENO))
2295                 return;
2296
2297         z = now(CLOCK_MONOTONIC);
2298         x = *last_usec;
2299
2300         if (x != 0 && x + 40 * USEC_PER_MSEC > z)
2301                 return;
2302
2303         *last_usec = z;
2304
2305         n = (3 * columns()) / 4;
2306         j = (n * (unsigned) p) / 65535ULL;
2307         k = n - j;
2308
2309         fputs("\r\x1B[?25l", stdout);
2310
2311         for (i = 0; i < j; i++)
2312                 fputs("\xe2\x96\x88", stdout);
2313
2314         for (i = 0; i < k; i++)
2315                 fputs("\xe2\x96\x91", stdout);
2316
2317         printf(" %3lu%%", 100LU * (unsigned long) p / 65535LU);
2318
2319         fputs("\r\x1B[?25h", stdout);
2320         fflush(stdout);
2321 }
2322
/* Erases the progress bar drawn by draw_progress() by overwriting
 * the line with spaces, if stdout is a terminal. */
static void flush_progress(void) {
        unsigned width, i;

        if (!isatty(STDOUT_FILENO))
                return;

        /* Bar width (3/4 of the terminal) plus 5 characters for the
         * percentage printed after it */
        width = (3 * columns()) / 4 + 5;

        putchar('\r');

        for (i = 0; i < width; i++)
                putchar(' ');

        putchar('\r');
        fflush(stdout);
}
2339
/* Writes the 64 bit value p in host byte order to fd. Returns 0 on
 * success, -errno if write() fails, and -EIO on a short write. */
static int write_uint64(int fd, uint64_t p) {
        ssize_t written;

        written = write(fd, &p, sizeof(p));
        if (written < 0)
                return -errno;

        /* Anything but the full 8 bytes counts as an I/O error */
        return (size_t) written == sizeof(p) ? 0 : -EIO;
}
2351
2352 static int contains_uint64(MMapCache *m, int fd, uint64_t n, uint64_t p) {
2353         uint64_t a, b;
2354         int r;
2355
2356         assert(m);
2357         assert(fd >= 0);
2358
2359         /* Bisection ... */
2360
2361         a = 0; b = n;
2362         while (a < b) {
2363                 uint64_t c, *z;
2364
2365                 c = (a + b) / 2;
2366
2367                 r = mmap_cache_get(m, fd, PROT_READ, 0, c * sizeof(uint64_t), sizeof(uint64_t), (void **) &z);
2368                 if (r < 0)
2369                         return r;
2370
2371                 if (*z == p)
2372                         return 1;
2373
2374                 if (p < *z)
2375                         b = c;
2376                 else
2377                         a = c;
2378         }
2379
2380         return 0;
2381 }
2382
2383 int journal_file_verify(JournalFile *f, const char *key) {
2384         int r;
2385         Object *o;
2386         uint64_t p = 0;
2387         uint64_t tag_seqnum = 0, entry_seqnum = 0, entry_monotonic = 0, entry_realtime = 0;
2388         sd_id128_t entry_boot_id;
2389         bool entry_seqnum_set = false, entry_monotonic_set = false, entry_realtime_set = false, found_main_entry_array = false;
2390         uint64_t n_weird = 0, n_objects = 0, n_entries = 0, n_data = 0, n_fields = 0, n_data_hash_tables = 0, n_field_hash_tables = 0, n_entry_arrays = 0;
2391         usec_t last_usec = 0;
2392         int data_fd = -1, entry_fd = -1, entry_array_fd = -1;
2393         char data_path[] = "/var/tmp/journal-data-XXXXXX",
2394                 entry_path[] = "/var/tmp/journal-entry-XXXXXX",
2395                 entry_array_path[] = "/var/tmp/journal-entry-array-XXXXXX";
2396
2397         assert(f);
2398
2399         data_fd = mkostemp(data_path, O_CLOEXEC);
2400         if (data_fd < 0) {
2401                 log_error("Failed to create data file: %m");
2402                 goto fail;
2403         }
2404         unlink(data_path);
2405
2406         entry_fd = mkostemp(entry_path, O_CLOEXEC);
2407         if (entry_fd < 0) {
2408                 log_error("Failed to create entry file: %m");
2409                 goto fail;
2410         }
2411         unlink(entry_path);
2412
2413         entry_array_fd = mkostemp(entry_array_path, O_CLOEXEC);
2414         if (entry_array_fd < 0) {
2415                 log_error("Failed to create entry array file: %m");
2416                 goto fail;
2417         }
2418         unlink(entry_array_path);
2419
2420         /* First iteration: we go through all objects, verify the
2421          * superficial structure, headers, hashes. */
2422
2423         r = journal_file_hmac_put_header(f);
2424         if (r < 0) {
2425                 log_error("Failed to calculate HMAC of header.");
2426                 goto fail;
2427         }
2428
2429         p = le64toh(f->header->header_size);
2430         while (p != 0) {
2431                 draw_progress((0x7FFF * p) / le64toh(f->header->tail_object_offset), &last_usec);
2432
2433                 r = journal_file_move_to_object(f, -1, p, &o);
2434                 if (r < 0) {
2435                         log_error("Invalid object at %llu", (unsigned long long) p);
2436                         goto fail;
2437                 }
2438
2439                 if (le64toh(f->header->tail_object_offset) < p) {
2440                         log_error("Invalid tail object pointer.");
2441                         r = -EBADMSG;
2442                         goto fail;
2443                 }
2444
2445                 n_objects ++;
2446
2447                 r = journal_file_object_verify(f, o);
2448                 if (r < 0) {
2449                         log_error("Invalid object contents at %llu", (unsigned long long) p);
2450                         goto fail;
2451                 }
2452
2453                 r = journal_file_hmac_put_object(f, -1, p);
2454                 if (r < 0) {
2455                         log_error("Failed to calculate HMAC at %llu", (unsigned long long) p);
2456                         goto fail;
2457                 }
2458
2459                 if (o->object.flags & OBJECT_COMPRESSED &&
2460                     !(le32toh(f->header->incompatible_flags) & HEADER_INCOMPATIBLE_COMPRESSED)) {
2461                         log_error("Compressed object without compression at %llu", (unsigned long long) p);
2462                         r = -EBADMSG;
2463                         goto fail;
2464                 }
2465
2466                 if (o->object.flags & OBJECT_COMPRESSED &&
2467                     o->object.type != OBJECT_DATA) {
2468                         log_error("Compressed non-data object at %llu", (unsigned long long) p);
2469                         r = -EBADMSG;
2470                         goto fail;
2471                 }
2472
2473                 if (o->object.type == OBJECT_TAG) {
2474
2475                         if (!(le32toh(f->header->compatible_flags) & HEADER_COMPATIBLE_AUTHENTICATED)) {
2476                                 log_error("Tag object without authentication at %llu", (unsigned long long) p);
2477                                 r = -EBADMSG;
2478                                 goto fail;
2479                         }
2480
2481                         if (le64toh(o->tag.seqnum) != tag_seqnum) {
2482                                 log_error("Tag sequence number out of synchronization at %llu", (unsigned long long) p);
2483                                 r = -EBADMSG;
2484                                 goto fail;
2485                         }
2486
2487                 } else if (o->object.type == OBJECT_ENTRY) {
2488
2489                         r = write_uint64(entry_fd, p);
2490                         if (r < 0)
2491                                 goto fail;
2492
2493                         if (!entry_seqnum_set &&
2494                             le64toh(o->entry.seqnum) != le64toh(f->header->head_entry_seqnum)) {
2495                                 log_error("Head entry sequence number incorrect");
2496                                 r = -EBADMSG;
2497                                 goto fail;
2498                         }
2499
2500                         if (entry_seqnum_set &&
2501                             entry_seqnum >= le64toh(o->entry.seqnum)) {
2502                                 log_error("Entry sequence number out of synchronization at %llu", (unsigned long long) p);
2503                                 r = -EBADMSG;
2504                                 goto fail;
2505                         }
2506
2507                         entry_seqnum = le64toh(o->entry.seqnum);
2508                         entry_seqnum_set = true;
2509
2510                         if (entry_monotonic_set &&
2511                             sd_id128_equal(entry_boot_id, o->entry.boot_id) &&
2512                             entry_monotonic > le64toh(o->entry.monotonic)) {
2513                                 log_error("Entry timestamp out of synchronization at %llu", (unsigned long long) p);
2514                                 r = -EBADMSG;
2515                                 goto fail;
2516                         }
2517
2518                         entry_monotonic = le64toh(o->entry.monotonic);
2519                         entry_boot_id = o->entry.boot_id;
2520                         entry_monotonic_set = true;
2521
2522                         if (!entry_realtime_set &&
2523                             le64toh(o->entry.realtime) != le64toh(f->header->head_entry_realtime)) {
2524                                 log_error("Head entry realtime timestamp incorrect");
2525                                 r = -EBADMSG;
2526                                 goto fail;
2527                         }
2528
2529                         entry_realtime = le64toh(o->entry.realtime);
2530                         entry_realtime_set = true;
2531
2532                         n_entries ++;
2533                 } else if (o->object.type == OBJECT_ENTRY_ARRAY) {
2534
2535                         r = write_uint64(entry_array_fd, p);
2536                         if (r < 0)
2537                                 goto fail;
2538
2539                         if (p == le64toh(f->header->entry_array_offset)) {
2540                                 if (found_main_entry_array) {
2541                                         log_error("More than one main entry array at %llu", (unsigned long long) p);
2542                                         r = -EBADMSG;
2543                                         goto fail;
2544                                 }
2545
2546                                 found_main_entry_array = true;
2547                         }
2548
2549                         n_entry_arrays++;
2550
2551                 } else if (o->object.type == OBJECT_DATA) {
2552
2553                         r = write_uint64(data_fd, p);
2554                         if (r < 0)
2555                                 goto fail;
2556
2557                         n_data++;
2558
2559                 } else if (o->object.type == OBJECT_FIELD)
2560                         n_fields++;
2561                 else if (o->object.type == OBJECT_DATA_HASH_TABLE) {
2562                         n_data_hash_tables++;
2563
2564                         if (n_data_hash_tables > 1) {
2565                                 log_error("More than one data hash table at %llu", (unsigned long long) p);
2566                                 r = -EBADMSG;
2567                                 goto fail;
2568                         }
2569
2570                         if (le64toh(f->header->data_hash_table_offset) != p + offsetof(HashTableObject, items) ||
2571                             le64toh(f->header->data_hash_table_size) != le64toh(o->object.size) - offsetof(HashTableObject, items)) {
2572                                 log_error("Header fields for data hash table invalid.");
2573                                 r = -EBADMSG;
2574                                 goto fail;
2575                         }
2576                 } else if (o->object.type == OBJECT_FIELD_HASH_TABLE) {
2577                         n_field_hash_tables++;
2578
2579                         if (n_field_hash_tables > 1) {
2580                                 log_error("More than one field hash table at %llu", (unsigned long long) p);
2581                                 r = -EBADMSG;
2582                                 goto fail;
2583                         }
2584
2585                         if (le64toh(f->header->field_hash_table_offset) != p + offsetof(HashTableObject, items) ||
2586                             le64toh(f->header->field_hash_table_size) != le64toh(o->object.size) - offsetof(HashTableObject, items)) {
2587                                 log_error("Header fields for field hash table invalid.");
2588                                 r = -EBADMSG;
2589                                 goto fail;
2590                         }
2591                 } else if (o->object.type >= _OBJECT_TYPE_MAX)
2592                         n_weird ++;
2593
2594                 if (p == le64toh(f->header->tail_object_offset))
2595                         p = 0;
2596                 else
2597                         p = p + ALIGN64(le64toh(o->object.size));
2598         }
2599
2600         if (n_objects != le64toh(f->header->n_objects)) {
2601                 log_error("Object number mismatch");
2602                 r = -EBADMSG;
2603                 goto fail;
2604         }
2605
2606         if (n_entries != le64toh(f->header->n_entries)) {
2607                 log_error("Entry number mismatch");
2608                 r = -EBADMSG;
2609                 goto fail;
2610         }
2611
2612         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2613             n_data != le64toh(f->header->n_data)) {
2614                 log_error("Data number mismatch");
2615                 r = -EBADMSG;
2616                 goto fail;
2617         }
2618
2619         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2620             n_fields != le64toh(f->header->n_fields)) {
2621                 log_error("Field number mismatch");
2622                 r = -EBADMSG;
2623                 goto fail;
2624         }
2625
2626         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags) &&
2627             tag_seqnum != le64toh(f->header->n_tags)) {
2628                 log_error("Tag number mismatch");
2629                 r = -EBADMSG;
2630                 goto fail;
2631         }
2632
2633         if (n_data_hash_tables != 1) {
2634                 log_error("Missing data hash table");
2635                 r = -EBADMSG;
2636                 goto fail;
2637         }
2638
2639         if (n_field_hash_tables != 1) {
2640                 log_error("Missing field hash table");
2641                 r = -EBADMSG;
2642                 goto fail;
2643         }
2644
2645         if (!found_main_entry_array) {
2646                 log_error("Missing entry array");
2647                 r = -EBADMSG;
2648                 goto fail;
2649         }
2650
2651         if (entry_seqnum_set &&
2652             entry_seqnum != le64toh(f->header->tail_entry_seqnum)) {
2653                 log_error("Invalid tail seqnum");
2654                 r = -EBADMSG;
2655                 goto fail;
2656         }
2657
2658         if (entry_monotonic_set &&
2659             (!sd_id128_equal(entry_boot_id, f->header->boot_id) ||
2660              entry_monotonic != le64toh(f->header->tail_entry_monotonic))) {
2661                 log_error("Invalid tail monotonic timestamp");
2662                 r = -EBADMSG;
2663                 goto fail;
2664         }
2665
2666         if (entry_realtime_set && entry_realtime != le64toh(f->header->tail_entry_realtime)) {
2667                 log_error("Invalid tail realtime timestamp");
2668                 r = -EBADMSG;
2669                 goto fail;
2670         }
2671
2672         /* Second iteration: we go through all objects again, this
2673          * time verify all pointers. */
2674
2675         p = le64toh(f->header->header_size);
2676         while (p != 0) {
2677                 draw_progress(0x8000 + (0x7FFF * p) / le64toh(f->header->tail_object_offset), &last_usec);
2678
2679                 r = journal_file_move_to_object(f, -1, p, &o);
2680                 if (r < 0) {
2681                         log_error("Invalid object at %llu", (unsigned long long) p);
2682                         goto fail;
2683                 }
2684
2685                 if (o->object.type == OBJECT_ENTRY_ARRAY) {
2686                         uint64_t i = 0, n;
2687
2688                         if (le64toh(o->entry_array.next_entry_array_offset) != 0 &&
2689                             !contains_uint64(f->mmap, entry_array_fd, n_entry_arrays, le64toh(o->entry_array.next_entry_array_offset))) {
2690                                 log_error("Entry array chains up to invalid next array at %llu", (unsigned long long) p);
2691                                 r = -EBADMSG;
2692                                 goto fail;
2693                         }
2694
2695                         n = journal_file_entry_array_n_items(o);
2696                         for (i = 0; i < n; i++) {
2697                                 if (le64toh(o->entry_array.items[i]) != 0 &&
2698                                     !contains_uint64(f->mmap, entry_fd, n_entries, le64toh(o->entry_array.items[i]))) {
2699
2700                                         log_error("Entry array points to invalid next array at %llu", (unsigned long long) p);
2701                                         r = -EBADMSG;
2702                                         goto fail;
2703                                 }
2704                         }
2705
2706                 }
2707
2708                 r = journal_file_move_to_object(f, -1, p, &o);
2709                 if (r < 0) {
2710                         log_error("Invalid object at %llu", (unsigned long long) p);
2711                         goto fail;
2712                 }
2713
2714                 if (p == le64toh(f->header->tail_object_offset))
2715                         p = 0;
2716                 else
2717                         p = p + ALIGN64(le64toh(o->object.size));
2718         }
2719
2720         flush_progress();
2721
2722         mmap_cache_close_fd(f->mmap, data_fd);
2723         mmap_cache_close_fd(f->mmap, entry_fd);
2724         mmap_cache_close_fd(f->mmap, entry_array_fd);
2725
2726         close_nointr_nofail(data_fd);
2727         close_nointr_nofail(entry_fd);
2728         close_nointr_nofail(entry_array_fd);
2729
2730         return 0;
2731
2732 fail:
2733         flush_progress();
2734
2735         log_error("File corruption detected at %s:%llu (of %llu, %llu%%).",
2736                   f->path,
2737                   (unsigned long long) p,
2738                   (unsigned long long) f->last_stat.st_size,
2739                   (unsigned long long) (100 * p / f->last_stat.st_size));
2740
2741         if (data_fd >= 0) {
2742                 mmap_cache_close_fd(f->mmap, data_fd);
2743                 close_nointr_nofail(data_fd);
2744         }
2745
2746         if (entry_fd >= 0) {
2747                 mmap_cache_close_fd(f->mmap, entry_fd);
2748                 close_nointr_nofail(entry_fd);
2749         }
2750
2751         if (entry_array_fd >= 0) {
2752                 mmap_cache_close_fd(f->mmap, entry_array_fd);
2753                 close_nointr_nofail(entry_array_fd);
2754         }
2755
2756         return r;
2757 }
2758
2759 void journal_file_dump(JournalFile *f) {
2760         Object *o;
2761         int r;
2762         uint64_t p;
2763
2764         assert(f);
2765
2766         journal_file_print_header(f);
2767
2768         p = le64toh(f->header->header_size);
2769         while (p != 0) {
2770                 r = journal_file_move_to_object(f, -1, p, &o);
2771                 if (r < 0)
2772                         goto fail;
2773
2774                 switch (o->object.type) {
2775
2776                 case OBJECT_UNUSED:
2777                         printf("Type: OBJECT_UNUSED\n");
2778                         break;
2779
2780                 case OBJECT_DATA:
2781                         printf("Type: OBJECT_DATA\n");
2782                         break;
2783
2784                 case OBJECT_ENTRY:
2785                         printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
2786                                (unsigned long long) le64toh(o->entry.seqnum),
2787                                (unsigned long long) le64toh(o->entry.monotonic),
2788                                (unsigned long long) le64toh(o->entry.realtime));
2789                         break;
2790
2791                 case OBJECT_FIELD_HASH_TABLE:
2792                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2793                         break;
2794
2795                 case OBJECT_DATA_HASH_TABLE:
2796                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2797                         break;
2798
2799                 case OBJECT_ENTRY_ARRAY:
2800                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2801                         break;
2802
2803                 case OBJECT_TAG:
2804                         printf("Type: OBJECT_TAG %llu\n",
2805                                (unsigned long long) le64toh(o->tag.seqnum));
2806                         break;
2807                 }
2808
2809                 if (o->object.flags & OBJECT_COMPRESSED)
2810                         printf("Flags: COMPRESSED\n");
2811
2812                 if (p == le64toh(f->header->tail_object_offset))
2813                         p = 0;
2814                 else
2815                         p = p + ALIGN64(le64toh(o->object.size));
2816         }
2817
2818         return;
2819 fail:
2820         log_error("File corrupt");
2821 }
2822
2823 void journal_file_print_header(JournalFile *f) {
2824         char a[33], b[33], c[33];
2825         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
2826
2827         assert(f);
2828
2829         printf("File Path: %s\n"
2830                "File ID: %s\n"
2831                "Machine ID: %s\n"
2832                "Boot ID: %s\n"
2833                "Sequential Number ID: %s\n"
2834                "State: %s\n"
2835                "Compatible Flags:%s%s\n"
2836                "Incompatible Flags:%s%s\n"
2837                "Header size: %llu\n"
2838                "Arena size: %llu\n"
2839                "Data Hash Table Size: %llu\n"
2840                "Field Hash Table Size: %llu\n"
2841                "Objects: %llu\n"
2842                "Entry Objects: %llu\n"
2843                "Rotate Suggested: %s\n"
2844                "Head Sequential Number: %llu\n"
2845                "Tail Sequential Number: %llu\n"
2846                "Head Realtime Timestamp: %s\n"
2847                "Tail Realtime Timestamp: %s\n",
2848                f->path,
2849                sd_id128_to_string(f->header->file_id, a),
2850                sd_id128_to_string(f->header->machine_id, b),
2851                sd_id128_to_string(f->header->boot_id, c),
2852                sd_id128_to_string(f->header->seqnum_id, c),
2853                f->header->state == STATE_OFFLINE ? "offline" :
2854                f->header->state == STATE_ONLINE ? "online" :
2855                f->header->state == STATE_ARCHIVED ? "archived" : "unknown",
2856                (f->header->compatible_flags & HEADER_COMPATIBLE_AUTHENTICATED) ? " AUTHENTICATED" : "",
2857                (f->header->compatible_flags & ~HEADER_COMPATIBLE_AUTHENTICATED) ? " ???" : "",
2858                (f->header->incompatible_flags & HEADER_INCOMPATIBLE_COMPRESSED) ? " COMPRESSED" : "",
2859                (f->header->incompatible_flags & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2860                (unsigned long long) le64toh(f->header->header_size),
2861                (unsigned long long) le64toh(f->header->arena_size),
2862                (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2863                (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2864                (unsigned long long) le64toh(f->header->n_objects),
2865                (unsigned long long) le64toh(f->header->n_entries),
2866                yes_no(journal_file_rotate_suggested(f)),
2867                (unsigned long long) le64toh(f->header->head_entry_seqnum),
2868                (unsigned long long) le64toh(f->header->tail_entry_seqnum),
2869                format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2870                format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)));
2871
2872         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2873                 printf("Data Objects: %llu\n"
2874                        "Data Hash Table Fill: %.1f%%\n",
2875                        (unsigned long long) le64toh(f->header->n_data),
2876                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2877
2878         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2879                 printf("Field Objects: %llu\n"
2880                        "Field Hash Table Fill: %.1f%%\n",
2881                        (unsigned long long) le64toh(f->header->n_fields),
2882                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2883 }
2884
2885 int journal_file_open(
2886                 const char *fname,
2887                 int flags,
2888                 mode_t mode,
2889                 bool compress,
2890                 bool authenticate,
2891                 JournalMetrics *metrics,
2892                 MMapCache *mmap_cache,
2893                 JournalFile *template,
2894                 JournalFile **ret) {
2895
2896         JournalFile *f;
2897         int r;
2898         bool newly_created = false;
2899
2900         assert(fname);
2901
2902         if ((flags & O_ACCMODE) != O_RDONLY &&
2903             (flags & O_ACCMODE) != O_RDWR)
2904                 return -EINVAL;
2905
2906         if (!endswith(fname, ".journal"))
2907                 return -EINVAL;
2908
2909         f = new0(JournalFile, 1);
2910         if (!f)
2911                 return -ENOMEM;
2912
2913         f->fd = -1;
2914         f->mode = mode;
2915
2916         f->flags = flags;
2917         f->prot = prot_from_flags(flags);
2918         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2919         f->compress = compress;
2920         f->authenticate = authenticate;
2921
2922         if (mmap_cache)
2923                 f->mmap = mmap_cache_ref(mmap_cache);
2924         else {
2925                 /* One context for each type, plus the zeroth catchall
2926                  * context. One fd for the file plus one for each type
2927                  * (which we need during verification */
2928                 f->mmap = mmap_cache_new(_OBJECT_TYPE_MAX, 1 + _OBJECT_TYPE_MAX);
2929                 if (!f->mmap) {
2930                         r = -ENOMEM;
2931                         goto fail;
2932                 }
2933         }
2934
2935         f->path = strdup(fname);
2936         if (!f->path) {
2937                 r = -ENOMEM;
2938                 goto fail;
2939         }
2940
2941         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2942         if (f->fd < 0) {
2943                 r = -errno;
2944                 goto fail;
2945         }
2946
2947         if (fstat(f->fd, &f->last_stat) < 0) {
2948                 r = -errno;
2949                 goto fail;
2950         }
2951
2952         if (f->last_stat.st_size == 0 && f->writable) {
2953                 newly_created = true;
2954
2955                 /* Try to load the FSPRG state, and if we can't, then
2956                  * just don't do authentication */
2957                 r = journal_file_load_fsprg(f);
2958                 if (r < 0)
2959                         f->authenticate = false;
2960
2961                 r = journal_file_init_header(f, template);
2962                 if (r < 0)
2963                         goto fail;
2964
2965                 if (fstat(f->fd, &f->last_stat) < 0) {
2966                         r = -errno;
2967                         goto fail;
2968                 }
2969         }
2970
2971         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2972                 r = -EIO;
2973                 goto fail;
2974         }
2975
2976         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2977         if (f->header == MAP_FAILED) {
2978                 f->header = NULL;
2979                 r = -errno;
2980                 goto fail;
2981         }
2982
2983         if (!newly_created) {
2984                 r = journal_file_verify_header(f);
2985                 if (r < 0)
2986                         goto fail;
2987         }
2988
2989         if (!newly_created && f->writable) {
2990                 r = journal_file_load_fsprg(f);
2991                 if (r < 0)
2992                         goto fail;
2993         }
2994
2995         if (f->writable) {
2996                 if (metrics) {
2997                         journal_default_metrics(metrics, f->fd);
2998                         f->metrics = *metrics;
2999                 } else if (template)
3000                         f->metrics = template->metrics;
3001
3002                 r = journal_file_refresh_header(f);
3003                 if (r < 0)
3004                         goto fail;
3005
3006                 r = journal_file_setup_hmac(f);
3007                 if (r < 0)
3008                         goto fail;
3009         }
3010
3011         if (newly_created) {
3012                 r = journal_file_setup_field_hash_table(f);
3013                 if (r < 0)
3014                         goto fail;
3015
3016                 r = journal_file_setup_data_hash_table(f);
3017                 if (r < 0)
3018                         goto fail;
3019
3020                 r = journal_file_append_first_tag(f);
3021                 if (r < 0)
3022                         goto fail;
3023         }
3024
3025         r = journal_file_map_field_hash_table(f);
3026         if (r < 0)
3027                 goto fail;
3028
3029         r = journal_file_map_data_hash_table(f);
3030         if (r < 0)
3031                 goto fail;
3032
3033         if (ret)
3034                 *ret = f;
3035
3036         return 0;
3037
3038 fail:
3039         journal_file_close(f);
3040
3041         return r;
3042 }
3043
3044 int journal_file_rotate(JournalFile **f, bool compress, bool authenticate) {
3045         char *p;
3046         size_t l;
3047         JournalFile *old_file, *new_file = NULL;
3048         int r;
3049
3050         assert(f);
3051         assert(*f);
3052
3053         old_file = *f;
3054
3055         if (!old_file->writable)
3056                 return -EINVAL;
3057
3058         if (!endswith(old_file->path, ".journal"))
3059                 return -EINVAL;
3060
3061         l = strlen(old_file->path);
3062
3063         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
3064         if (!p)
3065                 return -ENOMEM;
3066
3067         memcpy(p, old_file->path, l - 8);
3068         p[l-8] = '@';
3069         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
3070         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
3071                  "-%016llx-%016llx.journal",
3072                  (unsigned long long) le64toh((*f)->header->tail_entry_seqnum),
3073                  (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
3074
3075         r = rename(old_file->path, p);
3076         free(p);
3077
3078         if (r < 0)
3079                 return -errno;
3080
3081         old_file->header->state = STATE_ARCHIVED;
3082
3083         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, authenticate, NULL, old_file->mmap, old_file, &new_file);
3084         journal_file_close(old_file);
3085
3086         *f = new_file;
3087         return r;
3088 }
3089
3090 int journal_file_open_reliably(
3091                 const char *fname,
3092                 int flags,
3093                 mode_t mode,
3094                 bool compress,
3095                 bool authenticate,
3096                 JournalMetrics *metrics,
3097                 MMapCache *mmap,
3098                 JournalFile *template,
3099                 JournalFile **ret) {
3100
3101         int r;
3102         size_t l;
3103         char *p;
3104
3105         r = journal_file_open(fname, flags, mode, compress, authenticate, metrics, mmap, template, ret);
3106         if (r != -EBADMSG && /* corrupted */
3107             r != -ENODATA && /* truncated */
3108             r != -EHOSTDOWN && /* other machine */
3109             r != -EPROTONOSUPPORT && /* incompatible feature */
3110             r != -EBUSY && /* unclean shutdown */
3111             r != -ESHUTDOWN /* already archived */)
3112                 return r;
3113
3114         if ((flags & O_ACCMODE) == O_RDONLY)
3115                 return r;
3116
3117         if (!(flags & O_CREAT))
3118                 return r;
3119
3120         if (!endswith(fname, ".journal"))
3121                 return r;
3122
3123         /* The file is corrupted. Rotate it away and try it again (but only once) */
3124
3125         l = strlen(fname);
3126         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
3127                      (int) (l-8), fname,
3128                      (unsigned long long) now(CLOCK_REALTIME),
3129                      random_ull()) < 0)
3130                 return -ENOMEM;
3131
3132         r = rename(fname, p);
3133         free(p);
3134         if (r < 0)
3135                 return -errno;
3136
3137         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
3138
3139         return journal_file_open(fname, flags, mode, compress, authenticate, metrics, mmap, template, ret);
3140 }
3141
3142 struct vacuum_info {
3143         off_t usage;
3144         char *filename;
3145
3146         uint64_t realtime;
3147         sd_id128_t seqnum_id;
3148         uint64_t seqnum;
3149
3150         bool have_seqnum;
3151 };
3152
3153 static int vacuum_compare(const void *_a, const void *_b) {
3154         const struct vacuum_info *a, *b;
3155
3156         a = _a;
3157         b = _b;
3158
3159         if (a->have_seqnum && b->have_seqnum &&
3160             sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
3161                 if (a->seqnum < b->seqnum)
3162                         return -1;
3163                 else if (a->seqnum > b->seqnum)
3164                         return 1;
3165                 else
3166                         return 0;
3167         }
3168
3169         if (a->realtime < b->realtime)
3170                 return -1;
3171         else if (a->realtime > b->realtime)
3172                 return 1;
3173         else if (a->have_seqnum && b->have_seqnum)
3174                 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
3175         else
3176                 return strcmp(a->filename, b->filename);
3177 }
3178
3179 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
3180         DIR *d;
3181         int r = 0;
3182         struct vacuum_info *list = NULL;
3183         unsigned n_list = 0, n_allocated = 0, i;
3184         uint64_t sum = 0;
3185
3186         assert(directory);
3187
3188         if (max_use <= 0)
3189                 return 0;
3190
3191         d = opendir(directory);
3192         if (!d)
3193                 return -errno;
3194
3195         for (;;) {
3196                 int k;
3197                 struct dirent buf, *de;
3198                 size_t q;
3199                 struct stat st;
3200                 char *p;
3201                 unsigned long long seqnum = 0, realtime;
3202                 sd_id128_t seqnum_id;
3203                 bool have_seqnum;
3204
3205                 k = readdir_r(d, &buf, &de);
3206                 if (k != 0) {
3207                         r = -k;
3208                         goto finish;
3209                 }
3210
3211                 if (!de)
3212                         break;
3213
3214                 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
3215                         continue;
3216
3217                 if (!S_ISREG(st.st_mode))
3218                         continue;
3219
3220                 q = strlen(de->d_name);
3221
3222                 if (endswith(de->d_name, ".journal")) {
3223
3224                         /* Vacuum archived files */
3225
3226                         if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
3227                                 continue;
3228
3229                         if (de->d_name[q-8-16-1] != '-' ||
3230                             de->d_name[q-8-16-1-16-1] != '-' ||
3231                             de->d_name[q-8-16-1-16-1-32-1] != '@')
3232                                 continue;
3233
3234                         p = strdup(de->d_name);
3235                         if (!p) {
3236                                 r = -ENOMEM;
3237                                 goto finish;
3238                         }
3239
3240                         de->d_name[q-8-16-1-16-1] = 0;
3241                         if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
3242                                 free(p);
3243                                 continue;
3244                         }
3245
3246                         if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
3247                                 free(p);
3248                                 continue;
3249                         }
3250
3251                         have_seqnum = true;
3252
3253                 } else if (endswith(de->d_name, ".journal~")) {
3254                         unsigned long long tmp;
3255
3256                         /* Vacuum corrupted files */
3257
3258                         if (q < 1 + 16 + 1 + 16 + 8 + 1)
3259                                 continue;
3260
3261                         if (de->d_name[q-1-8-16-1] != '-' ||
3262                             de->d_name[q-1-8-16-1-16-1] != '@')
3263                                 continue;
3264
3265                         p = strdup(de->d_name);
3266                         if (!p) {
3267                                 r = -ENOMEM;
3268                                 goto finish;
3269                         }
3270
3271                         if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
3272                                 free(p);
3273                                 continue;
3274                         }
3275
3276                         have_seqnum = false;
3277                 } else
3278                         continue;
3279
3280                 if (n_list >= n_allocated) {
3281                         struct vacuum_info *j;
3282
3283                         n_allocated = MAX(n_allocated * 2U, 8U);
3284                         j = realloc(list, n_allocated * sizeof(struct vacuum_info));
3285                         if (!j) {
3286                                 free(p);
3287                                 r = -ENOMEM;
3288                                 goto finish;
3289                         }
3290
3291                         list = j;
3292                 }
3293
3294                 list[n_list].filename = p;
3295                 list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
3296                 list[n_list].seqnum = seqnum;
3297                 list[n_list].realtime = realtime;
3298                 list[n_list].seqnum_id = seqnum_id;
3299                 list[n_list].have_seqnum = have_seqnum;
3300
3301                 sum += list[n_list].usage;
3302
3303                 n_list ++;
3304         }
3305
3306         if (n_list > 0)
3307                 qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
3308
3309         for(i = 0; i < n_list; i++) {
3310                 struct statvfs ss;
3311
3312                 if (fstatvfs(dirfd(d), &ss) < 0) {
3313                         r = -errno;
3314                         goto finish;
3315                 }
3316
3317                 if (sum <= max_use &&
3318                     (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
3319                         break;
3320
3321                 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
3322                         log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
3323                         sum -= list[i].usage;
3324                 } else if (errno != ENOENT)
3325                         log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
3326         }
3327
3328 finish:
3329         for (i = 0; i < n_list; i++)
3330                 free(list[i].filename);
3331
3332         free(list);
3333
3334         if (d)
3335                 closedir(d);
3336
3337         return r;
3338 }
3339
3340 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
3341         uint64_t i, n;
3342         uint64_t q, xor_hash = 0;
3343         int r;
3344         EntryItem *items;
3345         dual_timestamp ts;
3346
3347         assert(from);
3348         assert(to);
3349         assert(o);
3350         assert(p);
3351
3352         if (!to->writable)
3353                 return -EPERM;
3354
3355         ts.monotonic = le64toh(o->entry.monotonic);
3356         ts.realtime = le64toh(o->entry.realtime);
3357
3358         if (to->tail_entry_monotonic_valid &&
3359             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
3360                 return -EINVAL;
3361
3362         n = journal_file_entry_n_items(o);
3363         items = alloca(sizeof(EntryItem) * n);
3364
3365         for (i = 0; i < n; i++) {
3366                 uint64_t l, h;
3367                 le64_t le_hash;
3368                 size_t t;
3369                 void *data;
3370                 Object *u;
3371
3372                 q = le64toh(o->entry.items[i].object_offset);
3373                 le_hash = o->entry.items[i].hash;
3374
3375                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3376                 if (r < 0)
3377                         return r;
3378
3379                 if (le_hash != o->data.hash)
3380                         return -EBADMSG;
3381
3382                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
3383                 t = (size_t) l;
3384
3385                 /* We hit the limit on 32bit machines */
3386                 if ((uint64_t) t != l)
3387                         return -E2BIG;
3388
3389                 if (o->object.flags & OBJECT_COMPRESSED) {
3390 #ifdef HAVE_XZ
3391                         uint64_t rsize;
3392
3393                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
3394                                 return -EBADMSG;
3395
3396                         data = from->compress_buffer;
3397                         l = rsize;
3398 #else
3399                         return -EPROTONOSUPPORT;
3400 #endif
3401                 } else
3402                         data = o->data.payload;
3403
3404                 r = journal_file_append_data(to, data, l, &u, &h);
3405                 if (r < 0)
3406                         return r;
3407
3408                 xor_hash ^= le64toh(u->data.hash);
3409                 items[i].object_offset = htole64(h);
3410                 items[i].hash = u->data.hash;
3411
3412                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3413                 if (r < 0)
3414                         return r;
3415         }
3416
3417         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
3418 }
3419
3420 void journal_default_metrics(JournalMetrics *m, int fd) {
3421         uint64_t fs_size = 0;
3422         struct statvfs ss;
3423         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
3424
3425         assert(m);
3426         assert(fd >= 0);
3427
3428         if (fstatvfs(fd, &ss) >= 0)
3429                 fs_size = ss.f_frsize * ss.f_blocks;
3430
3431         if (m->max_use == (uint64_t) -1) {
3432
3433                 if (fs_size > 0) {
3434                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3435
3436                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
3437                                 m->max_use = DEFAULT_MAX_USE_UPPER;
3438
3439                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
3440                                 m->max_use = DEFAULT_MAX_USE_LOWER;
3441                 } else
3442                         m->max_use = DEFAULT_MAX_USE_LOWER;
3443         } else {
3444                 m->max_use = PAGE_ALIGN(m->max_use);
3445
3446                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
3447                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3448         }
3449
3450         if (m->max_size == (uint64_t) -1) {
3451                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3452
3453                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3454                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
3455         } else
3456                 m->max_size = PAGE_ALIGN(m->max_size);
3457
3458         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3459                 m->max_size = JOURNAL_FILE_SIZE_MIN;
3460
3461         if (m->max_size*2 > m->max_use)
3462                 m->max_use = m->max_size*2;
3463
3464         if (m->min_size == (uint64_t) -1)
3465                 m->min_size = JOURNAL_FILE_SIZE_MIN;
3466         else {
3467                 m->min_size = PAGE_ALIGN(m->min_size);
3468
3469                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3470                         m->min_size = JOURNAL_FILE_SIZE_MIN;
3471
3472                 if (m->min_size > m->max_size)
3473                         m->max_size = m->min_size;
3474         }
3475
3476         if (m->keep_free == (uint64_t) -1) {
3477
3478                 if (fs_size > 0) {
3479                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
3480
3481                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3482                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3483
3484                 } else
3485                         m->keep_free = DEFAULT_KEEP_FREE;
3486         }
3487
3488         log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
3489                  format_bytes(a, sizeof(a), m->max_use),
3490                  format_bytes(b, sizeof(b), m->max_size),
3491                  format_bytes(c, sizeof(c), m->min_size),
3492                  format_bytes(d, sizeof(d), m->keep_free));
3493 }
3494
3495 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3496         assert(f);
3497         assert(from || to);
3498
3499         if (from) {
3500                 if (f->header->head_entry_realtime == 0)
3501                         return -ENOENT;
3502
3503                 *from = le64toh(f->header->head_entry_realtime);
3504         }
3505
3506         if (to) {
3507                 if (f->header->tail_entry_realtime == 0)
3508                         return -ENOENT;
3509
3510                 *to = le64toh(f->header->tail_entry_realtime);
3511         }
3512
3513         return 1;
3514 }
3515
3516 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3517         char t[9+32+1] = "_BOOT_ID=";
3518         Object *o;
3519         uint64_t p;
3520         int r;
3521
3522         assert(f);
3523         assert(from || to);
3524
3525         sd_id128_to_string(boot_id, t + 9);
3526
3527         r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
3528         if (r <= 0)
3529                 return r;
3530
3531         if (le64toh(o->data.n_entries) <= 0)
3532                 return 0;
3533
3534         if (from) {
3535                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3536                 if (r < 0)
3537                         return r;
3538
3539                 *from = le64toh(o->entry.monotonic);
3540         }
3541
3542         if (to) {
3543                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3544                 if (r < 0)
3545                         return r;
3546
3547                 r = generic_array_get_plus_one(f,
3548                                                le64toh(o->data.entry_offset),
3549                                                le64toh(o->data.entry_array_offset),
3550                                                le64toh(o->data.n_entries)-1,
3551                                                &o, NULL);
3552                 if (r <= 0)
3553                         return r;
3554
3555                 *to = le64toh(o->entry.monotonic);
3556         }
3557
3558         return 1;
3559 }
3560
3561 bool journal_file_rotate_suggested(JournalFile *f) {
3562         assert(f);
3563
3564         /* If we gained new header fields we gained new features,
3565          * hence suggest a rotation */
3566         if (le64toh(f->header->header_size) < sizeof(Header)) {
3567                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3568                 return true;
3569         }
3570
3571         /* Let's check if the hash tables grew over a certain fill
3572          * level (75%, borrowing this value from Java's hash table
3573          * implementation), and if so suggest a rotation. To calculate
3574          * the fill level we need the n_data field, which only exists
3575          * in newer versions. */
3576
3577         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3578                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3579                         log_debug("Data hash table of %s has a fill level at %.1f (%llu of %llu items, %llu file size, %llu bytes per hash table item), suggesting rotation.",
3580                                   f->path,
3581                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3582                                   (unsigned long long) le64toh(f->header->n_data),
3583                                   (unsigned long long) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)),
3584                                   (unsigned long long) (f->last_stat.st_size),
3585                                   (unsigned long long) (f->last_stat.st_size / le64toh(f->header->n_data)));
3586                         return true;
3587                 }
3588
3589         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3590                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3591                         log_debug("Field hash table of %s has a fill level at %.1f (%llu of %llu items), suggesting rotation.",
3592                                   f->path,
3593                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3594                                   (unsigned long long) le64toh(f->header->n_fields),
3595                                   (unsigned long long) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)));
3596                         return true;
3597                 }
3598
3599         return false;
3600 }