chiark / gitweb /
journal: implement basic journal file verification logic
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "lookup3.h"
33 #include "compress.h"
34 #include "fsprg.h"
35
/* Default byte sizes of the data and field hash tables, expressed as
 * an item count times the size of one hash table item */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Data payloads of at least this many bytes are candidates for
 * compression when compression is enabled for the file */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL)                  /* 64 KiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the file
 * system size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */

/* n_data was the first entry we added after the initial file format
 * design. (ALIGN64 is defined just below; macros are expanded at the
 * point of use, so the ordering is fine.) */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* Rounds x up to the next multiple of 8 bytes (i.e. 64 bits) */
#define ALIGN64(x) (((x) + 7ULL) & ~7ULL)

/* True if the header_size recorded in header h is large enough to
 * contain the given field completely */
#define JOURNAL_HEADER_CONTAINS(h, field) \
        (le64toh((h)->header_size) >= offsetof(Header, field) + sizeof((h)->field))

/* Forward declarations of static helpers that are used before their
 * definitions */
static int journal_file_maybe_append_tag(JournalFile *f, uint64_t realtime);
static int journal_file_hmac_put_object(JournalFile *f, int type, uint64_t p);
70
/* Closes a journal file and releases everything the JournalFile object
 * owns: the header mapping, the file descriptor, the path string, the
 * mmap cache reference and (depending on build options) the compression
 * buffer, the FSPRG mapping and the HMAC handle. The object itself is
 * freed at the end, so f must not be used after this call.
 *
 * All steps are best-effort: return values of the individual calls are
 * not checked, and the function returns nothing. */
void journal_file_close(JournalFile *f) {
        assert(f);

        /* Write the final tag */
        if (f->authenticate)
                journal_file_append_tag(f);

        /* Sync everything to disk, before we mark the file offline */
        if (f->mmap && f->fd >= 0)
                mmap_cache_close_fd(f->mmap, f->fd);

        if (f->writable && f->fd >= 0)
                fdatasync(f->fd);

        if (f->header) {
                /* Mark the file offline. Don't override the archived state if it already is set */
                if (f->writable && f->header->state == STATE_ONLINE)
                        f->header->state = STATE_OFFLINE;

                /* The header mapping always spans the page-aligned size of Header */
                munmap(f->header, PAGE_ALIGN(sizeof(Header)));
        }

        if (f->fd >= 0)
                close_nointr_nofail(f->fd);

        free(f->path);

        if (f->mmap)
                mmap_cache_unref(f->mmap);

#ifdef HAVE_XZ
        /* Scratch buffer used when uncompressing data objects */
        free(f->compress_buffer);
#endif

#ifdef HAVE_GCRYPT
        if (f->fsprg_header)
                munmap(f->fsprg_header, PAGE_ALIGN(f->fsprg_size));

        if (f->hmac)
                gcry_md_close(f->hmac);
#endif

        free(f);
}
115
116 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
117         Header h;
118         ssize_t k;
119         int r;
120
121         assert(f);
122
123         zero(h);
124         memcpy(h.signature, HEADER_SIGNATURE, 8);
125         h.header_size = htole64(ALIGN64(sizeof(h)));
126
127         h.incompatible_flags =
128                 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
129
130         h.compatible_flags =
131                 htole32(f->authenticate ? HEADER_COMPATIBLE_AUTHENTICATED : 0);
132
133         r = sd_id128_randomize(&h.file_id);
134         if (r < 0)
135                 return r;
136
137         if (template) {
138                 h.seqnum_id = template->header->seqnum_id;
139                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
140         } else
141                 h.seqnum_id = h.file_id;
142
143         k = pwrite(f->fd, &h, sizeof(h), 0);
144         if (k < 0)
145                 return -errno;
146
147         if (k != sizeof(h))
148                 return -EIO;
149
150         return 0;
151 }
152
153 static int journal_file_refresh_header(JournalFile *f) {
154         int r;
155         sd_id128_t boot_id;
156
157         assert(f);
158
159         r = sd_id128_get_machine(&f->header->machine_id);
160         if (r < 0)
161                 return r;
162
163         r = sd_id128_get_boot(&boot_id);
164         if (r < 0)
165                 return r;
166
167         if (sd_id128_equal(boot_id, f->header->boot_id))
168                 f->tail_entry_monotonic_valid = true;
169
170         f->header->boot_id = boot_id;
171
172         f->header->state = STATE_ONLINE;
173
174         /* Sync the online state to disk */
175         msync(f->header, PAGE_ALIGN(sizeof(Header)), MS_SYNC);
176         fdatasync(f->fd);
177
178         return 0;
179 }
180
181 static int journal_file_verify_header(JournalFile *f) {
182         assert(f);
183
184         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
185                 return -EBADMSG;
186
187         /* In both read and write mode we refuse to open files with
188          * incompatible flags we don't know */
189 #ifdef HAVE_XZ
190         if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
191                 return -EPROTONOSUPPORT;
192 #else
193         if (f->header->incompatible_flags != 0)
194                 return -EPROTONOSUPPORT;
195 #endif
196
197         /* When open for writing we refuse to open files with
198          * compatible flags, too */
199         if (f->writable) {
200 #ifdef HAVE_GCRYPT
201                 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_AUTHENTICATED) != 0)
202                         return -EPROTONOSUPPORT;
203 #else
204                 if (f->header->compatible_flags != 0)
205                         return -EPROTONOSUPPORT;
206 #endif
207         }
208
209         /* The first addition was n_data, so check that we are at least this large */
210         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
211                 return -EBADMSG;
212
213         if ((le32toh(f->header->compatible_flags) & HEADER_COMPATIBLE_AUTHENTICATED) &&
214                 !JOURNAL_HEADER_CONTAINS(f->header, n_tags))
215                 return -EBADMSG;
216
217         if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
218                 return -ENODATA;
219
220         if (f->writable) {
221                 uint8_t state;
222                 sd_id128_t machine_id;
223                 int r;
224
225                 r = sd_id128_get_machine(&machine_id);
226                 if (r < 0)
227                         return r;
228
229                 if (!sd_id128_equal(machine_id, f->header->machine_id))
230                         return -EHOSTDOWN;
231
232                 state = f->header->state;
233
234                 if (state == STATE_ONLINE) {
235                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
236                         return -EBUSY;
237                 } else if (state == STATE_ARCHIVED)
238                         return -ESHUTDOWN;
239                 else if (state != STATE_OFFLINE) {
240                         log_debug("Journal file %s has unknown state %u.", f->path, state);
241                         return -EBUSY;
242                 }
243         }
244
245         f->compress = !!(le32toh(f->header->incompatible_flags) & HEADER_INCOMPATIBLE_COMPRESSED);
246         f->authenticate = !!(le32toh(f->header->compatible_flags) & HEADER_COMPATIBLE_AUTHENTICATED);
247
248         return 0;
249 }
250
251 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
252         uint64_t old_size, new_size;
253         int r;
254
255         assert(f);
256
257         /* We assume that this file is not sparse, and we know that
258          * for sure, since we always call posix_fallocate()
259          * ourselves */
260
261         old_size =
262                 le64toh(f->header->header_size) +
263                 le64toh(f->header->arena_size);
264
265         new_size = PAGE_ALIGN(offset + size);
266         if (new_size < le64toh(f->header->header_size))
267                 new_size = le64toh(f->header->header_size);
268
269         if (new_size <= old_size)
270                 return 0;
271
272         if (f->metrics.max_size > 0 &&
273             new_size > f->metrics.max_size)
274                 return -E2BIG;
275
276         if (new_size > f->metrics.min_size &&
277             f->metrics.keep_free > 0) {
278                 struct statvfs svfs;
279
280                 if (fstatvfs(f->fd, &svfs) >= 0) {
281                         uint64_t available;
282
283                         available = svfs.f_bfree * svfs.f_bsize;
284
285                         if (available >= f->metrics.keep_free)
286                                 available -= f->metrics.keep_free;
287                         else
288                                 available = 0;
289
290                         if (new_size - old_size > available)
291                                 return -E2BIG;
292                 }
293         }
294
295         /* Note that the glibc fallocate() fallback is very
296            inefficient, hence we try to minimize the allocation area
297            as we can. */
298         r = posix_fallocate(f->fd, old_size, new_size - old_size);
299         if (r != 0)
300                 return -r;
301
302         if (fstat(f->fd, &f->last_stat) < 0)
303                 return -errno;
304
305         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
306
307         return 0;
308 }
309
310 static int journal_file_move_to(JournalFile *f, int context, uint64_t offset, uint64_t size, void **ret) {
311         assert(f);
312         assert(ret);
313
314         /* Avoid SIGBUS on invalid accesses */
315         if (offset + size > (uint64_t) f->last_stat.st_size) {
316                 /* Hmm, out of range? Let's refresh the fstat() data
317                  * first, before we trust that check. */
318
319                 if (fstat(f->fd, &f->last_stat) < 0 ||
320                     offset + size > (uint64_t) f->last_stat.st_size)
321                         return -EADDRNOTAVAIL;
322         }
323
324         return mmap_cache_get(f->mmap, f->fd, f->prot, context, offset, size, ret);
325 }
326
327 static bool verify_hash(Object *o) {
328         uint64_t h1, h2;
329
330         assert(o);
331
332         if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
333                 h1 = le64toh(o->data.hash);
334                 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
335         } else if (o->object.type == OBJECT_FIELD) {
336                 h1 = le64toh(o->field.hash);
337                 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
338         } else
339                 return true;
340
341         return h1 == h2;
342 }
343
344 static uint64_t minimum_header_size(Object *o) {
345
346         static uint64_t table[] = {
347                 [OBJECT_DATA] = sizeof(DataObject),
348                 [OBJECT_FIELD] = sizeof(FieldObject),
349                 [OBJECT_ENTRY] = sizeof(EntryObject),
350                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
351                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
352                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
353                 [OBJECT_TAG] = sizeof(TagObject),
354         };
355
356         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
357                 return sizeof(ObjectHeader);
358
359         return table[o->object.type];
360 }
361
362 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
363         int r;
364         void *t;
365         Object *o;
366         uint64_t s;
367         unsigned context;
368
369         assert(f);
370         assert(ret);
371
372         /* One context for each type, plus one catch-all for the rest */
373         context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
374
375         r = journal_file_move_to(f, context, offset, sizeof(ObjectHeader), &t);
376         if (r < 0)
377                 return r;
378
379         o = (Object*) t;
380         s = le64toh(o->object.size);
381
382         if (s < sizeof(ObjectHeader))
383                 return -EBADMSG;
384
385         if (o->object.type <= OBJECT_UNUSED)
386                 return -EBADMSG;
387
388         if (s < minimum_header_size(o))
389                 return -EBADMSG;
390
391         if (type >= 0 && o->object.type != type)
392                 return -EBADMSG;
393
394         if (s > sizeof(ObjectHeader)) {
395                 r = journal_file_move_to(f, o->object.type, offset, s, &t);
396                 if (r < 0)
397                         return r;
398
399                 o = (Object*) t;
400         }
401
402         if (!verify_hash(o))
403                 return -EBADMSG;
404
405         *ret = o;
406         return 0;
407 }
408
409 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
410         uint64_t r;
411
412         assert(f);
413
414         r = le64toh(f->header->tail_entry_seqnum) + 1;
415
416         if (seqnum) {
417                 /* If an external seqnum counter was passed, we update
418                  * both the local and the external one, and set it to
419                  * the maximum of both */
420
421                 if (*seqnum + 1 > r)
422                         r = *seqnum + 1;
423
424                 *seqnum = r;
425         }
426
427         f->header->tail_entry_seqnum = htole64(r);
428
429         if (f->header->head_entry_seqnum == 0)
430                 f->header->head_entry_seqnum = htole64(r);
431
432         return r;
433 }
434
435 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
436         int r;
437         uint64_t p;
438         Object *tail, *o;
439         void *t;
440
441         assert(f);
442         assert(type > 0 && type < _OBJECT_TYPE_MAX);
443         assert(size >= sizeof(ObjectHeader));
444         assert(offset);
445         assert(ret);
446
447         p = le64toh(f->header->tail_object_offset);
448         if (p == 0)
449                 p = le64toh(f->header->header_size);
450         else {
451                 r = journal_file_move_to_object(f, -1, p, &tail);
452                 if (r < 0)
453                         return r;
454
455                 p += ALIGN64(le64toh(tail->object.size));
456         }
457
458         r = journal_file_allocate(f, p, size);
459         if (r < 0)
460                 return r;
461
462         r = journal_file_move_to(f, type, p, size, &t);
463         if (r < 0)
464                 return r;
465
466         o = (Object*) t;
467
468         zero(o->object);
469         o->object.type = type;
470         o->object.size = htole64(size);
471
472         f->header->tail_object_offset = htole64(p);
473         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
474
475         *ret = o;
476         *offset = p;
477
478         return 0;
479 }
480
481 static int journal_file_setup_data_hash_table(JournalFile *f) {
482         uint64_t s, p;
483         Object *o;
484         int r;
485
486         assert(f);
487
488         /* We estimate that we need 1 hash table entry per 768 of
489            journal file and we want to make sure we never get beyond
490            75% fill level. Calculate the hash table size for the
491            maximum file size based on these metrics. */
492
493         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
494         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
495                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
496
497         log_info("Reserving %llu entries in hash table.", (unsigned long long) (s / sizeof(HashItem)));
498
499         r = journal_file_append_object(f,
500                                        OBJECT_DATA_HASH_TABLE,
501                                        offsetof(Object, hash_table.items) + s,
502                                        &o, &p);
503         if (r < 0)
504                 return r;
505
506         memset(o->hash_table.items, 0, s);
507
508         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
509         f->header->data_hash_table_size = htole64(s);
510
511         return 0;
512 }
513
514 static int journal_file_setup_field_hash_table(JournalFile *f) {
515         uint64_t s, p;
516         Object *o;
517         int r;
518
519         assert(f);
520
521         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
522         r = journal_file_append_object(f,
523                                        OBJECT_FIELD_HASH_TABLE,
524                                        offsetof(Object, hash_table.items) + s,
525                                        &o, &p);
526         if (r < 0)
527                 return r;
528
529         memset(o->hash_table.items, 0, s);
530
531         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
532         f->header->field_hash_table_size = htole64(s);
533
534         return 0;
535 }
536
537 static int journal_file_map_data_hash_table(JournalFile *f) {
538         uint64_t s, p;
539         void *t;
540         int r;
541
542         assert(f);
543
544         p = le64toh(f->header->data_hash_table_offset);
545         s = le64toh(f->header->data_hash_table_size);
546
547         r = journal_file_move_to(f,
548                                  OBJECT_DATA_HASH_TABLE,
549                                  p, s,
550                                  &t);
551         if (r < 0)
552                 return r;
553
554         f->data_hash_table = t;
555         return 0;
556 }
557
558 static int journal_file_map_field_hash_table(JournalFile *f) {
559         uint64_t s, p;
560         void *t;
561         int r;
562
563         assert(f);
564
565         p = le64toh(f->header->field_hash_table_offset);
566         s = le64toh(f->header->field_hash_table_size);
567
568         r = journal_file_move_to(f,
569                                  OBJECT_FIELD_HASH_TABLE,
570                                  p, s,
571                                  &t);
572         if (r < 0)
573                 return r;
574
575         f->field_hash_table = t;
576         return 0;
577 }
578
579 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
580         uint64_t p, h;
581         int r;
582
583         assert(f);
584         assert(o);
585         assert(offset > 0);
586         assert(o->object.type == OBJECT_DATA);
587
588         /* This might alter the window we are looking at */
589
590         o->data.next_hash_offset = o->data.next_field_offset = 0;
591         o->data.entry_offset = o->data.entry_array_offset = 0;
592         o->data.n_entries = 0;
593
594         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
595         p = le64toh(f->data_hash_table[h].tail_hash_offset);
596         if (p == 0) {
597                 /* Only entry in the hash table is easy */
598                 f->data_hash_table[h].head_hash_offset = htole64(offset);
599         } else {
600                 /* Move back to the previous data object, to patch in
601                  * pointer */
602
603                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
604                 if (r < 0)
605                         return r;
606
607                 o->data.next_hash_offset = htole64(offset);
608         }
609
610         f->data_hash_table[h].tail_hash_offset = htole64(offset);
611
612         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
613                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
614
615         return 0;
616 }
617
618 int journal_file_find_data_object_with_hash(
619                 JournalFile *f,
620                 const void *data, uint64_t size, uint64_t hash,
621                 Object **ret, uint64_t *offset) {
622
623         uint64_t p, osize, h;
624         int r;
625
626         assert(f);
627         assert(data || size == 0);
628
629         osize = offsetof(Object, data.payload) + size;
630
631         if (f->header->data_hash_table_size == 0)
632                 return -EBADMSG;
633
634         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
635         p = le64toh(f->data_hash_table[h].head_hash_offset);
636
637         while (p > 0) {
638                 Object *o;
639
640                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
641                 if (r < 0)
642                         return r;
643
644                 if (le64toh(o->data.hash) != hash)
645                         goto next;
646
647                 if (o->object.flags & OBJECT_COMPRESSED) {
648 #ifdef HAVE_XZ
649                         uint64_t l, rsize;
650
651                         l = le64toh(o->object.size);
652                         if (l <= offsetof(Object, data.payload))
653                                 return -EBADMSG;
654
655                         l -= offsetof(Object, data.payload);
656
657                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
658                                 return -EBADMSG;
659
660                         if (rsize == size &&
661                             memcmp(f->compress_buffer, data, size) == 0) {
662
663                                 if (ret)
664                                         *ret = o;
665
666                                 if (offset)
667                                         *offset = p;
668
669                                 return 1;
670                         }
671 #else
672                         return -EPROTONOSUPPORT;
673 #endif
674
675                 } else if (le64toh(o->object.size) == osize &&
676                            memcmp(o->data.payload, data, size) == 0) {
677
678                         if (ret)
679                                 *ret = o;
680
681                         if (offset)
682                                 *offset = p;
683
684                         return 1;
685                 }
686
687         next:
688                 p = le64toh(o->data.next_hash_offset);
689         }
690
691         return 0;
692 }
693
694 int journal_file_find_data_object(
695                 JournalFile *f,
696                 const void *data, uint64_t size,
697                 Object **ret, uint64_t *offset) {
698
699         uint64_t hash;
700
701         assert(f);
702         assert(data || size == 0);
703
704         hash = hash64(data, size);
705
706         return journal_file_find_data_object_with_hash(f,
707                                                        data, size, hash,
708                                                        ret, offset);
709 }
710
711 static int journal_file_append_data(
712                 JournalFile *f,
713                 const void *data, uint64_t size,
714                 Object **ret, uint64_t *offset) {
715
716         uint64_t hash, p;
717         uint64_t osize;
718         Object *o;
719         int r;
720         bool compressed = false;
721
722         assert(f);
723         assert(data || size == 0);
724
725         hash = hash64(data, size);
726
727         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
728         if (r < 0)
729                 return r;
730         else if (r > 0) {
731
732                 if (ret)
733                         *ret = o;
734
735                 if (offset)
736                         *offset = p;
737
738                 return 0;
739         }
740
741         osize = offsetof(Object, data.payload) + size;
742         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
743         if (r < 0)
744                 return r;
745
746         o->data.hash = htole64(hash);
747
748 #ifdef HAVE_XZ
749         if (f->compress &&
750             size >= COMPRESSION_SIZE_THRESHOLD) {
751                 uint64_t rsize;
752
753                 compressed = compress_blob(data, size, o->data.payload, &rsize);
754
755                 if (compressed) {
756                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
757                         o->object.flags |= OBJECT_COMPRESSED;
758
759                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
760                 }
761         }
762 #endif
763
764         if (!compressed && size > 0)
765                 memcpy(o->data.payload, data, size);
766
767         r = journal_file_link_data(f, o, p, hash);
768         if (r < 0)
769                 return r;
770
771         r = journal_file_hmac_put_object(f, OBJECT_DATA, p);
772         if (r < 0)
773                 return r;
774
775         /* The linking might have altered the window, so let's
776          * refresh our pointer */
777         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
778         if (r < 0)
779                 return r;
780
781         if (ret)
782                 *ret = o;
783
784         if (offset)
785                 *offset = p;
786
787         return 0;
788 }
789
790 uint64_t journal_file_entry_n_items(Object *o) {
791         assert(o);
792         assert(o->object.type == OBJECT_ENTRY);
793
794         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
795 }
796
797 static uint64_t journal_file_entry_array_n_items(Object *o) {
798         assert(o);
799         assert(o->object.type == OBJECT_ENTRY_ARRAY);
800
801         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
802 }
803
/* Stores the entry offset p in slot number *idx of a chain of entry
 * array objects, and increments *idx.
 *
 *   first  offset of the first array of the chain (little endian),
 *          0 if there is no array yet; updated if the first array
 *          is created here
 *   idx    number of slots in use so far (little endian), which is
 *          also the index of the slot to fill
 *   p      offset of the entry to store
 *
 * If the slot lies within an existing array of the chain it is simply
 * filled in. Otherwise a new array is appended to the file and hooked
 * up to the end of the chain.
 *
 * Returns 0 on success or the negative error of the failing lookup,
 * append or HMAC step. */
static int link_entry_into_array(JournalFile *f,
                                 le64_t *first,
                                 le64_t *idx,
                                 uint64_t p) {
        int r;
        uint64_t n = 0, ap = 0, q, i, a, hidx;
        Object *o;

        assert(f);
        assert(first);
        assert(idx);
        assert(p > 0);

        /* a walks the chain, ap trails one array behind, i is the
         * slot index relative to the array we currently look at,
         * hidx keeps the original overall index */
        a = le64toh(*first);
        i = hidx = le64toh(*idx);
        while (a > 0) {

                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
                if (r < 0)
                        return r;

                n = journal_file_entry_array_n_items(o);
                if (i < n) {
                        /* The slot is in this array, we are done */
                        o->entry_array.items[i] = htole64(p);
                        *idx = htole64(hidx + 1);
                        return 0;
                }

                i -= n;
                ap = a;
                a = le64toh(o->entry_array.next_entry_array_offset);
        }

        /* No room in the existing chain. Pick the size of the new
         * array: twice the size of the last array (n is still 0 if
         * there was none), or twice the number of slots needed so
         * far if that is larger, but at least 4 slots. */
        if (hidx > n)
                n = (hidx+1) * 2;
        else
                n = n * 2;

        if (n < 4)
                n = 4;

        r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
                                       offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
                                       &o, &q);
        if (r < 0)
                return r;

        r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, q);
        if (r < 0)
                return r;

        o->entry_array.items[i] = htole64(p);

        if (ap == 0)
                /* First array of the chain */
                *first = htole64(q);
        else {
                /* Go back to the previous array and point it to the new one */
                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
                if (r < 0)
                        return r;

                o->entry_array.next_entry_array_offset = htole64(q);
        }

        *idx = htole64(hidx + 1);

        return 0;
}
871
872 static int link_entry_into_array_plus_one(JournalFile *f,
873                                           le64_t *extra,
874                                           le64_t *first,
875                                           le64_t *idx,
876                                           uint64_t p) {
877
878         int r;
879
880         assert(f);
881         assert(extra);
882         assert(first);
883         assert(idx);
884         assert(p > 0);
885
886         if (*idx == 0)
887                 *extra = htole64(p);
888         else {
889                 le64_t i;
890
891                 i = htole64(le64toh(*idx) - 1);
892                 r = link_entry_into_array(f, first, &i, p);
893                 if (r < 0)
894                         return r;
895         }
896
897         *idx = htole64(le64toh(*idx) + 1);
898         return 0;
899 }
900
901 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
902         uint64_t p;
903         int r;
904         assert(f);
905         assert(o);
906         assert(offset > 0);
907
908         p = le64toh(o->entry.items[i].object_offset);
909         if (p == 0)
910                 return -EINVAL;
911
912         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
913         if (r < 0)
914                 return r;
915
916         return link_entry_into_array_plus_one(f,
917                                               &o->data.entry_offset,
918                                               &o->data.entry_array_offset,
919                                               &o->data.n_entries,
920                                               offset);
921 }
922
923 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
924         uint64_t n, i;
925         int r;
926
927         assert(f);
928         assert(o);
929         assert(offset > 0);
930         assert(o->object.type == OBJECT_ENTRY);
931
932         __sync_synchronize();
933
934         /* Link up the entry itself */
935         r = link_entry_into_array(f,
936                                   &f->header->entry_array_offset,
937                                   &f->header->n_entries,
938                                   offset);
939         if (r < 0)
940                 return r;
941
942         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
943
944         if (f->header->head_entry_realtime == 0)
945                 f->header->head_entry_realtime = o->entry.realtime;
946
947         f->header->tail_entry_realtime = o->entry.realtime;
948         f->header->tail_entry_monotonic = o->entry.monotonic;
949
950         f->tail_entry_monotonic_valid = true;
951
952         /* Link up the items */
953         n = journal_file_entry_n_items(o);
954         for (i = 0; i < n; i++) {
955                 r = journal_file_link_entry_item(f, o, offset, i);
956                 if (r < 0)
957                         return r;
958         }
959
960         return 0;
961 }
962
963 static int journal_file_append_entry_internal(
964                 JournalFile *f,
965                 const dual_timestamp *ts,
966                 uint64_t xor_hash,
967                 const EntryItem items[], unsigned n_items,
968                 uint64_t *seqnum,
969                 Object **ret, uint64_t *offset) {
970         uint64_t np;
971         uint64_t osize;
972         Object *o;
973         int r;
974
975         assert(f);
976         assert(items || n_items == 0);
977         assert(ts);
978
979         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
980
981         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
982         if (r < 0)
983                 return r;
984
985         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
986         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
987         o->entry.realtime = htole64(ts->realtime);
988         o->entry.monotonic = htole64(ts->monotonic);
989         o->entry.xor_hash = htole64(xor_hash);
990         o->entry.boot_id = f->header->boot_id;
991
992         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, np);
993         if (r < 0)
994                 return r;
995
996         r = journal_file_link_entry(f, o, np);
997         if (r < 0)
998                 return r;
999
1000         if (ret)
1001                 *ret = o;
1002
1003         if (offset)
1004                 *offset = np;
1005
1006         return 0;
1007 }
1008
1009 void journal_file_post_change(JournalFile *f) {
1010         assert(f);
1011
1012         /* inotify() does not receive IN_MODIFY events from file
1013          * accesses done via mmap(). After each access we hence
1014          * trigger IN_MODIFY by truncating the journal file to its
1015          * current size which triggers IN_MODIFY. */
1016
1017         __sync_synchronize();
1018
1019         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1020                 log_error("Failed to to truncate file to its own size: %m");
1021 }
1022
1023 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1024         unsigned i;
1025         EntryItem *items;
1026         int r;
1027         uint64_t xor_hash = 0;
1028         struct dual_timestamp _ts;
1029
1030         assert(f);
1031         assert(iovec || n_iovec == 0);
1032
1033         if (!f->writable)
1034                 return -EPERM;
1035
1036         if (!ts) {
1037                 dual_timestamp_get(&_ts);
1038                 ts = &_ts;
1039         }
1040
1041         if (f->tail_entry_monotonic_valid &&
1042             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1043                 return -EINVAL;
1044
1045         r = journal_file_maybe_append_tag(f, ts->realtime);
1046         if (r < 0)
1047                 return r;
1048
1049         /* alloca() can't take 0, hence let's allocate at least one */
1050         items = alloca(sizeof(EntryItem) * MAX(1, n_iovec));
1051
1052         for (i = 0; i < n_iovec; i++) {
1053                 uint64_t p;
1054                 Object *o;
1055
1056                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1057                 if (r < 0)
1058                         return r;
1059
1060                 xor_hash ^= le64toh(o->data.hash);
1061                 items[i].object_offset = htole64(p);
1062                 items[i].hash = o->data.hash;
1063         }
1064
1065         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1066
1067         journal_file_post_change(f);
1068
1069         return r;
1070 }
1071
1072 static int generic_array_get(JournalFile *f,
1073                              uint64_t first,
1074                              uint64_t i,
1075                              Object **ret, uint64_t *offset) {
1076
1077         Object *o;
1078         uint64_t p = 0, a;
1079         int r;
1080
1081         assert(f);
1082
1083         a = first;
1084         while (a > 0) {
1085                 uint64_t n;
1086
1087                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1088                 if (r < 0)
1089                         return r;
1090
1091                 n = journal_file_entry_array_n_items(o);
1092                 if (i < n) {
1093                         p = le64toh(o->entry_array.items[i]);
1094                         break;
1095                 }
1096
1097                 i -= n;
1098                 a = le64toh(o->entry_array.next_entry_array_offset);
1099         }
1100
1101         if (a <= 0 || p <= 0)
1102                 return 0;
1103
1104         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1105         if (r < 0)
1106                 return r;
1107
1108         if (ret)
1109                 *ret = o;
1110
1111         if (offset)
1112                 *offset = p;
1113
1114         return 1;
1115 }
1116
1117 static int generic_array_get_plus_one(JournalFile *f,
1118                                       uint64_t extra,
1119                                       uint64_t first,
1120                                       uint64_t i,
1121                                       Object **ret, uint64_t *offset) {
1122
1123         Object *o;
1124
1125         assert(f);
1126
1127         if (i == 0) {
1128                 int r;
1129
1130                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1131                 if (r < 0)
1132                         return r;
1133
1134                 if (ret)
1135                         *ret = o;
1136
1137                 if (offset)
1138                         *offset = extra;
1139
1140                 return 1;
1141         }
1142
1143         return generic_array_get(f, first, i-1, ret, offset);
1144 }
1145
/* Results reported by the test_object callbacks that are handed to the
 * bisection helpers. They describe how the tested object compares to
 * the needle. */
enum {
        TEST_FOUND,     /* the tested object matches the needle */
        TEST_LEFT,      /* the tested object is smaller than the needle */
        TEST_RIGHT      /* the tested object is larger than the needle */
};
1151
1152 static int generic_array_bisect(JournalFile *f,
1153                                 uint64_t first,
1154                                 uint64_t n,
1155                                 uint64_t needle,
1156                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1157                                 direction_t direction,
1158                                 Object **ret,
1159                                 uint64_t *offset,
1160                                 uint64_t *idx) {
1161
1162         uint64_t a, p, t = 0, i = 0, last_p = 0;
1163         bool subtract_one = false;
1164         Object *o, *array = NULL;
1165         int r;
1166
1167         assert(f);
1168         assert(test_object);
1169
1170         a = first;
1171         while (a > 0) {
1172                 uint64_t left, right, k, lp;
1173
1174                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1175                 if (r < 0)
1176                         return r;
1177
1178                 k = journal_file_entry_array_n_items(array);
1179                 right = MIN(k, n);
1180                 if (right <= 0)
1181                         return 0;
1182
1183                 i = right - 1;
1184                 lp = p = le64toh(array->entry_array.items[i]);
1185                 if (p <= 0)
1186                         return -EBADMSG;
1187
1188                 r = test_object(f, p, needle);
1189                 if (r < 0)
1190                         return r;
1191
1192                 if (r == TEST_FOUND)
1193                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1194
1195                 if (r == TEST_RIGHT) {
1196                         left = 0;
1197                         right -= 1;
1198                         for (;;) {
1199                                 if (left == right) {
1200                                         if (direction == DIRECTION_UP)
1201                                                 subtract_one = true;
1202
1203                                         i = left;
1204                                         goto found;
1205                                 }
1206
1207                                 assert(left < right);
1208
1209                                 i = (left + right) / 2;
1210                                 p = le64toh(array->entry_array.items[i]);
1211                                 if (p <= 0)
1212                                         return -EBADMSG;
1213
1214                                 r = test_object(f, p, needle);
1215                                 if (r < 0)
1216                                         return r;
1217
1218                                 if (r == TEST_FOUND)
1219                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1220
1221                                 if (r == TEST_RIGHT)
1222                                         right = i;
1223                                 else
1224                                         left = i + 1;
1225                         }
1226                 }
1227
1228                 if (k > n) {
1229                         if (direction == DIRECTION_UP) {
1230                                 i = n;
1231                                 subtract_one = true;
1232                                 goto found;
1233                         }
1234
1235                         return 0;
1236                 }
1237
1238                 last_p = lp;
1239
1240                 n -= k;
1241                 t += k;
1242                 a = le64toh(array->entry_array.next_entry_array_offset);
1243         }
1244
1245         return 0;
1246
1247 found:
1248         if (subtract_one && t == 0 && i == 0)
1249                 return 0;
1250
1251         if (subtract_one && i == 0)
1252                 p = last_p;
1253         else if (subtract_one)
1254                 p = le64toh(array->entry_array.items[i-1]);
1255         else
1256                 p = le64toh(array->entry_array.items[i]);
1257
1258         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1259         if (r < 0)
1260                 return r;
1261
1262         if (ret)
1263                 *ret = o;
1264
1265         if (offset)
1266                 *offset = p;
1267
1268         if (idx)
1269                 *idx = t + i + (subtract_one ? -1 : 0);
1270
1271         return 1;
1272 }
1273
1274 static int generic_array_bisect_plus_one(JournalFile *f,
1275                                          uint64_t extra,
1276                                          uint64_t first,
1277                                          uint64_t n,
1278                                          uint64_t needle,
1279                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1280                                          direction_t direction,
1281                                          Object **ret,
1282                                          uint64_t *offset,
1283                                          uint64_t *idx) {
1284
1285         int r;
1286         bool step_back = false;
1287         Object *o;
1288
1289         assert(f);
1290         assert(test_object);
1291
1292         if (n <= 0)
1293                 return 0;
1294
1295         /* This bisects the array in object 'first', but first checks
1296          * an extra  */
1297         r = test_object(f, extra, needle);
1298         if (r < 0)
1299                 return r;
1300
1301         if (r == TEST_FOUND)
1302                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1303
1304         /* if we are looking with DIRECTION_UP then we need to first
1305            see if in the actual array there is a matching entry, and
1306            return the last one of that. But if there isn't any we need
1307            to return this one. Hence remember this, and return it
1308            below. */
1309         if (r == TEST_LEFT)
1310                 step_back = direction == DIRECTION_UP;
1311
1312         if (r == TEST_RIGHT) {
1313                 if (direction == DIRECTION_DOWN)
1314                         goto found;
1315                 else
1316                         return 0;
1317         }
1318
1319         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1320
1321         if (r == 0 && step_back)
1322                 goto found;
1323
1324         if (r > 0 && idx)
1325                 (*idx) ++;
1326
1327         return r;
1328
1329 found:
1330         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1331         if (r < 0)
1332                 return r;
1333
1334         if (ret)
1335                 *ret = o;
1336
1337         if (offset)
1338                 *offset = extra;
1339
1340         if (idx)
1341                 *idx = 0;
1342
1343         return 1;
1344 }
1345
1346 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1347         assert(f);
1348         assert(p > 0);
1349
1350         if (p == needle)
1351                 return TEST_FOUND;
1352         else if (p < needle)
1353                 return TEST_LEFT;
1354         else
1355                 return TEST_RIGHT;
1356 }
1357
1358 int journal_file_move_to_entry_by_offset(
1359                 JournalFile *f,
1360                 uint64_t p,
1361                 direction_t direction,
1362                 Object **ret,
1363                 uint64_t *offset) {
1364
1365         return generic_array_bisect(f,
1366                                     le64toh(f->header->entry_array_offset),
1367                                     le64toh(f->header->n_entries),
1368                                     p,
1369                                     test_object_offset,
1370                                     direction,
1371                                     ret, offset, NULL);
1372 }
1373
1374
1375 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1376         Object *o;
1377         int r;
1378
1379         assert(f);
1380         assert(p > 0);
1381
1382         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1383         if (r < 0)
1384                 return r;
1385
1386         if (le64toh(o->entry.seqnum) == needle)
1387                 return TEST_FOUND;
1388         else if (le64toh(o->entry.seqnum) < needle)
1389                 return TEST_LEFT;
1390         else
1391                 return TEST_RIGHT;
1392 }
1393
1394 int journal_file_move_to_entry_by_seqnum(
1395                 JournalFile *f,
1396                 uint64_t seqnum,
1397                 direction_t direction,
1398                 Object **ret,
1399                 uint64_t *offset) {
1400
1401         return generic_array_bisect(f,
1402                                     le64toh(f->header->entry_array_offset),
1403                                     le64toh(f->header->n_entries),
1404                                     seqnum,
1405                                     test_object_seqnum,
1406                                     direction,
1407                                     ret, offset, NULL);
1408 }
1409
1410 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1411         Object *o;
1412         int r;
1413
1414         assert(f);
1415         assert(p > 0);
1416
1417         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1418         if (r < 0)
1419                 return r;
1420
1421         if (le64toh(o->entry.realtime) == needle)
1422                 return TEST_FOUND;
1423         else if (le64toh(o->entry.realtime) < needle)
1424                 return TEST_LEFT;
1425         else
1426                 return TEST_RIGHT;
1427 }
1428
1429 int journal_file_move_to_entry_by_realtime(
1430                 JournalFile *f,
1431                 uint64_t realtime,
1432                 direction_t direction,
1433                 Object **ret,
1434                 uint64_t *offset) {
1435
1436         return generic_array_bisect(f,
1437                                     le64toh(f->header->entry_array_offset),
1438                                     le64toh(f->header->n_entries),
1439                                     realtime,
1440                                     test_object_realtime,
1441                                     direction,
1442                                     ret, offset, NULL);
1443 }
1444
1445 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1446         Object *o;
1447         int r;
1448
1449         assert(f);
1450         assert(p > 0);
1451
1452         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1453         if (r < 0)
1454                 return r;
1455
1456         if (le64toh(o->entry.monotonic) == needle)
1457                 return TEST_FOUND;
1458         else if (le64toh(o->entry.monotonic) < needle)
1459                 return TEST_LEFT;
1460         else
1461                 return TEST_RIGHT;
1462 }
1463
1464 int journal_file_move_to_entry_by_monotonic(
1465                 JournalFile *f,
1466                 sd_id128_t boot_id,
1467                 uint64_t monotonic,
1468                 direction_t direction,
1469                 Object **ret,
1470                 uint64_t *offset) {
1471
1472         char t[9+32+1] = "_BOOT_ID=";
1473         Object *o;
1474         int r;
1475
1476         assert(f);
1477
1478         sd_id128_to_string(boot_id, t + 9);
1479         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1480         if (r < 0)
1481                 return r;
1482         if (r == 0)
1483                 return -ENOENT;
1484
1485         return generic_array_bisect_plus_one(f,
1486                                              le64toh(o->data.entry_offset),
1487                                              le64toh(o->data.entry_array_offset),
1488                                              le64toh(o->data.n_entries),
1489                                              monotonic,
1490                                              test_object_monotonic,
1491                                              direction,
1492                                              ret, offset, NULL);
1493 }
1494
1495 int journal_file_next_entry(
1496                 JournalFile *f,
1497                 Object *o, uint64_t p,
1498                 direction_t direction,
1499                 Object **ret, uint64_t *offset) {
1500
1501         uint64_t i, n;
1502         int r;
1503
1504         assert(f);
1505         assert(p > 0 || !o);
1506
1507         n = le64toh(f->header->n_entries);
1508         if (n <= 0)
1509                 return 0;
1510
1511         if (!o)
1512                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1513         else {
1514                 if (o->object.type != OBJECT_ENTRY)
1515                         return -EINVAL;
1516
1517                 r = generic_array_bisect(f,
1518                                          le64toh(f->header->entry_array_offset),
1519                                          le64toh(f->header->n_entries),
1520                                          p,
1521                                          test_object_offset,
1522                                          DIRECTION_DOWN,
1523                                          NULL, NULL,
1524                                          &i);
1525                 if (r <= 0)
1526                         return r;
1527
1528                 if (direction == DIRECTION_DOWN) {
1529                         if (i >= n - 1)
1530                                 return 0;
1531
1532                         i++;
1533                 } else {
1534                         if (i <= 0)
1535                                 return 0;
1536
1537                         i--;
1538                 }
1539         }
1540
1541         /* And jump to it */
1542         return generic_array_get(f,
1543                                  le64toh(f->header->entry_array_offset),
1544                                  i,
1545                                  ret, offset);
1546 }
1547
1548 int journal_file_skip_entry(
1549                 JournalFile *f,
1550                 Object *o, uint64_t p,
1551                 int64_t skip,
1552                 Object **ret, uint64_t *offset) {
1553
1554         uint64_t i, n;
1555         int r;
1556
1557         assert(f);
1558         assert(o);
1559         assert(p > 0);
1560
1561         if (o->object.type != OBJECT_ENTRY)
1562                 return -EINVAL;
1563
1564         r = generic_array_bisect(f,
1565                                  le64toh(f->header->entry_array_offset),
1566                                  le64toh(f->header->n_entries),
1567                                  p,
1568                                  test_object_offset,
1569                                  DIRECTION_DOWN,
1570                                  NULL, NULL,
1571                                  &i);
1572         if (r <= 0)
1573                 return r;
1574
1575         /* Calculate new index */
1576         if (skip < 0) {
1577                 if ((uint64_t) -skip >= i)
1578                         i = 0;
1579                 else
1580                         i = i - (uint64_t) -skip;
1581         } else
1582                 i  += (uint64_t) skip;
1583
1584         n = le64toh(f->header->n_entries);
1585         if (n <= 0)
1586                 return -EBADMSG;
1587
1588         if (i >= n)
1589                 i = n-1;
1590
1591         return generic_array_get(f,
1592                                  le64toh(f->header->entry_array_offset),
1593                                  i,
1594                                  ret, offset);
1595 }
1596
1597 int journal_file_next_entry_for_data(
1598                 JournalFile *f,
1599                 Object *o, uint64_t p,
1600                 uint64_t data_offset,
1601                 direction_t direction,
1602                 Object **ret, uint64_t *offset) {
1603
1604         uint64_t n, i;
1605         int r;
1606         Object *d;
1607
1608         assert(f);
1609         assert(p > 0 || !o);
1610
1611         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1612         if (r < 0)
1613                 return r;
1614
1615         n = le64toh(d->data.n_entries);
1616         if (n <= 0)
1617                 return n;
1618
1619         if (!o)
1620                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1621         else {
1622                 if (o->object.type != OBJECT_ENTRY)
1623                         return -EINVAL;
1624
1625                 r = generic_array_bisect_plus_one(f,
1626                                                   le64toh(d->data.entry_offset),
1627                                                   le64toh(d->data.entry_array_offset),
1628                                                   le64toh(d->data.n_entries),
1629                                                   p,
1630                                                   test_object_offset,
1631                                                   DIRECTION_DOWN,
1632                                                   NULL, NULL,
1633                                                   &i);
1634
1635                 if (r <= 0)
1636                         return r;
1637
1638                 if (direction == DIRECTION_DOWN) {
1639                         if (i >= n - 1)
1640                                 return 0;
1641
1642                         i++;
1643                 } else {
1644                         if (i <= 0)
1645                                 return 0;
1646
1647                         i--;
1648                 }
1649
1650         }
1651
1652         return generic_array_get_plus_one(f,
1653                                           le64toh(d->data.entry_offset),
1654                                           le64toh(d->data.entry_array_offset),
1655                                           i,
1656                                           ret, offset);
1657 }
1658
1659 int journal_file_move_to_entry_by_offset_for_data(
1660                 JournalFile *f,
1661                 uint64_t data_offset,
1662                 uint64_t p,
1663                 direction_t direction,
1664                 Object **ret, uint64_t *offset) {
1665
1666         int r;
1667         Object *d;
1668
1669         assert(f);
1670
1671         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1672         if (r < 0)
1673                 return r;
1674
1675         return generic_array_bisect_plus_one(f,
1676                                              le64toh(d->data.entry_offset),
1677                                              le64toh(d->data.entry_array_offset),
1678                                              le64toh(d->data.n_entries),
1679                                              p,
1680                                              test_object_offset,
1681                                              direction,
1682                                              ret, offset, NULL);
1683 }
1684
1685 int journal_file_move_to_entry_by_monotonic_for_data(
1686                 JournalFile *f,
1687                 uint64_t data_offset,
1688                 sd_id128_t boot_id,
1689                 uint64_t monotonic,
1690                 direction_t direction,
1691                 Object **ret, uint64_t *offset) {
1692
1693         char t[9+32+1] = "_BOOT_ID=";
1694         Object *o, *d;
1695         int r;
1696         uint64_t b, z;
1697
1698         assert(f);
1699
1700         /* First, seek by time */
1701         sd_id128_to_string(boot_id, t + 9);
1702         r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
1703         if (r < 0)
1704                 return r;
1705         if (r == 0)
1706                 return -ENOENT;
1707
1708         r = generic_array_bisect_plus_one(f,
1709                                           le64toh(o->data.entry_offset),
1710                                           le64toh(o->data.entry_array_offset),
1711                                           le64toh(o->data.n_entries),
1712                                           monotonic,
1713                                           test_object_monotonic,
1714                                           direction,
1715                                           NULL, &z, NULL);
1716         if (r <= 0)
1717                 return r;
1718
1719         /* And now, continue seeking until we find an entry that
1720          * exists in both bisection arrays */
1721
1722         for (;;) {
1723                 Object *qo;
1724                 uint64_t p, q;
1725
1726                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1727                 if (r < 0)
1728                         return r;
1729
1730                 r = generic_array_bisect_plus_one(f,
1731                                                   le64toh(d->data.entry_offset),
1732                                                   le64toh(d->data.entry_array_offset),
1733                                                   le64toh(d->data.n_entries),
1734                                                   z,
1735                                                   test_object_offset,
1736                                                   direction,
1737                                                   NULL, &p, NULL);
1738                 if (r <= 0)
1739                         return r;
1740
1741                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
1742                 if (r < 0)
1743                         return r;
1744
1745                 r = generic_array_bisect_plus_one(f,
1746                                                   le64toh(o->data.entry_offset),
1747                                                   le64toh(o->data.entry_array_offset),
1748                                                   le64toh(o->data.n_entries),
1749                                                   p,
1750                                                   test_object_offset,
1751                                                   direction,
1752                                                   &qo, &q, NULL);
1753
1754                 if (r <= 0)
1755                         return r;
1756
1757                 if (p == q) {
1758                         if (ret)
1759                                 *ret = qo;
1760                         if (offset)
1761                                 *offset = q;
1762
1763                         return 1;
1764                 }
1765
1766                 z = q;
1767         }
1768
1769         return 0;
1770 }
1771
1772 int journal_file_move_to_entry_by_seqnum_for_data(
1773                 JournalFile *f,
1774                 uint64_t data_offset,
1775                 uint64_t seqnum,
1776                 direction_t direction,
1777                 Object **ret, uint64_t *offset) {
1778
1779         Object *d;
1780         int r;
1781
1782         assert(f);
1783
1784         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1785         if (r < 0)
1786                 return r;
1787
1788         return generic_array_bisect_plus_one(f,
1789                                              le64toh(d->data.entry_offset),
1790                                              le64toh(d->data.entry_array_offset),
1791                                              le64toh(d->data.n_entries),
1792                                              seqnum,
1793                                              test_object_seqnum,
1794                                              direction,
1795                                              ret, offset, NULL);
1796 }
1797
1798 int journal_file_move_to_entry_by_realtime_for_data(
1799                 JournalFile *f,
1800                 uint64_t data_offset,
1801                 uint64_t realtime,
1802                 direction_t direction,
1803                 Object **ret, uint64_t *offset) {
1804
1805         Object *d;
1806         int r;
1807
1808         assert(f);
1809
1810         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1811         if (r < 0)
1812                 return r;
1813
1814         return generic_array_bisect_plus_one(f,
1815                                              le64toh(d->data.entry_offset),
1816                                              le64toh(d->data.entry_array_offset),
1817                                              le64toh(d->data.n_entries),
1818                                              realtime,
1819                                              test_object_realtime,
1820                                              direction,
1821                                              ret, offset, NULL);
1822 }
1823
1824 static void *fsprg_state(JournalFile *f) {
1825         uint64_t a, b;
1826         assert(f);
1827
1828         if (!f->authenticate)
1829                 return NULL;
1830
1831         a = le64toh(f->fsprg_header->header_size);
1832         b = le64toh(f->fsprg_header->state_size);
1833
1834         if (a + b > f->fsprg_size)
1835                 return NULL;
1836
1837         return (uint8_t*) f->fsprg_header + a;
1838 }
1839
1840 static uint64_t journal_file_tag_seqnum(JournalFile *f) {
1841         uint64_t r;
1842
1843         assert(f);
1844
1845         r = le64toh(f->header->n_tags) + 1;
1846         f->header->n_tags = htole64(r);
1847
1848         return r;
1849 }
1850
1851 int journal_file_append_tag(JournalFile *f) {
1852         Object *o;
1853         uint64_t p;
1854         int r;
1855
1856         assert(f);
1857
1858         if (!f->authenticate)
1859                 return 0;
1860
1861         if (!f->hmac_running)
1862                 return 0;
1863
1864         log_debug("Writing tag for epoch %llu\n", (unsigned long long) FSPRG_GetEpoch(fsprg_state(f)));
1865
1866         assert(f->hmac);
1867
1868         r = journal_file_append_object(f, OBJECT_TAG, sizeof(struct TagObject), &o, &p);
1869         if (r < 0)
1870                 return r;
1871
1872         o->tag.seqnum = htole64(journal_file_tag_seqnum(f));
1873
1874         /* Add the tag object itself, so that we can protect its
1875          * header. This will exclude the actual hash value in it */
1876         r = journal_file_hmac_put_object(f, OBJECT_TAG, p);
1877         if (r < 0)
1878                 return r;
1879
1880         /* Get the HMAC tag and store it in the object */
1881         memcpy(o->tag.tag, gcry_md_read(f->hmac, 0), TAG_LENGTH);
1882         f->hmac_running = false;
1883
1884         return 0;
1885 }
1886
1887 static int journal_file_hmac_start(JournalFile *f) {
1888         uint8_t key[256 / 8]; /* Let's pass 256 bit from FSPRG to HMAC */
1889
1890         assert(f);
1891
1892         if (!f->authenticate)
1893                 return 0;
1894
1895         if (f->hmac_running)
1896                 return 0;
1897
1898         /* Prepare HMAC for next cycle */
1899         gcry_md_reset(f->hmac);
1900         FSPRG_GetKey(fsprg_state(f), key, sizeof(key), 0);
1901         gcry_md_setkey(f->hmac, key, sizeof(key));
1902
1903         f->hmac_running = true;
1904
1905         return 0;
1906 }
1907
1908 static int journal_file_get_epoch(JournalFile *f, uint64_t realtime, uint64_t *epoch) {
1909         uint64_t t;
1910
1911         assert(f);
1912         assert(epoch);
1913         assert(f->authenticate);
1914
1915         if (le64toh(f->fsprg_header->fsprg_start_usec) == 0 ||
1916             le64toh(f->fsprg_header->fsprg_interval_usec) == 0)
1917                 return -ENOTSUP;
1918
1919         if (realtime < le64toh(f->fsprg_header->fsprg_start_usec))
1920                 return -ESTALE;
1921
1922         t = realtime - le64toh(f->fsprg_header->fsprg_start_usec);
1923         t = t / le64toh(f->fsprg_header->fsprg_interval_usec);
1924
1925         *epoch = t;
1926         return 0;
1927 }
1928
1929 static int journal_file_need_evolve(JournalFile *f, uint64_t realtime) {
1930         uint64_t goal, epoch;
1931         int r;
1932         assert(f);
1933
1934         if (!f->authenticate)
1935                 return 0;
1936
1937         r = journal_file_get_epoch(f, realtime, &goal);
1938         if (r < 0)
1939                 return r;
1940
1941         epoch = FSPRG_GetEpoch(fsprg_state(f));
1942         if (epoch > goal)
1943                 return -ESTALE;
1944
1945         return epoch != goal;
1946 }
1947
1948 static int journal_file_evolve(JournalFile *f, uint64_t realtime) {
1949         uint64_t goal, epoch;
1950         int r;
1951
1952         assert(f);
1953
1954         if (!f->authenticate)
1955                 return 0;
1956
1957         r = journal_file_get_epoch(f, realtime, &goal);
1958         if (r < 0)
1959                 return r;
1960
1961         epoch = FSPRG_GetEpoch(fsprg_state(f));
1962         if (epoch < goal)
1963                 log_debug("Evolving FSPRG key from epoch %llu to %llu.", (unsigned long long) epoch, (unsigned long long) goal);
1964
1965         for (;;) {
1966                 if (epoch > goal)
1967                         return -ESTALE;
1968                 if (epoch == goal)
1969                         return 0;
1970
1971                 FSPRG_Evolve(fsprg_state(f));
1972                 epoch = FSPRG_GetEpoch(fsprg_state(f));
1973         }
1974 }
1975
1976 static int journal_file_maybe_append_tag(JournalFile *f, uint64_t realtime) {
1977         int r;
1978
1979         assert(f);
1980
1981         if (!f->authenticate)
1982                 return 0;
1983
1984         r = journal_file_need_evolve(f, realtime);
1985         if (r <= 0)
1986                 return 0;
1987
1988         r = journal_file_append_tag(f);
1989         if (r < 0)
1990                 return r;
1991
1992         r = journal_file_evolve(f, realtime);
1993         if (r < 0)
1994                 return r;
1995
1996         r = journal_file_hmac_start(f);
1997         if (r < 0)
1998                 return r;
1999
2000         return 0;
2001 }
2002
2003 static int journal_file_hmac_put_object(JournalFile *f, int type, uint64_t p) {
2004         int r;
2005         Object *o;
2006
2007         assert(f);
2008
2009         if (!f->authenticate)
2010                 return 0;
2011
2012         r = journal_file_hmac_start(f);
2013         if (r < 0)
2014                 return r;
2015
2016         r = journal_file_move_to_object(f, type, p, &o);
2017         if (r < 0)
2018                 return r;
2019
2020         gcry_md_write(f->hmac, o, offsetof(ObjectHeader, payload));
2021
2022         switch (o->object.type) {
2023
2024         case OBJECT_DATA:
2025                 /* All but: hash and payload are mutable */
2026                 gcry_md_write(f->hmac, &o->data.hash, sizeof(o->data.hash));
2027                 gcry_md_write(f->hmac, o->data.payload, le64toh(o->object.size) - offsetof(DataObject, payload));
2028                 break;
2029
2030         case OBJECT_ENTRY:
2031                 /* All */
2032                 gcry_md_write(f->hmac, &o->entry.seqnum, le64toh(o->object.size) - offsetof(EntryObject, seqnum));
2033                 break;
2034
2035         case OBJECT_FIELD_HASH_TABLE:
2036         case OBJECT_DATA_HASH_TABLE:
2037         case OBJECT_ENTRY_ARRAY:
2038                 /* Nothing: everything is mutable */
2039                 break;
2040
2041         case OBJECT_TAG:
2042                 /* All but the tag itself */
2043                 gcry_md_write(f->hmac, &o->tag.seqnum, sizeof(o->tag.seqnum));
2044                 break;
2045         default:
2046                 return -EINVAL;
2047         }
2048
2049         return 0;
2050 }
2051
2052 static int journal_file_hmac_put_header(JournalFile *f) {
2053         int r;
2054
2055         assert(f);
2056
2057         if (!f->authenticate)
2058                 return 0;
2059
2060         r = journal_file_hmac_start(f);
2061         if (r < 0)
2062                 return r;
2063
2064         /* All but state+reserved, boot_id, arena_size,
2065          * tail_object_offset, n_objects, n_entries, tail_seqnum,
2066          * head_entry_realtime, tail_entry_realtime,
2067          * tail_entry_monotonic, n_data, n_fields, header_tag */
2068
2069         gcry_md_write(f->hmac, f->header->signature, offsetof(Header, state) - offsetof(Header, signature));
2070         gcry_md_write(f->hmac, &f->header->file_id, offsetof(Header, boot_id) - offsetof(Header, file_id));
2071         gcry_md_write(f->hmac, &f->header->seqnum_id, offsetof(Header, arena_size) - offsetof(Header, seqnum_id));
2072         gcry_md_write(f->hmac, &f->header->data_hash_table_offset, offsetof(Header, tail_object_offset) - offsetof(Header, data_hash_table_offset));
2073         gcry_md_write(f->hmac, &f->header->head_entry_seqnum, offsetof(Header, head_entry_realtime) - offsetof(Header, head_entry_seqnum));
2074
2075         return 0;
2076 }
2077
2078 static int journal_file_load_fsprg(JournalFile *f) {
2079         int r, fd = -1;
2080         char *p = NULL;
2081         struct stat st;
2082         FSPRGHeader *m = NULL;
2083         sd_id128_t machine;
2084
2085         assert(f);
2086
2087         if (!f->authenticate)
2088                 return 0;
2089
2090         r = sd_id128_get_machine(&machine);
2091         if (r < 0)
2092                 return r;
2093
2094         if (asprintf(&p, "/var/log/journal/" SD_ID128_FORMAT_STR "/fsprg",
2095                      SD_ID128_FORMAT_VAL(machine)) < 0)
2096                 return -ENOMEM;
2097
2098         fd = open(p, O_RDWR|O_CLOEXEC|O_NOCTTY, 0600);
2099         if (fd < 0) {
2100                 log_error("Failed to open %s: %m", p);
2101                 r = -errno;
2102                 goto finish;
2103         }
2104
2105         if (fstat(fd, &st) < 0) {
2106                 r = -errno;
2107                 goto finish;
2108         }
2109
2110         if (st.st_size < (off_t) sizeof(FSPRGHeader)) {
2111                 r = -ENODATA;
2112                 goto finish;
2113         }
2114
2115         m = mmap(NULL, PAGE_ALIGN(sizeof(FSPRGHeader)), PROT_READ, MAP_SHARED, fd, 0);
2116         if (m == MAP_FAILED) {
2117                 m = NULL;
2118                 r = -errno;
2119                 goto finish;
2120         }
2121
2122         if (memcmp(m->signature, FSPRG_HEADER_SIGNATURE, 8) != 0) {
2123                 r = -EBADMSG;
2124                 goto finish;
2125         }
2126
2127         if (m->incompatible_flags != 0) {
2128                 r = -EPROTONOSUPPORT;
2129                 goto finish;
2130         }
2131
2132         if (le64toh(m->header_size) < sizeof(FSPRGHeader)) {
2133                 r = -EBADMSG;
2134                 goto finish;
2135         }
2136
2137         if (le64toh(m->state_size) != FSPRG_stateinbytes(m->secpar)) {
2138                 r = -EBADMSG;
2139                 goto finish;
2140         }
2141
2142         f->fsprg_size = le64toh(m->header_size) + le64toh(m->state_size);
2143         if ((uint64_t) st.st_size < f->fsprg_size) {
2144                 r = -ENODATA;
2145                 goto finish;
2146         }
2147
2148         if (!sd_id128_equal(machine, m->machine_id)) {
2149                 r = -EHOSTDOWN;
2150                 goto finish;
2151         }
2152
2153         if (le64toh(m->fsprg_start_usec) <= 0 ||
2154             le64toh(m->fsprg_interval_usec) <= 0) {
2155                 r = -EBADMSG;
2156                 goto finish;
2157         }
2158
2159         f->fsprg_header = mmap(NULL, PAGE_ALIGN(f->fsprg_size), PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
2160         if (f->fsprg_header == MAP_FAILED) {
2161                 f->fsprg_header = NULL;
2162                 r = -errno;
2163                 goto finish;
2164         }
2165
2166         r = 0;
2167
2168 finish:
2169         if (m)
2170                 munmap(m, PAGE_ALIGN(sizeof(FSPRGHeader)));
2171
2172         if (fd >= 0)
2173                 close_nointr_nofail(fd);
2174
2175         free(p);
2176         return r;
2177 }
2178
2179 static int journal_file_setup_hmac(JournalFile *f) {
2180         gcry_error_t e;
2181
2182         if (!f->authenticate)
2183                 return 0;
2184
2185         e = gcry_md_open(&f->hmac, GCRY_MD_SHA256, GCRY_MD_FLAG_HMAC);
2186         if (e != 0)
2187                 return -ENOTSUP;
2188
2189         return 0;
2190 }
2191
2192 static int journal_file_append_first_tag(JournalFile *f) {
2193         int r;
2194         uint64_t p;
2195
2196         if (!f->authenticate)
2197                 return 0;
2198
2199         log_debug("Calculating first tag...");
2200
2201         r = journal_file_hmac_put_header(f);
2202         if (r < 0)
2203                 return r;
2204
2205         p = le64toh(f->header->field_hash_table_offset);
2206         if (p < offsetof(Object, hash_table.items))
2207                 return -EINVAL;
2208         p -= offsetof(Object, hash_table.items);
2209
2210         r = journal_file_hmac_put_object(f, OBJECT_FIELD_HASH_TABLE, p);
2211         if (r < 0)
2212                 return r;
2213
2214         p = le64toh(f->header->data_hash_table_offset);
2215         if (p < offsetof(Object, hash_table.items))
2216                 return -EINVAL;
2217         p -= offsetof(Object, hash_table.items);
2218
2219         r = journal_file_hmac_put_object(f, OBJECT_DATA_HASH_TABLE, p);
2220         if (r < 0)
2221                 return r;
2222
2223         r = journal_file_append_tag(f);
2224         if (r < 0)
2225                 return r;
2226
2227         return 0;
2228 }
2229
2230 static int journal_file_object_verify(JournalFile *f, Object *o) {
2231         assert(f);
2232         assert(o);
2233
2234         /* This does various superficial tests about the length an
2235          * possible field values. It does not follow any references to
2236          * other objects. */
2237
2238         switch (o->object.type) {
2239         case OBJECT_DATA:
2240                 if (le64toh(o->data.entry_offset) <= 0 ||
2241                     le64toh(o->data.n_entries) <= 0)
2242                         return -EBADMSG;
2243
2244                 if (le64toh(o->object.size) - offsetof(DataObject, payload) <= 0)
2245                         return -EBADMSG;
2246                 break;
2247
2248         case OBJECT_FIELD:
2249                 if (le64toh(o->object.size) - offsetof(FieldObject, payload) <= 0)
2250                         return -EBADMSG;
2251                 break;
2252
2253         case OBJECT_ENTRY:
2254                 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) % sizeof(EntryItem) != 0)
2255                         return -EBADMSG;
2256
2257                 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem) <= 0)
2258                         return -EBADMSG;
2259
2260                 if (le64toh(o->entry.seqnum) <= 0 ||
2261                     le64toh(o->entry.realtime) <= 0)
2262                         return -EBADMSG;
2263
2264                 break;
2265
2266         case OBJECT_DATA_HASH_TABLE:
2267         case OBJECT_FIELD_HASH_TABLE:
2268                 if ((le64toh(o->object.size) - offsetof(HashTableObject, items)) % sizeof(HashItem) != 0)
2269                         return -EBADMSG;
2270
2271                 break;
2272
2273         case OBJECT_ENTRY_ARRAY:
2274                 if ((le64toh(o->object.size) - offsetof(EntryArrayObject, items)) % sizeof(le64_t) != 0)
2275                         return -EBADMSG;
2276
2277                 break;
2278
2279         case OBJECT_TAG:
2280                 if (le64toh(o->object.size) != sizeof(TagObject))
2281                         return -EBADMSG;
2282                 break;
2283         }
2284
2285         return 0;
2286 }
2287
2288 static void draw_progress(uint64_t p, usec_t *last_usec) {
2289         unsigned n, i, j, k;
2290         usec_t z, x;
2291
2292         if (!isatty(STDOUT_FILENO))
2293                 return;
2294
2295         z = now(CLOCK_MONOTONIC);
2296         x = *last_usec;
2297
2298         if (x != 0 && x + 40 * USEC_PER_MSEC > z)
2299                 return;
2300
2301         *last_usec = z;
2302
2303         n = (3 * columns()) / 4;
2304         j = (n * (unsigned) p) / 65535ULL;
2305         k = n - j;
2306
2307         fputs("\r\x1B[?25l", stdout);
2308
2309         for (i = 0; i < j; i++)
2310                 fputs("\xe2\x96\x88", stdout);
2311
2312         for (i = 0; i < k; i++)
2313                 fputs("\xe2\x96\x91", stdout);
2314
2315         printf(" %3lu%%", 100LU * (unsigned long) p / 65535LU);
2316
2317         fputs("\r\x1B[?25h", stdout);
2318         fflush(stdout);
2319 }
2320
/* Erases the progress bar from the current terminal line, if stdout is
 * a terminal: overwrites 3/4 of the terminal width plus 5 further
 * columns with spaces and returns to the start of the line. */
static void flush_progress(void) {
        unsigned width, left;

        if (!isatty(STDOUT_FILENO))
                return;

        width = (3 * columns()) / 4;

        putchar('\r');

        /* Bar width plus five extra columns */
        for (left = width + 5; left > 0; left--)
                putchar(' ');

        putchar('\r');
        fflush(stdout);
}
2337
2338 int journal_file_verify(JournalFile *f, const char *key) {
2339         int r;
2340         Object *o;
2341         uint64_t p = 0, q = 0, e;
2342         uint64_t tag_seqnum = 0, entry_seqnum = 0, entry_monotonic = 0, entry_realtime = 0;
2343         sd_id128_t entry_boot_id;
2344         bool entry_seqnum_set = false, entry_monotonic_set = false, entry_realtime_set = false, found_main_entry_array = false;
2345         uint64_t n_weird = 0, n_objects = 0, n_entries = 0, n_data = 0, n_fields = 0, n_data_hash_tables = 0, n_field_hash_tables = 0;
2346         usec_t last_usec = 0;
2347
2348         assert(f);
2349
2350         /* First iteration: we go through all objects, verify the
2351          * superficial structure, headers, hashes. */
2352
2353         r = journal_file_hmac_put_header(f);
2354         if (r < 0) {
2355                 log_error("Failed to calculate HMAC of header.");
2356                 goto fail;
2357         }
2358
2359         p = le64toh(f->header->header_size);
2360         while (p != 0) {
2361                 draw_progress((65535ULL * p / le64toh(f->header->tail_object_offset)), &last_usec);
2362
2363                 r = journal_file_move_to_object(f, -1, p, &o);
2364                 if (r < 0) {
2365                         log_error("Invalid object at %llu", (unsigned long long) p);
2366                         goto fail;
2367                 }
2368
2369                 if (le64toh(f->header->tail_object_offset) < p) {
2370                         log_error("Invalid tail object pointer.");
2371                         r = -EBADMSG;
2372                         goto fail;
2373                 }
2374
2375                 n_objects ++;
2376
2377                 r = journal_file_object_verify(f, o);
2378                 if (r < 0) {
2379                         log_error("Invalid object contents at %llu", (unsigned long long) p);
2380                         goto fail;
2381                 }
2382
2383                 r = journal_file_hmac_put_object(f, -1, p);
2384                 if (r < 0) {
2385                         log_error("Failed to calculate HMAC at %llu", (unsigned long long) p);
2386                         goto fail;
2387                 }
2388
2389                 if (o->object.flags & OBJECT_COMPRESSED &&
2390                     !(le32toh(f->header->incompatible_flags) & HEADER_INCOMPATIBLE_COMPRESSED)) {
2391                         log_error("Compressed object without compression at %llu", (unsigned long long) p);
2392                         r = -EBADMSG;
2393                         goto fail;
2394                 }
2395
2396                 if (o->object.flags & OBJECT_COMPRESSED &&
2397                     o->object.type != OBJECT_DATA) {
2398                         log_error("Compressed non-data object at %llu", (unsigned long long) p);
2399                         r = -EBADMSG;
2400                         goto fail;
2401                 }
2402
2403                 if (o->object.type == OBJECT_TAG) {
2404
2405                         if (!(le32toh(f->header->compatible_flags) & HEADER_COMPATIBLE_AUTHENTICATED)) {
2406                                 log_error("Tag object without authentication at %llu", (unsigned long long) p);
2407                                 r = -EBADMSG;
2408                                 goto fail;
2409                         }
2410
2411                         if (le64toh(o->tag.seqnum) != tag_seqnum) {
2412                                 log_error("Tag sequence number out of synchronization at %llu", (unsigned long long) p);
2413                                 r = -EBADMSG;
2414                                 goto fail;
2415                         }
2416
2417                 } else if (o->object.type == OBJECT_ENTRY) {
2418
2419                         if (!entry_seqnum_set &&
2420                             le64toh(o->entry.seqnum) != le64toh(f->header->head_entry_seqnum)) {
2421                                 log_error("Head entry sequence number incorrect");
2422                                 r = -EBADMSG;
2423                                 goto fail;
2424                         }
2425
2426                         if (entry_seqnum_set &&
2427                             entry_seqnum >= le64toh(o->entry.seqnum)) {
2428                                 log_error("Entry sequence number out of synchronization at %llu", (unsigned long long) p);
2429                                 r = -EBADMSG;
2430                                 goto fail;
2431                         }
2432
2433                         entry_seqnum = le64toh(o->entry.seqnum);
2434                         entry_seqnum_set = true;
2435
2436                         if (entry_monotonic_set &&
2437                             sd_id128_equal(entry_boot_id, o->entry.boot_id) &&
2438                             entry_monotonic > le64toh(o->entry.monotonic)) {
2439                                 log_error("Entry timestamp out of synchronization at %llu", (unsigned long long) p);
2440                                 r = -EBADMSG;
2441                                 goto fail;
2442                         }
2443
2444                         entry_monotonic = le64toh(o->entry.monotonic);
2445                         entry_boot_id = o->entry.boot_id;
2446                         entry_monotonic_set = true;
2447
2448                         if (!entry_realtime_set &&
2449                             le64toh(o->entry.realtime) != le64toh(f->header->head_entry_realtime)) {
2450                                 log_error("Head entry realtime timestamp incorrect");
2451                                 r = -EBADMSG;
2452                                 goto fail;
2453                         }
2454
2455                         entry_realtime = le64toh(o->entry.realtime);
2456                         entry_realtime_set = true;
2457
2458                         n_entries ++;
2459                 } else if (o->object.type == OBJECT_ENTRY_ARRAY) {
2460
2461                         if (p == le64toh(f->header->entry_array_offset)) {
2462                                 if (found_main_entry_array) {
2463                                         log_error("More than one main entry array at %llu", (unsigned long long) p);
2464                                         r = -EBADMSG;
2465                                         goto fail;
2466                                 }
2467
2468                                 found_main_entry_array = true;
2469                         }
2470
2471                 } else if (o->object.type == OBJECT_DATA)
2472                         n_data++;
2473                 else if (o->object.type == OBJECT_FIELD)
2474                         n_fields++;
2475                 else if (o->object.type == OBJECT_DATA_HASH_TABLE) {
2476                         n_data_hash_tables++;
2477
2478                         if (n_data_hash_tables > 1) {
2479                                 log_error("More than one data hash table at %llu", (unsigned long long) p);
2480                                 r = -EBADMSG;
2481                                 goto fail;
2482                         }
2483
2484                         if (le64toh(f->header->data_hash_table_offset) != p + offsetof(HashTableObject, items) ||
2485                             le64toh(f->header->data_hash_table_size) != le64toh(o->object.size) - offsetof(HashTableObject, items)) {
2486                                 log_error("Header fields for data hash table invalid.");
2487                                 r = -EBADMSG;
2488                                 goto fail;
2489                         }
2490                 } else if (o->object.type == OBJECT_FIELD_HASH_TABLE) {
2491                         n_field_hash_tables++;
2492
2493                         if (n_field_hash_tables > 1) {
2494                                 log_error("More than one field hash table at %llu", (unsigned long long) p);
2495                                 r = -EBADMSG;
2496                                 goto fail;
2497                         }
2498
2499                         if (le64toh(f->header->field_hash_table_offset) != p + offsetof(HashTableObject, items) ||
2500                             le64toh(f->header->field_hash_table_size) != le64toh(o->object.size) - offsetof(HashTableObject, items)) {
2501                                 log_error("Header fields for field hash table invalid.");
2502                                 r = -EBADMSG;
2503                                 goto fail;
2504                         }
2505                 }
2506
2507                 if (o->object.type >= _OBJECT_TYPE_MAX)
2508                         n_weird ++;
2509                 else {
2510                         /* Write address to file... */
2511
2512                 }
2513
2514                 if (p == le64toh(f->header->tail_object_offset))
2515                         p = 0;
2516                 else
2517                         p = p + ALIGN64(le64toh(o->object.size));
2518         }
2519
2520         if (n_objects != le64toh(f->header->n_objects)) {
2521                 log_error("Object number mismatch");
2522                 r = -EBADMSG;
2523                 goto fail;
2524         }
2525
2526         if (n_entries != le64toh(f->header->n_entries)) {
2527                 log_error("Entry number mismatch");
2528                 r = -EBADMSG;
2529                 goto fail;
2530         }
2531
2532         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2533             n_data != le64toh(f->header->n_data)) {
2534                 log_error("Data number mismatch");
2535                 r = -EBADMSG;
2536                 goto fail;
2537         }
2538
2539         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2540             n_fields != le64toh(f->header->n_fields)) {
2541                 log_error("Field number mismatch");
2542                 r = -EBADMSG;
2543                 goto fail;
2544         }
2545
2546         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags) &&
2547             tag_seqnum != le64toh(f->header->n_tags)) {
2548                 log_error("Tag number mismatch");
2549                 r = -EBADMSG;
2550                 goto fail;
2551         }
2552
2553         if (n_data_hash_tables != 1) {
2554                 log_error("Missing data hash table");
2555                 r = -EBADMSG;
2556                 goto fail;
2557         }
2558
2559         if (n_field_hash_tables != 1) {
2560                 log_error("Missing field hash table");
2561                 r = -EBADMSG;
2562                 goto fail;
2563         }
2564
2565         if (!found_main_entry_array) {
2566                 log_error("Missing entry array");
2567                 r = -EBADMSG;
2568                 goto fail;
2569         }
2570
2571         if (entry_seqnum_set &&
2572             entry_seqnum != le64toh(f->header->tail_entry_seqnum)) {
2573                 log_error("Invalid tail seqnum");
2574                 r = -EBADMSG;
2575                 goto fail;
2576         }
2577
2578         if (entry_monotonic_set &&
2579             (!sd_id128_equal(entry_boot_id, f->header->boot_id) ||
2580              entry_monotonic != le64toh(f->header->tail_entry_monotonic))) {
2581                 log_error("Invalid tail monotonic timestamp");
2582                 r = -EBADMSG;
2583                 goto fail;
2584         }
2585
2586         if (entry_realtime_set && entry_realtime != le64toh(f->header->tail_entry_realtime)) {
2587                 log_error("Invalid tail realtime timestamp");
2588                 r = -EBADMSG;
2589                 goto fail;
2590         }
2591
2592         /* Second iteration: we go through all objects again, this
2593          * time verify all pointers. */
2594
2595         /* q = le64toh(f->header->header_size); */
2596         /* while (q != 0) { */
2597         /*         r = journal_file_move_to_object(f, -1, q, &o); */
2598         /*         if (r < 0) { */
2599         /*                 log_error("Invalid object at %llu", (unsigned long long) q); */
2600         /*                 goto fail; */
2601         /*         } */
2602
2603         /*         if (q == le64toh(f->header->tail_object_offset)) */
2604         /*                 q = 0; */
2605         /*         else */
2606         /*                 q = q + ALIGN64(le64toh(o->object.size)); */
2607         /* } */
2608
2609         flush_progress();
2610
2611         return 0;
2612
2613 fail:
2614         e = p <= 0 ? q :
2615         q <= 0 ? p :
2616         MIN(p, q);
2617
2618         flush_progress();
2619
2620         log_error("File corruption detected at %s:%llu (of %llu, %llu%%).",
2621                   f->path,
2622                   (unsigned long long) e,
2623                   (unsigned long long) f->last_stat.st_size,
2624                   (unsigned long long) (100 * e / f->last_stat.st_size));
2625
2626         return r;
2627 }
2628
2629 void journal_file_dump(JournalFile *f) {
2630         Object *o;
2631         int r;
2632         uint64_t p;
2633
2634         assert(f);
2635
2636         journal_file_print_header(f);
2637
2638         p = le64toh(f->header->header_size);
2639         while (p != 0) {
2640                 r = journal_file_move_to_object(f, -1, p, &o);
2641                 if (r < 0)
2642                         goto fail;
2643
2644                 switch (o->object.type) {
2645
2646                 case OBJECT_UNUSED:
2647                         printf("Type: OBJECT_UNUSED\n");
2648                         break;
2649
2650                 case OBJECT_DATA:
2651                         printf("Type: OBJECT_DATA\n");
2652                         break;
2653
2654                 case OBJECT_ENTRY:
2655                         printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
2656                                (unsigned long long) le64toh(o->entry.seqnum),
2657                                (unsigned long long) le64toh(o->entry.monotonic),
2658                                (unsigned long long) le64toh(o->entry.realtime));
2659                         break;
2660
2661                 case OBJECT_FIELD_HASH_TABLE:
2662                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2663                         break;
2664
2665                 case OBJECT_DATA_HASH_TABLE:
2666                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2667                         break;
2668
2669                 case OBJECT_ENTRY_ARRAY:
2670                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2671                         break;
2672
2673                 case OBJECT_TAG:
2674                         printf("Type: OBJECT_TAG %llu\n",
2675                                (unsigned long long) le64toh(o->tag.seqnum));
2676                         break;
2677                 }
2678
2679                 if (o->object.flags & OBJECT_COMPRESSED)
2680                         printf("Flags: COMPRESSED\n");
2681
2682                 if (p == le64toh(f->header->tail_object_offset))
2683                         p = 0;
2684                 else
2685                         p = p + ALIGN64(le64toh(o->object.size));
2686         }
2687
2688         return;
2689 fail:
2690         log_error("File corrupt");
2691 }
2692
2693 void journal_file_print_header(JournalFile *f) {
2694         char a[33], b[33], c[33];
2695         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
2696
2697         assert(f);
2698
2699         printf("File Path: %s\n"
2700                "File ID: %s\n"
2701                "Machine ID: %s\n"
2702                "Boot ID: %s\n"
2703                "Sequential Number ID: %s\n"
2704                "State: %s\n"
2705                "Compatible Flags:%s%s\n"
2706                "Incompatible Flags:%s%s\n"
2707                "Header size: %llu\n"
2708                "Arena size: %llu\n"
2709                "Data Hash Table Size: %llu\n"
2710                "Field Hash Table Size: %llu\n"
2711                "Objects: %llu\n"
2712                "Entry Objects: %llu\n"
2713                "Rotate Suggested: %s\n"
2714                "Head Sequential Number: %llu\n"
2715                "Tail Sequential Number: %llu\n"
2716                "Head Realtime Timestamp: %s\n"
2717                "Tail Realtime Timestamp: %s\n",
2718                f->path,
2719                sd_id128_to_string(f->header->file_id, a),
2720                sd_id128_to_string(f->header->machine_id, b),
2721                sd_id128_to_string(f->header->boot_id, c),
2722                sd_id128_to_string(f->header->seqnum_id, c),
2723                f->header->state == STATE_OFFLINE ? "offline" :
2724                f->header->state == STATE_ONLINE ? "online" :
2725                f->header->state == STATE_ARCHIVED ? "archived" : "unknown",
2726                (f->header->compatible_flags & HEADER_COMPATIBLE_AUTHENTICATED) ? " AUTHENTICATED" : "",
2727                (f->header->compatible_flags & ~HEADER_COMPATIBLE_AUTHENTICATED) ? " ???" : "",
2728                (f->header->incompatible_flags & HEADER_INCOMPATIBLE_COMPRESSED) ? " COMPRESSED" : "",
2729                (f->header->incompatible_flags & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2730                (unsigned long long) le64toh(f->header->header_size),
2731                (unsigned long long) le64toh(f->header->arena_size),
2732                (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2733                (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2734                (unsigned long long) le64toh(f->header->n_objects),
2735                (unsigned long long) le64toh(f->header->n_entries),
2736                yes_no(journal_file_rotate_suggested(f)),
2737                (unsigned long long) le64toh(f->header->head_entry_seqnum),
2738                (unsigned long long) le64toh(f->header->tail_entry_seqnum),
2739                format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2740                format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)));
2741
2742         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2743                 printf("Data Objects: %llu\n"
2744                        "Data Hash Table Fill: %.1f%%\n",
2745                        (unsigned long long) le64toh(f->header->n_data),
2746                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2747
2748         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2749                 printf("Field Objects: %llu\n"
2750                        "Field Hash Table Fill: %.1f%%\n",
2751                        (unsigned long long) le64toh(f->header->n_fields),
2752                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2753 }
2754
2755 int journal_file_open(
2756                 const char *fname,
2757                 int flags,
2758                 mode_t mode,
2759                 bool compress,
2760                 bool authenticate,
2761                 JournalMetrics *metrics,
2762                 MMapCache *mmap_cache,
2763                 JournalFile *template,
2764                 JournalFile **ret) {
2765
2766         JournalFile *f;
2767         int r;
2768         bool newly_created = false;
2769
2770         assert(fname);
2771
2772         if ((flags & O_ACCMODE) != O_RDONLY &&
2773             (flags & O_ACCMODE) != O_RDWR)
2774                 return -EINVAL;
2775
2776         if (!endswith(fname, ".journal"))
2777                 return -EINVAL;
2778
2779         f = new0(JournalFile, 1);
2780         if (!f)
2781                 return -ENOMEM;
2782
2783         f->fd = -1;
2784         f->mode = mode;
2785
2786         f->flags = flags;
2787         f->prot = prot_from_flags(flags);
2788         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2789         f->compress = compress;
2790         f->authenticate = authenticate;
2791
2792         if (mmap_cache)
2793                 f->mmap = mmap_cache_ref(mmap_cache);
2794         else {
2795                 /* One context for each type, plus the zeroth catchall
2796                  * context. One fd for the file plus one for each type
2797                  * (which we need during verification */
2798                 f->mmap = mmap_cache_new(_OBJECT_TYPE_MAX, 1 + _OBJECT_TYPE_MAX);
2799                 if (!f->mmap) {
2800                         r = -ENOMEM;
2801                         goto fail;
2802                 }
2803         }
2804
2805         f->path = strdup(fname);
2806         if (!f->path) {
2807                 r = -ENOMEM;
2808                 goto fail;
2809         }
2810
2811         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2812         if (f->fd < 0) {
2813                 r = -errno;
2814                 goto fail;
2815         }
2816
2817         if (fstat(f->fd, &f->last_stat) < 0) {
2818                 r = -errno;
2819                 goto fail;
2820         }
2821
2822         if (f->last_stat.st_size == 0 && f->writable) {
2823                 newly_created = true;
2824
2825                 /* Try to load the FSPRG state, and if we can't, then
2826                  * just don't do authentication */
2827                 r = journal_file_load_fsprg(f);
2828                 if (r < 0)
2829                         f->authenticate = false;
2830
2831                 r = journal_file_init_header(f, template);
2832                 if (r < 0)
2833                         goto fail;
2834
2835                 if (fstat(f->fd, &f->last_stat) < 0) {
2836                         r = -errno;
2837                         goto fail;
2838                 }
2839         }
2840
2841         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2842                 r = -EIO;
2843                 goto fail;
2844         }
2845
2846         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2847         if (f->header == MAP_FAILED) {
2848                 f->header = NULL;
2849                 r = -errno;
2850                 goto fail;
2851         }
2852
2853         if (!newly_created) {
2854                 r = journal_file_verify_header(f);
2855                 if (r < 0)
2856                         goto fail;
2857         }
2858
2859         if (!newly_created && f->writable) {
2860                 r = journal_file_load_fsprg(f);
2861                 if (r < 0)
2862                         goto fail;
2863         }
2864
2865         if (f->writable) {
2866                 if (metrics) {
2867                         journal_default_metrics(metrics, f->fd);
2868                         f->metrics = *metrics;
2869                 } else if (template)
2870                         f->metrics = template->metrics;
2871
2872                 r = journal_file_refresh_header(f);
2873                 if (r < 0)
2874                         goto fail;
2875
2876                 r = journal_file_setup_hmac(f);
2877                 if (r < 0)
2878                         goto fail;
2879         }
2880
2881         if (newly_created) {
2882                 r = journal_file_setup_field_hash_table(f);
2883                 if (r < 0)
2884                         goto fail;
2885
2886                 r = journal_file_setup_data_hash_table(f);
2887                 if (r < 0)
2888                         goto fail;
2889
2890                 r = journal_file_append_first_tag(f);
2891                 if (r < 0)
2892                         goto fail;
2893         }
2894
2895         r = journal_file_map_field_hash_table(f);
2896         if (r < 0)
2897                 goto fail;
2898
2899         r = journal_file_map_data_hash_table(f);
2900         if (r < 0)
2901                 goto fail;
2902
2903         if (ret)
2904                 *ret = f;
2905
2906         return 0;
2907
2908 fail:
2909         journal_file_close(f);
2910
2911         return r;
2912 }
2913
2914 int journal_file_rotate(JournalFile **f, bool compress, bool authenticate) {
2915         char *p;
2916         size_t l;
2917         JournalFile *old_file, *new_file = NULL;
2918         int r;
2919
2920         assert(f);
2921         assert(*f);
2922
2923         old_file = *f;
2924
2925         if (!old_file->writable)
2926                 return -EINVAL;
2927
2928         if (!endswith(old_file->path, ".journal"))
2929                 return -EINVAL;
2930
2931         l = strlen(old_file->path);
2932
2933         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
2934         if (!p)
2935                 return -ENOMEM;
2936
2937         memcpy(p, old_file->path, l - 8);
2938         p[l-8] = '@';
2939         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2940         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2941                  "-%016llx-%016llx.journal",
2942                  (unsigned long long) le64toh((*f)->header->tail_entry_seqnum),
2943                  (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
2944
2945         r = rename(old_file->path, p);
2946         free(p);
2947
2948         if (r < 0)
2949                 return -errno;
2950
2951         old_file->header->state = STATE_ARCHIVED;
2952
2953         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, authenticate, NULL, old_file->mmap, old_file, &new_file);
2954         journal_file_close(old_file);
2955
2956         *f = new_file;
2957         return r;
2958 }
2959
2960 int journal_file_open_reliably(
2961                 const char *fname,
2962                 int flags,
2963                 mode_t mode,
2964                 bool compress,
2965                 bool authenticate,
2966                 JournalMetrics *metrics,
2967                 MMapCache *mmap,
2968                 JournalFile *template,
2969                 JournalFile **ret) {
2970
2971         int r;
2972         size_t l;
2973         char *p;
2974
2975         r = journal_file_open(fname, flags, mode, compress, authenticate, metrics, mmap, template, ret);
2976         if (r != -EBADMSG && /* corrupted */
2977             r != -ENODATA && /* truncated */
2978             r != -EHOSTDOWN && /* other machine */
2979             r != -EPROTONOSUPPORT && /* incompatible feature */
2980             r != -EBUSY && /* unclean shutdown */
2981             r != -ESHUTDOWN /* already archived */)
2982                 return r;
2983
2984         if ((flags & O_ACCMODE) == O_RDONLY)
2985                 return r;
2986
2987         if (!(flags & O_CREAT))
2988                 return r;
2989
2990         if (!endswith(fname, ".journal"))
2991                 return r;
2992
2993         /* The file is corrupted. Rotate it away and try it again (but only once) */
2994
2995         l = strlen(fname);
2996         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2997                      (int) (l-8), fname,
2998                      (unsigned long long) now(CLOCK_REALTIME),
2999                      random_ull()) < 0)
3000                 return -ENOMEM;
3001
3002         r = rename(fname, p);
3003         free(p);
3004         if (r < 0)
3005                 return -errno;
3006
3007         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
3008
3009         return journal_file_open(fname, flags, mode, compress, authenticate, metrics, mmap, template, ret);
3010 }
3011
/* One candidate file collected by journal_directory_vacuum(). */
struct vacuum_info {
        /* Disk usage in bytes, computed as 512 * st_blocks */
        off_t usage;
        /* Heap-allocated copy of the directory entry name */
        char *filename;

        /* Values parsed from the file name */
        uint64_t realtime;
        sd_id128_t seqnum_id;
        uint64_t seqnum;

        /* True for archived ".journal" files, whose names carry a seqnum
         * ID and seqnum; false for ".journal~" files, which do not */
        bool have_seqnum;
};
3022
3023 static int vacuum_compare(const void *_a, const void *_b) {
3024         const struct vacuum_info *a, *b;
3025
3026         a = _a;
3027         b = _b;
3028
3029         if (a->have_seqnum && b->have_seqnum &&
3030             sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
3031                 if (a->seqnum < b->seqnum)
3032                         return -1;
3033                 else if (a->seqnum > b->seqnum)
3034                         return 1;
3035                 else
3036                         return 0;
3037         }
3038
3039         if (a->realtime < b->realtime)
3040                 return -1;
3041         else if (a->realtime > b->realtime)
3042                 return 1;
3043         else if (a->have_seqnum && b->have_seqnum)
3044                 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
3045         else
3046                 return strcmp(a->filename, b->filename);
3047 }
3048
3049 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
3050         DIR *d;
3051         int r = 0;
3052         struct vacuum_info *list = NULL;
3053         unsigned n_list = 0, n_allocated = 0, i;
3054         uint64_t sum = 0;
3055
3056         assert(directory);
3057
3058         if (max_use <= 0)
3059                 return 0;
3060
3061         d = opendir(directory);
3062         if (!d)
3063                 return -errno;
3064
3065         for (;;) {
3066                 int k;
3067                 struct dirent buf, *de;
3068                 size_t q;
3069                 struct stat st;
3070                 char *p;
3071                 unsigned long long seqnum = 0, realtime;
3072                 sd_id128_t seqnum_id;
3073                 bool have_seqnum;
3074
3075                 k = readdir_r(d, &buf, &de);
3076                 if (k != 0) {
3077                         r = -k;
3078                         goto finish;
3079                 }
3080
3081                 if (!de)
3082                         break;
3083
3084                 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
3085                         continue;
3086
3087                 if (!S_ISREG(st.st_mode))
3088                         continue;
3089
3090                 q = strlen(de->d_name);
3091
3092                 if (endswith(de->d_name, ".journal")) {
3093
3094                         /* Vacuum archived files */
3095
3096                         if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
3097                                 continue;
3098
3099                         if (de->d_name[q-8-16-1] != '-' ||
3100                             de->d_name[q-8-16-1-16-1] != '-' ||
3101                             de->d_name[q-8-16-1-16-1-32-1] != '@')
3102                                 continue;
3103
3104                         p = strdup(de->d_name);
3105                         if (!p) {
3106                                 r = -ENOMEM;
3107                                 goto finish;
3108                         }
3109
3110                         de->d_name[q-8-16-1-16-1] = 0;
3111                         if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
3112                                 free(p);
3113                                 continue;
3114                         }
3115
3116                         if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
3117                                 free(p);
3118                                 continue;
3119                         }
3120
3121                         have_seqnum = true;
3122
3123                 } else if (endswith(de->d_name, ".journal~")) {
3124                         unsigned long long tmp;
3125
3126                         /* Vacuum corrupted files */
3127
3128                         if (q < 1 + 16 + 1 + 16 + 8 + 1)
3129                                 continue;
3130
3131                         if (de->d_name[q-1-8-16-1] != '-' ||
3132                             de->d_name[q-1-8-16-1-16-1] != '@')
3133                                 continue;
3134
3135                         p = strdup(de->d_name);
3136                         if (!p) {
3137                                 r = -ENOMEM;
3138                                 goto finish;
3139                         }
3140
3141                         if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
3142                                 free(p);
3143                                 continue;
3144                         }
3145
3146                         have_seqnum = false;
3147                 } else
3148                         continue;
3149
3150                 if (n_list >= n_allocated) {
3151                         struct vacuum_info *j;
3152
3153                         n_allocated = MAX(n_allocated * 2U, 8U);
3154                         j = realloc(list, n_allocated * sizeof(struct vacuum_info));
3155                         if (!j) {
3156                                 free(p);
3157                                 r = -ENOMEM;
3158                                 goto finish;
3159                         }
3160
3161                         list = j;
3162                 }
3163
3164                 list[n_list].filename = p;
3165                 list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
3166                 list[n_list].seqnum = seqnum;
3167                 list[n_list].realtime = realtime;
3168                 list[n_list].seqnum_id = seqnum_id;
3169                 list[n_list].have_seqnum = have_seqnum;
3170
3171                 sum += list[n_list].usage;
3172
3173                 n_list ++;
3174         }
3175
3176         if (n_list > 0)
3177                 qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
3178
3179         for(i = 0; i < n_list; i++) {
3180                 struct statvfs ss;
3181
3182                 if (fstatvfs(dirfd(d), &ss) < 0) {
3183                         r = -errno;
3184                         goto finish;
3185                 }
3186
3187                 if (sum <= max_use &&
3188                     (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
3189                         break;
3190
3191                 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
3192                         log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
3193                         sum -= list[i].usage;
3194                 } else if (errno != ENOENT)
3195                         log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
3196         }
3197
3198 finish:
3199         for (i = 0; i < n_list; i++)
3200                 free(list[i].filename);
3201
3202         free(list);
3203
3204         if (d)
3205                 closedir(d);
3206
3207         return r;
3208 }
3209
3210 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
3211         uint64_t i, n;
3212         uint64_t q, xor_hash = 0;
3213         int r;
3214         EntryItem *items;
3215         dual_timestamp ts;
3216
3217         assert(from);
3218         assert(to);
3219         assert(o);
3220         assert(p);
3221
3222         if (!to->writable)
3223                 return -EPERM;
3224
3225         ts.monotonic = le64toh(o->entry.monotonic);
3226         ts.realtime = le64toh(o->entry.realtime);
3227
3228         if (to->tail_entry_monotonic_valid &&
3229             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
3230                 return -EINVAL;
3231
3232         n = journal_file_entry_n_items(o);
3233         items = alloca(sizeof(EntryItem) * n);
3234
3235         for (i = 0; i < n; i++) {
3236                 uint64_t l, h;
3237                 le64_t le_hash;
3238                 size_t t;
3239                 void *data;
3240                 Object *u;
3241
3242                 q = le64toh(o->entry.items[i].object_offset);
3243                 le_hash = o->entry.items[i].hash;
3244
3245                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3246                 if (r < 0)
3247                         return r;
3248
3249                 if (le_hash != o->data.hash)
3250                         return -EBADMSG;
3251
3252                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
3253                 t = (size_t) l;
3254
3255                 /* We hit the limit on 32bit machines */
3256                 if ((uint64_t) t != l)
3257                         return -E2BIG;
3258
3259                 if (o->object.flags & OBJECT_COMPRESSED) {
3260 #ifdef HAVE_XZ
3261                         uint64_t rsize;
3262
3263                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
3264                                 return -EBADMSG;
3265
3266                         data = from->compress_buffer;
3267                         l = rsize;
3268 #else
3269                         return -EPROTONOSUPPORT;
3270 #endif
3271                 } else
3272                         data = o->data.payload;
3273
3274                 r = journal_file_append_data(to, data, l, &u, &h);
3275                 if (r < 0)
3276                         return r;
3277
3278                 xor_hash ^= le64toh(u->data.hash);
3279                 items[i].object_offset = htole64(h);
3280                 items[i].hash = u->data.hash;
3281
3282                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3283                 if (r < 0)
3284                         return r;
3285         }
3286
3287         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
3288 }
3289
3290 void journal_default_metrics(JournalMetrics *m, int fd) {
3291         uint64_t fs_size = 0;
3292         struct statvfs ss;
3293         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
3294
3295         assert(m);
3296         assert(fd >= 0);
3297
3298         if (fstatvfs(fd, &ss) >= 0)
3299                 fs_size = ss.f_frsize * ss.f_blocks;
3300
3301         if (m->max_use == (uint64_t) -1) {
3302
3303                 if (fs_size > 0) {
3304                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3305
3306                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
3307                                 m->max_use = DEFAULT_MAX_USE_UPPER;
3308
3309                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
3310                                 m->max_use = DEFAULT_MAX_USE_LOWER;
3311                 } else
3312                         m->max_use = DEFAULT_MAX_USE_LOWER;
3313         } else {
3314                 m->max_use = PAGE_ALIGN(m->max_use);
3315
3316                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
3317                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3318         }
3319
3320         if (m->max_size == (uint64_t) -1) {
3321                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3322
3323                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3324                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
3325         } else
3326                 m->max_size = PAGE_ALIGN(m->max_size);
3327
3328         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3329                 m->max_size = JOURNAL_FILE_SIZE_MIN;
3330
3331         if (m->max_size*2 > m->max_use)
3332                 m->max_use = m->max_size*2;
3333
3334         if (m->min_size == (uint64_t) -1)
3335                 m->min_size = JOURNAL_FILE_SIZE_MIN;
3336         else {
3337                 m->min_size = PAGE_ALIGN(m->min_size);
3338
3339                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3340                         m->min_size = JOURNAL_FILE_SIZE_MIN;
3341
3342                 if (m->min_size > m->max_size)
3343                         m->max_size = m->min_size;
3344         }
3345
3346         if (m->keep_free == (uint64_t) -1) {
3347
3348                 if (fs_size > 0) {
3349                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
3350
3351                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3352                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3353
3354                 } else
3355                         m->keep_free = DEFAULT_KEEP_FREE;
3356         }
3357
3358         log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
3359                  format_bytes(a, sizeof(a), m->max_use),
3360                  format_bytes(b, sizeof(b), m->max_size),
3361                  format_bytes(c, sizeof(c), m->min_size),
3362                  format_bytes(d, sizeof(d), m->keep_free));
3363 }
3364
3365 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3366         assert(f);
3367         assert(from || to);
3368
3369         if (from) {
3370                 if (f->header->head_entry_realtime == 0)
3371                         return -ENOENT;
3372
3373                 *from = le64toh(f->header->head_entry_realtime);
3374         }
3375
3376         if (to) {
3377                 if (f->header->tail_entry_realtime == 0)
3378                         return -ENOENT;
3379
3380                 *to = le64toh(f->header->tail_entry_realtime);
3381         }
3382
3383         return 1;
3384 }
3385
3386 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3387         char t[9+32+1] = "_BOOT_ID=";
3388         Object *o;
3389         uint64_t p;
3390         int r;
3391
3392         assert(f);
3393         assert(from || to);
3394
3395         sd_id128_to_string(boot_id, t + 9);
3396
3397         r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
3398         if (r <= 0)
3399                 return r;
3400
3401         if (le64toh(o->data.n_entries) <= 0)
3402                 return 0;
3403
3404         if (from) {
3405                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3406                 if (r < 0)
3407                         return r;
3408
3409                 *from = le64toh(o->entry.monotonic);
3410         }
3411
3412         if (to) {
3413                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3414                 if (r < 0)
3415                         return r;
3416
3417                 r = generic_array_get_plus_one(f,
3418                                                le64toh(o->data.entry_offset),
3419                                                le64toh(o->data.entry_array_offset),
3420                                                le64toh(o->data.n_entries)-1,
3421                                                &o, NULL);
3422                 if (r <= 0)
3423                         return r;
3424
3425                 *to = le64toh(o->entry.monotonic);
3426         }
3427
3428         return 1;
3429 }
3430
3431 bool journal_file_rotate_suggested(JournalFile *f) {
3432         assert(f);
3433
3434         /* If we gained new header fields we gained new features,
3435          * hence suggest a rotation */
3436         if (le64toh(f->header->header_size) < sizeof(Header)) {
3437                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3438                 return true;
3439         }
3440
3441         /* Let's check if the hash tables grew over a certain fill
3442          * level (75%, borrowing this value from Java's hash table
3443          * implementation), and if so suggest a rotation. To calculate
3444          * the fill level we need the n_data field, which only exists
3445          * in newer versions. */
3446
3447         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3448                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3449                         log_debug("Data hash table of %s has a fill level at %.1f (%llu of %llu items, %llu file size, %llu bytes per hash table item), suggesting rotation.",
3450                                   f->path,
3451                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3452                                   (unsigned long long) le64toh(f->header->n_data),
3453                                   (unsigned long long) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)),
3454                                   (unsigned long long) (f->last_stat.st_size),
3455                                   (unsigned long long) (f->last_stat.st_size / le64toh(f->header->n_data)));
3456                         return true;
3457                 }
3458
3459         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3460                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3461                         log_debug("Field hash table of %s has a fill level at %.1f (%llu of %llu items), suggesting rotation.",
3462                                   f->path,
3463                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3464                                   (unsigned long long) le64toh(f->header->n_fields),
3465                                   (unsigned long long) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)));
3466                         return true;
3467                 }
3468
3469         return false;
3470 }