chiark / gitweb /
edf8e7dd5e3dcd0dc23d4e9bbfd697484734a74e
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #ifdef HAVE_XATTR
31 #include <attr/xattr.h>
32 #endif
33
34 #include "journal-def.h"
35 #include "journal-file.h"
36 #include "journal-authenticate.h"
37 #include "lookup3.h"
38 #include "compress.h"
39 #include "fsprg.h"
40
41 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
42 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
43
44 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
45
46 /* This is the minimum journal file size */
47 #define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL)                  /* 64 KiB */
48
49 /* These are the lower and upper bounds if we deduce the max_use value
50  * from the file system size */
51 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
52 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
53
54 /* This is the upper bound if we deduce max_size from max_use */
55 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
56
57 /* This is the upper bound if we deduce the keep_free value from the
58  * file system size */
59 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
60
61 /* This is the keep_free value when we can't determine the system
62  * size */
63 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MB */
64
65 /* n_data was the first entry we added after the initial file format design */
66 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
67
68 void journal_file_close(JournalFile *f) {
69         assert(f);
70
71 #ifdef HAVE_GCRYPT
72         /* Write the final tag */
73         if (f->seal && f->writable)
74                 journal_file_append_tag(f);
75 #endif
76
77         /* Sync everything to disk, before we mark the file offline */
78         if (f->mmap && f->fd >= 0)
79                 mmap_cache_close_fd(f->mmap, f->fd);
80
81         if (f->writable && f->fd >= 0)
82                 fdatasync(f->fd);
83
84         if (f->header) {
85                 /* Mark the file offline. Don't override the archived state if it already is set */
86                 if (f->writable && f->header->state == STATE_ONLINE)
87                         f->header->state = STATE_OFFLINE;
88
89                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
90         }
91
92         if (f->fd >= 0)
93                 close_nointr_nofail(f->fd);
94
95         free(f->path);
96
97         if (f->mmap)
98                 mmap_cache_unref(f->mmap);
99
100 #ifdef HAVE_XZ
101         free(f->compress_buffer);
102 #endif
103
104 #ifdef HAVE_GCRYPT
105         if (f->fss_file)
106                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
107         else if (f->fsprg_state)
108                 free(f->fsprg_state);
109
110         free(f->fsprg_seed);
111
112         if (f->hmac)
113                 gcry_md_close(f->hmac);
114 #endif
115
116         free(f);
117 }
118
119 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
120         Header h;
121         ssize_t k;
122         int r;
123
124         assert(f);
125
126         zero(h);
127         memcpy(h.signature, HEADER_SIGNATURE, 8);
128         h.header_size = htole64(ALIGN64(sizeof(h)));
129
130         h.incompatible_flags =
131                 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
132
133         h.compatible_flags =
134                 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
135
136         r = sd_id128_randomize(&h.file_id);
137         if (r < 0)
138                 return r;
139
140         if (template) {
141                 h.seqnum_id = template->header->seqnum_id;
142                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
143         } else
144                 h.seqnum_id = h.file_id;
145
146         k = pwrite(f->fd, &h, sizeof(h), 0);
147         if (k < 0)
148                 return -errno;
149
150         if (k != sizeof(h))
151                 return -EIO;
152
153         return 0;
154 }
155
156 static int journal_file_refresh_header(JournalFile *f) {
157         int r;
158         sd_id128_t boot_id;
159
160         assert(f);
161
162         r = sd_id128_get_machine(&f->header->machine_id);
163         if (r < 0)
164                 return r;
165
166         r = sd_id128_get_boot(&boot_id);
167         if (r < 0)
168                 return r;
169
170         if (sd_id128_equal(boot_id, f->header->boot_id))
171                 f->tail_entry_monotonic_valid = true;
172
173         f->header->boot_id = boot_id;
174
175         f->header->state = STATE_ONLINE;
176
177         /* Sync the online state to disk */
178         msync(f->header, PAGE_ALIGN(sizeof(Header)), MS_SYNC);
179         fdatasync(f->fd);
180
181         return 0;
182 }
183
184 static int journal_file_verify_header(JournalFile *f) {
185         assert(f);
186
187         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
188                 return -EBADMSG;
189
190         /* In both read and write mode we refuse to open files with
191          * incompatible flags we don't know */
192 #ifdef HAVE_XZ
193         if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
194                 return -EPROTONOSUPPORT;
195 #else
196         if (f->header->incompatible_flags != 0)
197                 return -EPROTONOSUPPORT;
198 #endif
199
200         /* When open for writing we refuse to open files with
201          * compatible flags, too */
202         if (f->writable) {
203 #ifdef HAVE_GCRYPT
204                 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
205                         return -EPROTONOSUPPORT;
206 #else
207                 if (f->header->compatible_flags != 0)
208                         return -EPROTONOSUPPORT;
209 #endif
210         }
211
212         if (f->header->state >= _STATE_MAX)
213                 return -EBADMSG;
214
215         /* The first addition was n_data, so check that we are at least this large */
216         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
217                 return -EBADMSG;
218
219         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
220                 return -EBADMSG;
221
222         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
223                 return -ENODATA;
224
225         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
226                 return -ENODATA;
227
228         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
229             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
230             !VALID64(le64toh(f->header->tail_object_offset)) ||
231             !VALID64(le64toh(f->header->entry_array_offset)))
232                 return -ENODATA;
233
234         if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
235             le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
236             le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
237             le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
238                 return -ENODATA;
239
240         if (f->writable) {
241                 uint8_t state;
242                 sd_id128_t machine_id;
243                 int r;
244
245                 r = sd_id128_get_machine(&machine_id);
246                 if (r < 0)
247                         return r;
248
249                 if (!sd_id128_equal(machine_id, f->header->machine_id))
250                         return -EHOSTDOWN;
251
252                 state = f->header->state;
253
254                 if (state == STATE_ONLINE) {
255                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
256                         return -EBUSY;
257                 } else if (state == STATE_ARCHIVED)
258                         return -ESHUTDOWN;
259                 else if (state != STATE_OFFLINE) {
260                         log_debug("Journal file %s has unknown state %u.", f->path, state);
261                         return -EBUSY;
262                 }
263         }
264
265         f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
266
267         f->seal = JOURNAL_HEADER_SEALED(f->header);
268
269         return 0;
270 }
271
272 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
273         uint64_t old_size, new_size;
274         int r;
275
276         assert(f);
277
278         /* We assume that this file is not sparse, and we know that
279          * for sure, since we always call posix_fallocate()
280          * ourselves */
281
282         old_size =
283                 le64toh(f->header->header_size) +
284                 le64toh(f->header->arena_size);
285
286         new_size = PAGE_ALIGN(offset + size);
287         if (new_size < le64toh(f->header->header_size))
288                 new_size = le64toh(f->header->header_size);
289
290         if (new_size <= old_size)
291                 return 0;
292
293         if (f->metrics.max_size > 0 &&
294             new_size > f->metrics.max_size)
295                 return -E2BIG;
296
297         if (new_size > f->metrics.min_size &&
298             f->metrics.keep_free > 0) {
299                 struct statvfs svfs;
300
301                 if (fstatvfs(f->fd, &svfs) >= 0) {
302                         uint64_t available;
303
304                         available = svfs.f_bfree * svfs.f_bsize;
305
306                         if (available >= f->metrics.keep_free)
307                                 available -= f->metrics.keep_free;
308                         else
309                                 available = 0;
310
311                         if (new_size - old_size > available)
312                                 return -E2BIG;
313                 }
314         }
315
316         /* Note that the glibc fallocate() fallback is very
317            inefficient, hence we try to minimize the allocation area
318            as we can. */
319         r = posix_fallocate(f->fd, old_size, new_size - old_size);
320         if (r != 0)
321                 return -r;
322
323         if (fstat(f->fd, &f->last_stat) < 0)
324                 return -errno;
325
326         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
327
328         return 0;
329 }
330
331 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
332         assert(f);
333         assert(ret);
334
335         if (size <= 0)
336                 return -EINVAL;
337
338         /* Avoid SIGBUS on invalid accesses */
339         if (offset + size > (uint64_t) f->last_stat.st_size) {
340                 /* Hmm, out of range? Let's refresh the fstat() data
341                  * first, before we trust that check. */
342
343                 if (fstat(f->fd, &f->last_stat) < 0 ||
344                     offset + size > (uint64_t) f->last_stat.st_size)
345                         return -EADDRNOTAVAIL;
346         }
347
348         return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
349 }
350
351 static uint64_t minimum_header_size(Object *o) {
352
353         static uint64_t table[] = {
354                 [OBJECT_DATA] = sizeof(DataObject),
355                 [OBJECT_FIELD] = sizeof(FieldObject),
356                 [OBJECT_ENTRY] = sizeof(EntryObject),
357                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
358                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
359                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
360                 [OBJECT_TAG] = sizeof(TagObject),
361         };
362
363         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
364                 return sizeof(ObjectHeader);
365
366         return table[o->object.type];
367 }
368
369 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
370         int r;
371         void *t;
372         Object *o;
373         uint64_t s;
374         unsigned context;
375
376         assert(f);
377         assert(ret);
378
379         /* Objects may only be located at multiple of 64 bit */
380         if (!VALID64(offset))
381                 return -EFAULT;
382
383         /* One context for each type, plus one catch-all for the rest */
384         context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
385
386         r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
387         if (r < 0)
388                 return r;
389
390         o = (Object*) t;
391         s = le64toh(o->object.size);
392
393         if (s < sizeof(ObjectHeader))
394                 return -EBADMSG;
395
396         if (o->object.type <= OBJECT_UNUSED)
397                 return -EBADMSG;
398
399         if (s < minimum_header_size(o))
400                 return -EBADMSG;
401
402         if (type > 0 && o->object.type != type)
403                 return -EBADMSG;
404
405         if (s > sizeof(ObjectHeader)) {
406                 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
407                 if (r < 0)
408                         return r;
409
410                 o = (Object*) t;
411         }
412
413         *ret = o;
414         return 0;
415 }
416
417 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
418         uint64_t r;
419
420         assert(f);
421
422         r = le64toh(f->header->tail_entry_seqnum) + 1;
423
424         if (seqnum) {
425                 /* If an external seqnum counter was passed, we update
426                  * both the local and the external one, and set it to
427                  * the maximum of both */
428
429                 if (*seqnum + 1 > r)
430                         r = *seqnum + 1;
431
432                 *seqnum = r;
433         }
434
435         f->header->tail_entry_seqnum = htole64(r);
436
437         if (f->header->head_entry_seqnum == 0)
438                 f->header->head_entry_seqnum = htole64(r);
439
440         return r;
441 }
442
443 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
444         int r;
445         uint64_t p;
446         Object *tail, *o;
447         void *t;
448
449         assert(f);
450         assert(type > 0 && type < _OBJECT_TYPE_MAX);
451         assert(size >= sizeof(ObjectHeader));
452         assert(offset);
453         assert(ret);
454
455         p = le64toh(f->header->tail_object_offset);
456         if (p == 0)
457                 p = le64toh(f->header->header_size);
458         else {
459                 r = journal_file_move_to_object(f, -1, p, &tail);
460                 if (r < 0)
461                         return r;
462
463                 p += ALIGN64(le64toh(tail->object.size));
464         }
465
466         r = journal_file_allocate(f, p, size);
467         if (r < 0)
468                 return r;
469
470         r = journal_file_move_to(f, type, false, p, size, &t);
471         if (r < 0)
472                 return r;
473
474         o = (Object*) t;
475
476         zero(o->object);
477         o->object.type = type;
478         o->object.size = htole64(size);
479
480         f->header->tail_object_offset = htole64(p);
481         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
482
483         *ret = o;
484         *offset = p;
485
486         return 0;
487 }
488
489 static int journal_file_setup_data_hash_table(JournalFile *f) {
490         uint64_t s, p;
491         Object *o;
492         int r;
493
494         assert(f);
495
496         /* We estimate that we need 1 hash table entry per 768 of
497            journal file and we want to make sure we never get beyond
498            75% fill level. Calculate the hash table size for the
499            maximum file size based on these metrics. */
500
501         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
502         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
503                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
504
505         log_debug("Reserving %llu entries in hash table.", (unsigned long long) (s / sizeof(HashItem)));
506
507         r = journal_file_append_object(f,
508                                        OBJECT_DATA_HASH_TABLE,
509                                        offsetof(Object, hash_table.items) + s,
510                                        &o, &p);
511         if (r < 0)
512                 return r;
513
514         memset(o->hash_table.items, 0, s);
515
516         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
517         f->header->data_hash_table_size = htole64(s);
518
519         return 0;
520 }
521
522 static int journal_file_setup_field_hash_table(JournalFile *f) {
523         uint64_t s, p;
524         Object *o;
525         int r;
526
527         assert(f);
528
529         /* We use a fixed size hash table for the fields as this
530          * number should grow very slowly only */
531
532         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
533         r = journal_file_append_object(f,
534                                        OBJECT_FIELD_HASH_TABLE,
535                                        offsetof(Object, hash_table.items) + s,
536                                        &o, &p);
537         if (r < 0)
538                 return r;
539
540         memset(o->hash_table.items, 0, s);
541
542         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
543         f->header->field_hash_table_size = htole64(s);
544
545         return 0;
546 }
547
548 static int journal_file_map_data_hash_table(JournalFile *f) {
549         uint64_t s, p;
550         void *t;
551         int r;
552
553         assert(f);
554
555         p = le64toh(f->header->data_hash_table_offset);
556         s = le64toh(f->header->data_hash_table_size);
557
558         r = journal_file_move_to(f,
559                                  OBJECT_DATA_HASH_TABLE,
560                                  true,
561                                  p, s,
562                                  &t);
563         if (r < 0)
564                 return r;
565
566         f->data_hash_table = t;
567         return 0;
568 }
569
570 static int journal_file_map_field_hash_table(JournalFile *f) {
571         uint64_t s, p;
572         void *t;
573         int r;
574
575         assert(f);
576
577         p = le64toh(f->header->field_hash_table_offset);
578         s = le64toh(f->header->field_hash_table_size);
579
580         r = journal_file_move_to(f,
581                                  OBJECT_FIELD_HASH_TABLE,
582                                  true,
583                                  p, s,
584                                  &t);
585         if (r < 0)
586                 return r;
587
588         f->field_hash_table = t;
589         return 0;
590 }
591
592 static int journal_file_link_field(
593                 JournalFile *f,
594                 Object *o,
595                 uint64_t offset,
596                 uint64_t hash) {
597
598         uint64_t p, h;
599         int r;
600
601         assert(f);
602         assert(o);
603         assert(offset > 0);
604
605         if (o->object.type != OBJECT_FIELD)
606                 return -EINVAL;
607
608         /* This might alter the window we are looking at */
609
610         o->field.next_hash_offset = o->field.head_data_offset = 0;
611
612         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
613         p = le64toh(f->field_hash_table[h].tail_hash_offset);
614         if (p == 0)
615                 f->field_hash_table[h].head_hash_offset = htole64(offset);
616         else {
617                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
618                 if (r < 0)
619                         return r;
620
621                 o->field.next_hash_offset = htole64(offset);
622         }
623
624         f->field_hash_table[h].tail_hash_offset = htole64(offset);
625
626         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
627                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
628
629         return 0;
630 }
631
632 static int journal_file_link_data(
633                 JournalFile *f,
634                 Object *o,
635                 uint64_t offset,
636                 uint64_t hash) {
637
638         uint64_t p, h;
639         int r;
640
641         assert(f);
642         assert(o);
643         assert(offset > 0);
644
645         if (o->object.type != OBJECT_DATA)
646                 return -EINVAL;
647
648         /* This might alter the window we are looking at */
649
650         o->data.next_hash_offset = o->data.next_field_offset = 0;
651         o->data.entry_offset = o->data.entry_array_offset = 0;
652         o->data.n_entries = 0;
653
654         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
655         p = le64toh(f->data_hash_table[h].tail_hash_offset);
656         if (p == 0)
657                 /* Only entry in the hash table is easy */
658                 f->data_hash_table[h].head_hash_offset = htole64(offset);
659         else {
660                 /* Move back to the previous data object, to patch in
661                  * pointer */
662
663                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
664                 if (r < 0)
665                         return r;
666
667                 o->data.next_hash_offset = htole64(offset);
668         }
669
670         f->data_hash_table[h].tail_hash_offset = htole64(offset);
671
672         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
673                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
674
675         return 0;
676 }
677
678 int journal_file_find_field_object_with_hash(
679                 JournalFile *f,
680                 const void *field, uint64_t size, uint64_t hash,
681                 Object **ret, uint64_t *offset) {
682
683         uint64_t p, osize, h;
684         int r;
685
686         assert(f);
687         assert(field && size > 0);
688
689         osize = offsetof(Object, field.payload) + size;
690
691         if (f->header->field_hash_table_size == 0)
692                 return -EBADMSG;
693
694         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
695         p = le64toh(f->field_hash_table[h].head_hash_offset);
696
697         while (p > 0) {
698                 Object *o;
699
700                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
701                 if (r < 0)
702                         return r;
703
704                 if (le64toh(o->field.hash) == hash &&
705                     le64toh(o->object.size) == osize &&
706                     memcmp(o->field.payload, field, size) == 0) {
707
708                         if (ret)
709                                 *ret = o;
710                         if (offset)
711                                 *offset = p;
712
713                         return 1;
714                 }
715
716                 p = le64toh(o->field.next_hash_offset);
717         }
718
719         return 0;
720 }
721
722 int journal_file_find_field_object(
723                 JournalFile *f,
724                 const void *field, uint64_t size,
725                 Object **ret, uint64_t *offset) {
726
727         uint64_t hash;
728
729         assert(f);
730         assert(field && size > 0);
731
732         hash = hash64(field, size);
733
734         return journal_file_find_field_object_with_hash(f,
735                                                         field, size, hash,
736                                                         ret, offset);
737 }
738
739 int journal_file_find_data_object_with_hash(
740                 JournalFile *f,
741                 const void *data, uint64_t size, uint64_t hash,
742                 Object **ret, uint64_t *offset) {
743
744         uint64_t p, osize, h;
745         int r;
746
747         assert(f);
748         assert(data || size == 0);
749
750         osize = offsetof(Object, data.payload) + size;
751
752         if (f->header->data_hash_table_size == 0)
753                 return -EBADMSG;
754
755         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
756         p = le64toh(f->data_hash_table[h].head_hash_offset);
757
758         while (p > 0) {
759                 Object *o;
760
761                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
762                 if (r < 0)
763                         return r;
764
765                 if (le64toh(o->data.hash) != hash)
766                         goto next;
767
768                 if (o->object.flags & OBJECT_COMPRESSED) {
769 #ifdef HAVE_XZ
770                         uint64_t l, rsize;
771
772                         l = le64toh(o->object.size);
773                         if (l <= offsetof(Object, data.payload))
774                                 return -EBADMSG;
775
776                         l -= offsetof(Object, data.payload);
777
778                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
779                                 return -EBADMSG;
780
781                         if (rsize == size &&
782                             memcmp(f->compress_buffer, data, size) == 0) {
783
784                                 if (ret)
785                                         *ret = o;
786
787                                 if (offset)
788                                         *offset = p;
789
790                                 return 1;
791                         }
792 #else
793                         return -EPROTONOSUPPORT;
794 #endif
795
796                 } else if (le64toh(o->object.size) == osize &&
797                            memcmp(o->data.payload, data, size) == 0) {
798
799                         if (ret)
800                                 *ret = o;
801
802                         if (offset)
803                                 *offset = p;
804
805                         return 1;
806                 }
807
808         next:
809                 p = le64toh(o->data.next_hash_offset);
810         }
811
812         return 0;
813 }
814
815 int journal_file_find_data_object(
816                 JournalFile *f,
817                 const void *data, uint64_t size,
818                 Object **ret, uint64_t *offset) {
819
820         uint64_t hash;
821
822         assert(f);
823         assert(data || size == 0);
824
825         hash = hash64(data, size);
826
827         return journal_file_find_data_object_with_hash(f,
828                                                        data, size, hash,
829                                                        ret, offset);
830 }
831
832 static int journal_file_append_field(
833                 JournalFile *f,
834                 const void *field, uint64_t size,
835                 Object **ret, uint64_t *offset) {
836
837         uint64_t hash, p;
838         uint64_t osize;
839         Object *o;
840         int r;
841
842         assert(f);
843         assert(field && size > 0);
844
845         hash = hash64(field, size);
846
847         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
848         if (r < 0)
849                 return r;
850         else if (r > 0) {
851
852                 if (ret)
853                         *ret = o;
854
855                 if (offset)
856                         *offset = p;
857
858                 return 0;
859         }
860
861         osize = offsetof(Object, field.payload) + size;
862         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
863
864         o->field.hash = htole64(hash);
865         memcpy(o->field.payload, field, size);
866
867         r = journal_file_link_field(f, o, p, hash);
868         if (r < 0)
869                 return r;
870
871         /* The linking might have altered the window, so let's
872          * refresh our pointer */
873         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
874         if (r < 0)
875                 return r;
876
877 #ifdef HAVE_GCRYPT
878         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
879         if (r < 0)
880                 return r;
881 #endif
882
883         if (ret)
884                 *ret = o;
885
886         if (offset)
887                 *offset = p;
888
889         return 0;
890 }
891
892 static int journal_file_append_data(
893                 JournalFile *f,
894                 const void *data, uint64_t size,
895                 Object **ret, uint64_t *offset) {
896
897         uint64_t hash, p;
898         uint64_t osize;
899         Object *o;
900         int r;
901         bool compressed = false;
902         const void *eq;
903
904         assert(f);
905         assert(data || size == 0);
906
907         hash = hash64(data, size);
908
909         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
910         if (r < 0)
911                 return r;
912         else if (r > 0) {
913
914                 if (ret)
915                         *ret = o;
916
917                 if (offset)
918                         *offset = p;
919
920                 return 0;
921         }
922
923         osize = offsetof(Object, data.payload) + size;
924         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
925         if (r < 0)
926                 return r;
927
928         o->data.hash = htole64(hash);
929
930 #ifdef HAVE_XZ
931         if (f->compress &&
932             size >= COMPRESSION_SIZE_THRESHOLD) {
933                 uint64_t rsize;
934
935                 compressed = compress_blob(data, size, o->data.payload, &rsize);
936
937                 if (compressed) {
938                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
939                         o->object.flags |= OBJECT_COMPRESSED;
940
941                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
942                 }
943         }
944 #endif
945
946         if (!compressed && size > 0)
947                 memcpy(o->data.payload, data, size);
948
949         r = journal_file_link_data(f, o, p, hash);
950         if (r < 0)
951                 return r;
952
953         /* The linking might have altered the window, so let's
954          * refresh our pointer */
955         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
956         if (r < 0)
957                 return r;
958
959         eq = memchr(data, '=', size);
960         if (eq && eq > data) {
961                 uint64_t fp;
962                 Object *fo;
963
964                 /* Create field object ... */
965                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
966                 if (r < 0)
967                         return r;
968
969                 /* ... and link it in. */
970                 o->data.next_field_offset = fo->field.head_data_offset;
971                 fo->field.head_data_offset = le64toh(p);
972         }
973
974 #ifdef HAVE_GCRYPT
975         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
976         if (r < 0)
977                 return r;
978 #endif
979
980         if (ret)
981                 *ret = o;
982
983         if (offset)
984                 *offset = p;
985
986         return 0;
987 }
988
989 uint64_t journal_file_entry_n_items(Object *o) {
990         assert(o);
991
992         if (o->object.type != OBJECT_ENTRY)
993                 return 0;
994
995         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
996 }
997
998 uint64_t journal_file_entry_array_n_items(Object *o) {
999         assert(o);
1000
1001         if (o->object.type != OBJECT_ENTRY_ARRAY)
1002                 return 0;
1003
1004         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1005 }
1006
1007 uint64_t journal_file_hash_table_n_items(Object *o) {
1008         assert(o);
1009
1010         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1011             o->object.type != OBJECT_FIELD_HASH_TABLE)
1012                 return 0;
1013
1014         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1015 }
1016
1017 static int link_entry_into_array(JournalFile *f,
1018                                  le64_t *first,
1019                                  le64_t *idx,
1020                                  uint64_t p) {
1021         int r;
1022         uint64_t n = 0, ap = 0, q, i, a, hidx;
1023         Object *o;
1024
1025         assert(f);
1026         assert(first);
1027         assert(idx);
1028         assert(p > 0);
1029
1030         a = le64toh(*first);
1031         i = hidx = le64toh(*idx);
1032         while (a > 0) {
1033
1034                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1035                 if (r < 0)
1036                         return r;
1037
1038                 n = journal_file_entry_array_n_items(o);
1039                 if (i < n) {
1040                         o->entry_array.items[i] = htole64(p);
1041                         *idx = htole64(hidx + 1);
1042                         return 0;
1043                 }
1044
1045                 i -= n;
1046                 ap = a;
1047                 a = le64toh(o->entry_array.next_entry_array_offset);
1048         }
1049
1050         if (hidx > n)
1051                 n = (hidx+1) * 2;
1052         else
1053                 n = n * 2;
1054
1055         if (n < 4)
1056                 n = 4;
1057
1058         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1059                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1060                                        &o, &q);
1061         if (r < 0)
1062                 return r;
1063
1064 #ifdef HAVE_GCRYPT
1065         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1066         if (r < 0)
1067                 return r;
1068 #endif
1069
1070         o->entry_array.items[i] = htole64(p);
1071
1072         if (ap == 0)
1073                 *first = htole64(q);
1074         else {
1075                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1076                 if (r < 0)
1077                         return r;
1078
1079                 o->entry_array.next_entry_array_offset = htole64(q);
1080         }
1081
1082         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1083                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1084
1085         *idx = htole64(hidx + 1);
1086
1087         return 0;
1088 }
1089
1090 static int link_entry_into_array_plus_one(JournalFile *f,
1091                                           le64_t *extra,
1092                                           le64_t *first,
1093                                           le64_t *idx,
1094                                           uint64_t p) {
1095
1096         int r;
1097
1098         assert(f);
1099         assert(extra);
1100         assert(first);
1101         assert(idx);
1102         assert(p > 0);
1103
1104         if (*idx == 0)
1105                 *extra = htole64(p);
1106         else {
1107                 le64_t i;
1108
1109                 i = htole64(le64toh(*idx) - 1);
1110                 r = link_entry_into_array(f, first, &i, p);
1111                 if (r < 0)
1112                         return r;
1113         }
1114
1115         *idx = htole64(le64toh(*idx) + 1);
1116         return 0;
1117 }
1118
1119 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1120         uint64_t p;
1121         int r;
1122         assert(f);
1123         assert(o);
1124         assert(offset > 0);
1125
1126         p = le64toh(o->entry.items[i].object_offset);
1127         if (p == 0)
1128                 return -EINVAL;
1129
1130         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1131         if (r < 0)
1132                 return r;
1133
1134         return link_entry_into_array_plus_one(f,
1135                                               &o->data.entry_offset,
1136                                               &o->data.entry_array_offset,
1137                                               &o->data.n_entries,
1138                                               offset);
1139 }
1140
1141 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1142         uint64_t n, i;
1143         int r;
1144
1145         assert(f);
1146         assert(o);
1147         assert(offset > 0);
1148
1149         if (o->object.type != OBJECT_ENTRY)
1150                 return -EINVAL;
1151
1152         __sync_synchronize();
1153
1154         /* Link up the entry itself */
1155         r = link_entry_into_array(f,
1156                                   &f->header->entry_array_offset,
1157                                   &f->header->n_entries,
1158                                   offset);
1159         if (r < 0)
1160                 return r;
1161
1162         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
1163
1164         if (f->header->head_entry_realtime == 0)
1165                 f->header->head_entry_realtime = o->entry.realtime;
1166
1167         f->header->tail_entry_realtime = o->entry.realtime;
1168         f->header->tail_entry_monotonic = o->entry.monotonic;
1169
1170         f->tail_entry_monotonic_valid = true;
1171
1172         /* Link up the items */
1173         n = journal_file_entry_n_items(o);
1174         for (i = 0; i < n; i++) {
1175                 r = journal_file_link_entry_item(f, o, offset, i);
1176                 if (r < 0)
1177                         return r;
1178         }
1179
1180         return 0;
1181 }
1182
1183 static int journal_file_append_entry_internal(
1184                 JournalFile *f,
1185                 const dual_timestamp *ts,
1186                 uint64_t xor_hash,
1187                 const EntryItem items[], unsigned n_items,
1188                 uint64_t *seqnum,
1189                 Object **ret, uint64_t *offset) {
1190         uint64_t np;
1191         uint64_t osize;
1192         Object *o;
1193         int r;
1194
1195         assert(f);
1196         assert(items || n_items == 0);
1197         assert(ts);
1198
1199         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1200
1201         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1202         if (r < 0)
1203                 return r;
1204
1205         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1206         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1207         o->entry.realtime = htole64(ts->realtime);
1208         o->entry.monotonic = htole64(ts->monotonic);
1209         o->entry.xor_hash = htole64(xor_hash);
1210         o->entry.boot_id = f->header->boot_id;
1211
1212 #ifdef HAVE_GCRYPT
1213         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1214         if (r < 0)
1215                 return r;
1216 #endif
1217
1218         r = journal_file_link_entry(f, o, np);
1219         if (r < 0)
1220                 return r;
1221
1222         if (ret)
1223                 *ret = o;
1224
1225         if (offset)
1226                 *offset = np;
1227
1228         return 0;
1229 }
1230
1231 void journal_file_post_change(JournalFile *f) {
1232         assert(f);
1233
1234         /* inotify() does not receive IN_MODIFY events from file
1235          * accesses done via mmap(). After each access we hence
1236          * trigger IN_MODIFY by truncating the journal file to its
1237          * current size which triggers IN_MODIFY. */
1238
1239         __sync_synchronize();
1240
1241         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1242                 log_error("Failed to truncate file to its own size: %m");
1243 }
1244
1245 static int entry_item_cmp(const void *_a, const void *_b) {
1246         const EntryItem *a = _a, *b = _b;
1247
1248         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1249                 return -1;
1250         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1251                 return 1;
1252         return 0;
1253 }
1254
1255 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1256         unsigned i;
1257         EntryItem *items;
1258         int r;
1259         uint64_t xor_hash = 0;
1260         struct dual_timestamp _ts;
1261
1262         assert(f);
1263         assert(iovec || n_iovec == 0);
1264
1265         if (!f->writable)
1266                 return -EPERM;
1267
1268         if (!ts) {
1269                 dual_timestamp_get(&_ts);
1270                 ts = &_ts;
1271         }
1272
1273         if (f->tail_entry_monotonic_valid &&
1274             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1275                 return -EINVAL;
1276
1277 #ifdef HAVE_GCRYPT
1278         r = journal_file_maybe_append_tag(f, ts->realtime);
1279         if (r < 0)
1280                 return r;
1281 #endif
1282
1283         /* alloca() can't take 0, hence let's allocate at least one */
1284         items = alloca(sizeof(EntryItem) * MAX(1, n_iovec));
1285
1286         for (i = 0; i < n_iovec; i++) {
1287                 uint64_t p;
1288                 Object *o;
1289
1290                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1291                 if (r < 0)
1292                         return r;
1293
1294                 xor_hash ^= le64toh(o->data.hash);
1295                 items[i].object_offset = htole64(p);
1296                 items[i].hash = o->data.hash;
1297         }
1298
1299         /* Order by the position on disk, in order to improve seek
1300          * times for rotating media. */
1301         qsort(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1302
1303         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1304
1305         journal_file_post_change(f);
1306
1307         return r;
1308 }
1309
1310 static int generic_array_get(JournalFile *f,
1311                              uint64_t first,
1312                              uint64_t i,
1313                              Object **ret, uint64_t *offset) {
1314
1315         Object *o;
1316         uint64_t p = 0, a;
1317         int r;
1318
1319         assert(f);
1320
1321         a = first;
1322         while (a > 0) {
1323                 uint64_t n;
1324
1325                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1326                 if (r < 0)
1327                         return r;
1328
1329                 n = journal_file_entry_array_n_items(o);
1330                 if (i < n) {
1331                         p = le64toh(o->entry_array.items[i]);
1332                         break;
1333                 }
1334
1335                 i -= n;
1336                 a = le64toh(o->entry_array.next_entry_array_offset);
1337         }
1338
1339         if (a <= 0 || p <= 0)
1340                 return 0;
1341
1342         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1343         if (r < 0)
1344                 return r;
1345
1346         if (ret)
1347                 *ret = o;
1348
1349         if (offset)
1350                 *offset = p;
1351
1352         return 1;
1353 }
1354
1355 static int generic_array_get_plus_one(JournalFile *f,
1356                                       uint64_t extra,
1357                                       uint64_t first,
1358                                       uint64_t i,
1359                                       Object **ret, uint64_t *offset) {
1360
1361         Object *o;
1362
1363         assert(f);
1364
1365         if (i == 0) {
1366                 int r;
1367
1368                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1369                 if (r < 0)
1370                         return r;
1371
1372                 if (ret)
1373                         *ret = o;
1374
1375                 if (offset)
1376                         *offset = extra;
1377
1378                 return 1;
1379         }
1380
1381         return generic_array_get(f, first, i-1, ret, offset);
1382 }
1383
/* Results of the test_object() callbacks used for bisection */
enum {
        TEST_FOUND = 0,  /* the object matches the needle */
        TEST_LEFT  = 1,  /* the object sorts before the needle */
        TEST_RIGHT = 2   /* the object sorts after the needle */
};
1389
1390 static int generic_array_bisect(JournalFile *f,
1391                                 uint64_t first,
1392                                 uint64_t n,
1393                                 uint64_t needle,
1394                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1395                                 direction_t direction,
1396                                 Object **ret,
1397                                 uint64_t *offset,
1398                                 uint64_t *idx) {
1399
1400         uint64_t a, p, t = 0, i = 0, last_p = 0;
1401         bool subtract_one = false;
1402         Object *o, *array = NULL;
1403         int r;
1404
1405         assert(f);
1406         assert(test_object);
1407
1408         a = first;
1409         while (a > 0) {
1410                 uint64_t left, right, k, lp;
1411
1412                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1413                 if (r < 0)
1414                         return r;
1415
1416                 k = journal_file_entry_array_n_items(array);
1417                 right = MIN(k, n);
1418                 if (right <= 0)
1419                         return 0;
1420
1421                 i = right - 1;
1422                 lp = p = le64toh(array->entry_array.items[i]);
1423                 if (p <= 0)
1424                         return -EBADMSG;
1425
1426                 r = test_object(f, p, needle);
1427                 if (r < 0)
1428                         return r;
1429
1430                 if (r == TEST_FOUND)
1431                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1432
1433                 if (r == TEST_RIGHT) {
1434                         left = 0;
1435                         right -= 1;
1436                         for (;;) {
1437                                 if (left == right) {
1438                                         if (direction == DIRECTION_UP)
1439                                                 subtract_one = true;
1440
1441                                         i = left;
1442                                         goto found;
1443                                 }
1444
1445                                 assert(left < right);
1446
1447                                 i = (left + right) / 2;
1448                                 p = le64toh(array->entry_array.items[i]);
1449                                 if (p <= 0)
1450                                         return -EBADMSG;
1451
1452                                 r = test_object(f, p, needle);
1453                                 if (r < 0)
1454                                         return r;
1455
1456                                 if (r == TEST_FOUND)
1457                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1458
1459                                 if (r == TEST_RIGHT)
1460                                         right = i;
1461                                 else
1462                                         left = i + 1;
1463                         }
1464                 }
1465
1466                 if (k > n) {
1467                         if (direction == DIRECTION_UP) {
1468                                 i = n;
1469                                 subtract_one = true;
1470                                 goto found;
1471                         }
1472
1473                         return 0;
1474                 }
1475
1476                 last_p = lp;
1477
1478                 n -= k;
1479                 t += k;
1480                 a = le64toh(array->entry_array.next_entry_array_offset);
1481         }
1482
1483         return 0;
1484
1485 found:
1486         if (subtract_one && t == 0 && i == 0)
1487                 return 0;
1488
1489         if (subtract_one && i == 0)
1490                 p = last_p;
1491         else if (subtract_one)
1492                 p = le64toh(array->entry_array.items[i-1]);
1493         else
1494                 p = le64toh(array->entry_array.items[i]);
1495
1496         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1497         if (r < 0)
1498                 return r;
1499
1500         if (ret)
1501                 *ret = o;
1502
1503         if (offset)
1504                 *offset = p;
1505
1506         if (idx)
1507                 *idx = t + i + (subtract_one ? -1 : 0);
1508
1509         return 1;
1510 }
1511
1512 static int generic_array_bisect_plus_one(JournalFile *f,
1513                                          uint64_t extra,
1514                                          uint64_t first,
1515                                          uint64_t n,
1516                                          uint64_t needle,
1517                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1518                                          direction_t direction,
1519                                          Object **ret,
1520                                          uint64_t *offset,
1521                                          uint64_t *idx) {
1522
1523         int r;
1524         bool step_back = false;
1525         Object *o;
1526
1527         assert(f);
1528         assert(test_object);
1529
1530         if (n <= 0)
1531                 return 0;
1532
1533         /* This bisects the array in object 'first', but first checks
1534          * an extra  */
1535         r = test_object(f, extra, needle);
1536         if (r < 0)
1537                 return r;
1538
1539         if (r == TEST_FOUND)
1540                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1541
1542         /* if we are looking with DIRECTION_UP then we need to first
1543            see if in the actual array there is a matching entry, and
1544            return the last one of that. But if there isn't any we need
1545            to return this one. Hence remember this, and return it
1546            below. */
1547         if (r == TEST_LEFT)
1548                 step_back = direction == DIRECTION_UP;
1549
1550         if (r == TEST_RIGHT) {
1551                 if (direction == DIRECTION_DOWN)
1552                         goto found;
1553                 else
1554                         return 0;
1555         }
1556
1557         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1558
1559         if (r == 0 && step_back)
1560                 goto found;
1561
1562         if (r > 0 && idx)
1563                 (*idx) ++;
1564
1565         return r;
1566
1567 found:
1568         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1569         if (r < 0)
1570                 return r;
1571
1572         if (ret)
1573                 *ret = o;
1574
1575         if (offset)
1576                 *offset = extra;
1577
1578         if (idx)
1579                 *idx = 0;
1580
1581         return 1;
1582 }
1583
1584 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1585         assert(f);
1586         assert(p > 0);
1587
1588         if (p == needle)
1589                 return TEST_FOUND;
1590         else if (p < needle)
1591                 return TEST_LEFT;
1592         else
1593                 return TEST_RIGHT;
1594 }
1595
1596 int journal_file_move_to_entry_by_offset(
1597                 JournalFile *f,
1598                 uint64_t p,
1599                 direction_t direction,
1600                 Object **ret,
1601                 uint64_t *offset) {
1602
1603         return generic_array_bisect(f,
1604                                     le64toh(f->header->entry_array_offset),
1605                                     le64toh(f->header->n_entries),
1606                                     p,
1607                                     test_object_offset,
1608                                     direction,
1609                                     ret, offset, NULL);
1610 }
1611
1612
1613 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1614         Object *o;
1615         int r;
1616
1617         assert(f);
1618         assert(p > 0);
1619
1620         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1621         if (r < 0)
1622                 return r;
1623
1624         if (le64toh(o->entry.seqnum) == needle)
1625                 return TEST_FOUND;
1626         else if (le64toh(o->entry.seqnum) < needle)
1627                 return TEST_LEFT;
1628         else
1629                 return TEST_RIGHT;
1630 }
1631
1632 int journal_file_move_to_entry_by_seqnum(
1633                 JournalFile *f,
1634                 uint64_t seqnum,
1635                 direction_t direction,
1636                 Object **ret,
1637                 uint64_t *offset) {
1638
1639         return generic_array_bisect(f,
1640                                     le64toh(f->header->entry_array_offset),
1641                                     le64toh(f->header->n_entries),
1642                                     seqnum,
1643                                     test_object_seqnum,
1644                                     direction,
1645                                     ret, offset, NULL);
1646 }
1647
1648 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1649         Object *o;
1650         int r;
1651
1652         assert(f);
1653         assert(p > 0);
1654
1655         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1656         if (r < 0)
1657                 return r;
1658
1659         if (le64toh(o->entry.realtime) == needle)
1660                 return TEST_FOUND;
1661         else if (le64toh(o->entry.realtime) < needle)
1662                 return TEST_LEFT;
1663         else
1664                 return TEST_RIGHT;
1665 }
1666
1667 int journal_file_move_to_entry_by_realtime(
1668                 JournalFile *f,
1669                 uint64_t realtime,
1670                 direction_t direction,
1671                 Object **ret,
1672                 uint64_t *offset) {
1673
1674         return generic_array_bisect(f,
1675                                     le64toh(f->header->entry_array_offset),
1676                                     le64toh(f->header->n_entries),
1677                                     realtime,
1678                                     test_object_realtime,
1679                                     direction,
1680                                     ret, offset, NULL);
1681 }
1682
1683 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1684         Object *o;
1685         int r;
1686
1687         assert(f);
1688         assert(p > 0);
1689
1690         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1691         if (r < 0)
1692                 return r;
1693
1694         if (le64toh(o->entry.monotonic) == needle)
1695                 return TEST_FOUND;
1696         else if (le64toh(o->entry.monotonic) < needle)
1697                 return TEST_LEFT;
1698         else
1699                 return TEST_RIGHT;
1700 }
1701
1702 int journal_file_move_to_entry_by_monotonic(
1703                 JournalFile *f,
1704                 sd_id128_t boot_id,
1705                 uint64_t monotonic,
1706                 direction_t direction,
1707                 Object **ret,
1708                 uint64_t *offset) {
1709
1710         char t[9+32+1] = "_BOOT_ID=";
1711         Object *o;
1712         int r;
1713
1714         assert(f);
1715
1716         sd_id128_to_string(boot_id, t + 9);
1717         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1718         if (r < 0)
1719                 return r;
1720         if (r == 0)
1721                 return -ENOENT;
1722
1723         return generic_array_bisect_plus_one(f,
1724                                              le64toh(o->data.entry_offset),
1725                                              le64toh(o->data.entry_array_offset),
1726                                              le64toh(o->data.n_entries),
1727                                              monotonic,
1728                                              test_object_monotonic,
1729                                              direction,
1730                                              ret, offset, NULL);
1731 }
1732
1733 int journal_file_next_entry(
1734                 JournalFile *f,
1735                 Object *o, uint64_t p,
1736                 direction_t direction,
1737                 Object **ret, uint64_t *offset) {
1738
1739         uint64_t i, n;
1740         int r;
1741
1742         assert(f);
1743         assert(p > 0 || !o);
1744
1745         n = le64toh(f->header->n_entries);
1746         if (n <= 0)
1747                 return 0;
1748
1749         if (!o)
1750                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1751         else {
1752                 if (o->object.type != OBJECT_ENTRY)
1753                         return -EINVAL;
1754
1755                 r = generic_array_bisect(f,
1756                                          le64toh(f->header->entry_array_offset),
1757                                          le64toh(f->header->n_entries),
1758                                          p,
1759                                          test_object_offset,
1760                                          DIRECTION_DOWN,
1761                                          NULL, NULL,
1762                                          &i);
1763                 if (r <= 0)
1764                         return r;
1765
1766                 if (direction == DIRECTION_DOWN) {
1767                         if (i >= n - 1)
1768                                 return 0;
1769
1770                         i++;
1771                 } else {
1772                         if (i <= 0)
1773                                 return 0;
1774
1775                         i--;
1776                 }
1777         }
1778
1779         /* And jump to it */
1780         return generic_array_get(f,
1781                                  le64toh(f->header->entry_array_offset),
1782                                  i,
1783                                  ret, offset);
1784 }
1785
1786 int journal_file_skip_entry(
1787                 JournalFile *f,
1788                 Object *o, uint64_t p,
1789                 int64_t skip,
1790                 Object **ret, uint64_t *offset) {
1791
1792         uint64_t i, n;
1793         int r;
1794
1795         assert(f);
1796         assert(o);
1797         assert(p > 0);
1798
1799         if (o->object.type != OBJECT_ENTRY)
1800                 return -EINVAL;
1801
1802         r = generic_array_bisect(f,
1803                                  le64toh(f->header->entry_array_offset),
1804                                  le64toh(f->header->n_entries),
1805                                  p,
1806                                  test_object_offset,
1807                                  DIRECTION_DOWN,
1808                                  NULL, NULL,
1809                                  &i);
1810         if (r <= 0)
1811                 return r;
1812
1813         /* Calculate new index */
1814         if (skip < 0) {
1815                 if ((uint64_t) -skip >= i)
1816                         i = 0;
1817                 else
1818                         i = i - (uint64_t) -skip;
1819         } else
1820                 i  += (uint64_t) skip;
1821
1822         n = le64toh(f->header->n_entries);
1823         if (n <= 0)
1824                 return -EBADMSG;
1825
1826         if (i >= n)
1827                 i = n-1;
1828
1829         return generic_array_get(f,
1830                                  le64toh(f->header->entry_array_offset),
1831                                  i,
1832                                  ret, offset);
1833 }
1834
1835 int journal_file_next_entry_for_data(
1836                 JournalFile *f,
1837                 Object *o, uint64_t p,
1838                 uint64_t data_offset,
1839                 direction_t direction,
1840                 Object **ret, uint64_t *offset) {
1841
1842         uint64_t n, i;
1843         int r;
1844         Object *d;
1845
1846         assert(f);
1847         assert(p > 0 || !o);
1848
1849         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1850         if (r < 0)
1851                 return r;
1852
1853         n = le64toh(d->data.n_entries);
1854         if (n <= 0)
1855                 return n;
1856
1857         if (!o)
1858                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1859         else {
1860                 if (o->object.type != OBJECT_ENTRY)
1861                         return -EINVAL;
1862
1863                 r = generic_array_bisect_plus_one(f,
1864                                                   le64toh(d->data.entry_offset),
1865                                                   le64toh(d->data.entry_array_offset),
1866                                                   le64toh(d->data.n_entries),
1867                                                   p,
1868                                                   test_object_offset,
1869                                                   DIRECTION_DOWN,
1870                                                   NULL, NULL,
1871                                                   &i);
1872
1873                 if (r <= 0)
1874                         return r;
1875
1876                 if (direction == DIRECTION_DOWN) {
1877                         if (i >= n - 1)
1878                                 return 0;
1879
1880                         i++;
1881                 } else {
1882                         if (i <= 0)
1883                                 return 0;
1884
1885                         i--;
1886                 }
1887
1888         }
1889
1890         return generic_array_get_plus_one(f,
1891                                           le64toh(d->data.entry_offset),
1892                                           le64toh(d->data.entry_array_offset),
1893                                           i,
1894                                           ret, offset);
1895 }
1896
1897 int journal_file_move_to_entry_by_offset_for_data(
1898                 JournalFile *f,
1899                 uint64_t data_offset,
1900                 uint64_t p,
1901                 direction_t direction,
1902                 Object **ret, uint64_t *offset) {
1903
1904         int r;
1905         Object *d;
1906
1907         assert(f);
1908
1909         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1910         if (r < 0)
1911                 return r;
1912
1913         return generic_array_bisect_plus_one(f,
1914                                              le64toh(d->data.entry_offset),
1915                                              le64toh(d->data.entry_array_offset),
1916                                              le64toh(d->data.n_entries),
1917                                              p,
1918                                              test_object_offset,
1919                                              direction,
1920                                              ret, offset, NULL);
1921 }
1922
1923 int journal_file_move_to_entry_by_monotonic_for_data(
1924                 JournalFile *f,
1925                 uint64_t data_offset,
1926                 sd_id128_t boot_id,
1927                 uint64_t monotonic,
1928                 direction_t direction,
1929                 Object **ret, uint64_t *offset) {
1930
1931         char t[9+32+1] = "_BOOT_ID=";
1932         Object *o, *d;
1933         int r;
1934         uint64_t b, z;
1935
1936         assert(f);
1937
1938         /* First, seek by time */
1939         sd_id128_to_string(boot_id, t + 9);
1940         r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
1941         if (r < 0)
1942                 return r;
1943         if (r == 0)
1944                 return -ENOENT;
1945
1946         r = generic_array_bisect_plus_one(f,
1947                                           le64toh(o->data.entry_offset),
1948                                           le64toh(o->data.entry_array_offset),
1949                                           le64toh(o->data.n_entries),
1950                                           monotonic,
1951                                           test_object_monotonic,
1952                                           direction,
1953                                           NULL, &z, NULL);
1954         if (r <= 0)
1955                 return r;
1956
1957         /* And now, continue seeking until we find an entry that
1958          * exists in both bisection arrays */
1959
1960         for (;;) {
1961                 Object *qo;
1962                 uint64_t p, q;
1963
1964                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1965                 if (r < 0)
1966                         return r;
1967
1968                 r = generic_array_bisect_plus_one(f,
1969                                                   le64toh(d->data.entry_offset),
1970                                                   le64toh(d->data.entry_array_offset),
1971                                                   le64toh(d->data.n_entries),
1972                                                   z,
1973                                                   test_object_offset,
1974                                                   direction,
1975                                                   NULL, &p, NULL);
1976                 if (r <= 0)
1977                         return r;
1978
1979                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
1980                 if (r < 0)
1981                         return r;
1982
1983                 r = generic_array_bisect_plus_one(f,
1984                                                   le64toh(o->data.entry_offset),
1985                                                   le64toh(o->data.entry_array_offset),
1986                                                   le64toh(o->data.n_entries),
1987                                                   p,
1988                                                   test_object_offset,
1989                                                   direction,
1990                                                   &qo, &q, NULL);
1991
1992                 if (r <= 0)
1993                         return r;
1994
1995                 if (p == q) {
1996                         if (ret)
1997                                 *ret = qo;
1998                         if (offset)
1999                                 *offset = q;
2000
2001                         return 1;
2002                 }
2003
2004                 z = q;
2005         }
2006
2007         return 0;
2008 }
2009
2010 int journal_file_move_to_entry_by_seqnum_for_data(
2011                 JournalFile *f,
2012                 uint64_t data_offset,
2013                 uint64_t seqnum,
2014                 direction_t direction,
2015                 Object **ret, uint64_t *offset) {
2016
2017         Object *d;
2018         int r;
2019
2020         assert(f);
2021
2022         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2023         if (r < 0)
2024                 return r;
2025
2026         return generic_array_bisect_plus_one(f,
2027                                              le64toh(d->data.entry_offset),
2028                                              le64toh(d->data.entry_array_offset),
2029                                              le64toh(d->data.n_entries),
2030                                              seqnum,
2031                                              test_object_seqnum,
2032                                              direction,
2033                                              ret, offset, NULL);
2034 }
2035
2036 int journal_file_move_to_entry_by_realtime_for_data(
2037                 JournalFile *f,
2038                 uint64_t data_offset,
2039                 uint64_t realtime,
2040                 direction_t direction,
2041                 Object **ret, uint64_t *offset) {
2042
2043         Object *d;
2044         int r;
2045
2046         assert(f);
2047
2048         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2049         if (r < 0)
2050                 return r;
2051
2052         return generic_array_bisect_plus_one(f,
2053                                              le64toh(d->data.entry_offset),
2054                                              le64toh(d->data.entry_array_offset),
2055                                              le64toh(d->data.n_entries),
2056                                              realtime,
2057                                              test_object_realtime,
2058                                              direction,
2059                                              ret, offset, NULL);
2060 }
2061
2062 void journal_file_dump(JournalFile *f) {
2063         Object *o;
2064         int r;
2065         uint64_t p;
2066
2067         assert(f);
2068
2069         journal_file_print_header(f);
2070
2071         p = le64toh(f->header->header_size);
2072         while (p != 0) {
2073                 r = journal_file_move_to_object(f, -1, p, &o);
2074                 if (r < 0)
2075                         goto fail;
2076
2077                 switch (o->object.type) {
2078
2079                 case OBJECT_UNUSED:
2080                         printf("Type: OBJECT_UNUSED\n");
2081                         break;
2082
2083                 case OBJECT_DATA:
2084                         printf("Type: OBJECT_DATA\n");
2085                         break;
2086
2087                 case OBJECT_FIELD:
2088                         printf("Type: OBJECT_FIELD\n");
2089                         break;
2090
2091                 case OBJECT_ENTRY:
2092                         printf("Type: OBJECT_ENTRY seqnum=%llu monotonic=%llu realtime=%llu\n",
2093                                (unsigned long long) le64toh(o->entry.seqnum),
2094                                (unsigned long long) le64toh(o->entry.monotonic),
2095                                (unsigned long long) le64toh(o->entry.realtime));
2096                         break;
2097
2098                 case OBJECT_FIELD_HASH_TABLE:
2099                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2100                         break;
2101
2102                 case OBJECT_DATA_HASH_TABLE:
2103                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2104                         break;
2105
2106                 case OBJECT_ENTRY_ARRAY:
2107                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2108                         break;
2109
2110                 case OBJECT_TAG:
2111                         printf("Type: OBJECT_TAG seqnum=%llu epoch=%llu\n",
2112                                (unsigned long long) le64toh(o->tag.seqnum),
2113                                (unsigned long long) le64toh(o->tag.epoch));
2114                         break;
2115
2116                 default:
2117                         printf("Type: unknown (%u)\n", o->object.type);
2118                         break;
2119                 }
2120
2121                 if (o->object.flags & OBJECT_COMPRESSED)
2122                         printf("Flags: COMPRESSED\n");
2123
2124                 if (p == le64toh(f->header->tail_object_offset))
2125                         p = 0;
2126                 else
2127                         p = p + ALIGN64(le64toh(o->object.size));
2128         }
2129
2130         return;
2131 fail:
2132         log_error("File corrupt");
2133 }
2134
2135 void journal_file_print_header(JournalFile *f) {
2136         char a[33], b[33], c[33];
2137         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
2138         struct stat st;
2139         char bytes[FORMAT_BYTES_MAX];
2140
2141         assert(f);
2142
2143         printf("File Path: %s\n"
2144                "File ID: %s\n"
2145                "Machine ID: %s\n"
2146                "Boot ID: %s\n"
2147                "Sequential Number ID: %s\n"
2148                "State: %s\n"
2149                "Compatible Flags:%s%s\n"
2150                "Incompatible Flags:%s%s\n"
2151                "Header size: %llu\n"
2152                "Arena size: %llu\n"
2153                "Data Hash Table Size: %llu\n"
2154                "Field Hash Table Size: %llu\n"
2155                "Rotate Suggested: %s\n"
2156                "Head Sequential Number: %llu\n"
2157                "Tail Sequential Number: %llu\n"
2158                "Head Realtime Timestamp: %s\n"
2159                "Tail Realtime Timestamp: %s\n"
2160                "Objects: %llu\n"
2161                "Entry Objects: %llu\n",
2162                f->path,
2163                sd_id128_to_string(f->header->file_id, a),
2164                sd_id128_to_string(f->header->machine_id, b),
2165                sd_id128_to_string(f->header->boot_id, c),
2166                sd_id128_to_string(f->header->seqnum_id, c),
2167                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2168                f->header->state == STATE_ONLINE ? "ONLINE" :
2169                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2170                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2171                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2172                JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2173                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2174                (unsigned long long) le64toh(f->header->header_size),
2175                (unsigned long long) le64toh(f->header->arena_size),
2176                (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2177                (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2178                yes_no(journal_file_rotate_suggested(f, 0)),
2179                (unsigned long long) le64toh(f->header->head_entry_seqnum),
2180                (unsigned long long) le64toh(f->header->tail_entry_seqnum),
2181                format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2182                format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2183                (unsigned long long) le64toh(f->header->n_objects),
2184                (unsigned long long) le64toh(f->header->n_entries));
2185
2186         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2187                 printf("Data Objects: %llu\n"
2188                        "Data Hash Table Fill: %.1f%%\n",
2189                        (unsigned long long) le64toh(f->header->n_data),
2190                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2191
2192         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2193                 printf("Field Objects: %llu\n"
2194                        "Field Hash Table Fill: %.1f%%\n",
2195                        (unsigned long long) le64toh(f->header->n_fields),
2196                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2197
2198         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2199                 printf("Tag Objects: %llu\n",
2200                        (unsigned long long) le64toh(f->header->n_tags));
2201         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2202                 printf("Entry Array Objects: %llu\n",
2203                        (unsigned long long) le64toh(f->header->n_entry_arrays));
2204
2205         if (fstat(f->fd, &st) >= 0)
2206                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2207 }
2208
2209 int journal_file_open(
2210                 const char *fname,
2211                 int flags,
2212                 mode_t mode,
2213                 bool compress,
2214                 bool seal,
2215                 JournalMetrics *metrics,
2216                 MMapCache *mmap_cache,
2217                 JournalFile *template,
2218                 JournalFile **ret) {
2219
2220         JournalFile *f;
2221         int r;
2222         bool newly_created = false;
2223
2224         assert(fname);
2225         assert(ret);
2226
2227         if ((flags & O_ACCMODE) != O_RDONLY &&
2228             (flags & O_ACCMODE) != O_RDWR)
2229                 return -EINVAL;
2230
2231         if (!endswith(fname, ".journal") &&
2232             !endswith(fname, ".journal~"))
2233                 return -EINVAL;
2234
2235         f = new0(JournalFile, 1);
2236         if (!f)
2237                 return -ENOMEM;
2238
2239         f->fd = -1;
2240         f->mode = mode;
2241
2242         f->flags = flags;
2243         f->prot = prot_from_flags(flags);
2244         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2245 #ifdef HAVE_XZ
2246         f->compress = compress;
2247 #endif
2248 #ifdef HAVE_GCRYPT
2249         f->seal = seal;
2250 #endif
2251
2252         if (mmap_cache)
2253                 f->mmap = mmap_cache_ref(mmap_cache);
2254         else {
2255                 f->mmap = mmap_cache_new();
2256                 if (!f->mmap) {
2257                         r = -ENOMEM;
2258                         goto fail;
2259                 }
2260         }
2261
2262         f->path = strdup(fname);
2263         if (!f->path) {
2264                 r = -ENOMEM;
2265                 goto fail;
2266         }
2267
2268         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2269         if (f->fd < 0) {
2270                 r = -errno;
2271                 goto fail;
2272         }
2273
2274         if (fstat(f->fd, &f->last_stat) < 0) {
2275                 r = -errno;
2276                 goto fail;
2277         }
2278
2279         if (f->last_stat.st_size == 0 && f->writable) {
2280 #ifdef HAVE_XATTR
2281                 uint64_t crtime;
2282
2283                 /* Let's attach the creation time to the journal file,
2284                  * so that the vacuuming code knows the age of this
2285                  * file even if the file might end up corrupted one
2286                  * day... Ideally we'd just use the creation time many
2287                  * file systems maintain for each file, but there is
2288                  * currently no usable API to query this, hence let's
2289                  * emulate this via extended attributes. If extended
2290                  * attributes are not supported we'll just skip this,
2291                  * and rely solely on mtime/atime/ctime of the file.*/
2292
2293                 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2294                 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2295 #endif
2296
2297 #ifdef HAVE_GCRYPT
2298                 /* Try to load the FSPRG state, and if we can't, then
2299                  * just don't do sealing */
2300                 if (f->seal) {
2301                         r = journal_file_fss_load(f);
2302                         if (r < 0)
2303                                 f->seal = false;
2304                 }
2305 #endif
2306
2307                 r = journal_file_init_header(f, template);
2308                 if (r < 0)
2309                         goto fail;
2310
2311                 if (fstat(f->fd, &f->last_stat) < 0) {
2312                         r = -errno;
2313                         goto fail;
2314                 }
2315
2316                 newly_created = true;
2317         }
2318
2319         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2320                 r = -EIO;
2321                 goto fail;
2322         }
2323
2324         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2325         if (f->header == MAP_FAILED) {
2326                 f->header = NULL;
2327                 r = -errno;
2328                 goto fail;
2329         }
2330
2331         if (!newly_created) {
2332                 r = journal_file_verify_header(f);
2333                 if (r < 0)
2334                         goto fail;
2335         }
2336
2337 #ifdef HAVE_GCRYPT
2338         if (!newly_created && f->writable) {
2339                 r = journal_file_fss_load(f);
2340                 if (r < 0)
2341                         goto fail;
2342         }
2343 #endif
2344
2345         if (f->writable) {
2346                 if (metrics) {
2347                         journal_default_metrics(metrics, f->fd);
2348                         f->metrics = *metrics;
2349                 } else if (template)
2350                         f->metrics = template->metrics;
2351
2352                 r = journal_file_refresh_header(f);
2353                 if (r < 0)
2354                         goto fail;
2355         }
2356
2357 #ifdef HAVE_GCRYPT
2358         r = journal_file_hmac_setup(f);
2359         if (r < 0)
2360                 goto fail;
2361 #endif
2362
2363         if (newly_created) {
2364                 r = journal_file_setup_field_hash_table(f);
2365                 if (r < 0)
2366                         goto fail;
2367
2368                 r = journal_file_setup_data_hash_table(f);
2369                 if (r < 0)
2370                         goto fail;
2371
2372 #ifdef HAVE_GCRYPT
2373                 r = journal_file_append_first_tag(f);
2374                 if (r < 0)
2375                         goto fail;
2376 #endif
2377         }
2378
2379         r = journal_file_map_field_hash_table(f);
2380         if (r < 0)
2381                 goto fail;
2382
2383         r = journal_file_map_data_hash_table(f);
2384         if (r < 0)
2385                 goto fail;
2386
2387         *ret = f;
2388         return 0;
2389
2390 fail:
2391         journal_file_close(f);
2392
2393         return r;
2394 }
2395
2396 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2397         char *p;
2398         size_t l;
2399         JournalFile *old_file, *new_file = NULL;
2400         int r;
2401
2402         assert(f);
2403         assert(*f);
2404
2405         old_file = *f;
2406
2407         if (!old_file->writable)
2408                 return -EINVAL;
2409
2410         if (!endswith(old_file->path, ".journal"))
2411                 return -EINVAL;
2412
2413         l = strlen(old_file->path);
2414
2415         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
2416         if (!p)
2417                 return -ENOMEM;
2418
2419         memcpy(p, old_file->path, l - 8);
2420         p[l-8] = '@';
2421         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2422         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2423                  "-%016llx-%016llx.journal",
2424                  (unsigned long long) le64toh((*f)->header->head_entry_seqnum),
2425                  (unsigned long long) le64toh((*f)->header->head_entry_realtime));
2426
2427         r = rename(old_file->path, p);
2428         free(p);
2429
2430         if (r < 0)
2431                 return -errno;
2432
2433         old_file->header->state = STATE_ARCHIVED;
2434
2435         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2436         journal_file_close(old_file);
2437
2438         *f = new_file;
2439         return r;
2440 }
2441
2442 int journal_file_open_reliably(
2443                 const char *fname,
2444                 int flags,
2445                 mode_t mode,
2446                 bool compress,
2447                 bool seal,
2448                 JournalMetrics *metrics,
2449                 MMapCache *mmap_cache,
2450                 JournalFile *template,
2451                 JournalFile **ret) {
2452
2453         int r;
2454         size_t l;
2455         char *p;
2456
2457         r = journal_file_open(fname, flags, mode, compress, seal,
2458                               metrics, mmap_cache, template, ret);
2459         if (r != -EBADMSG && /* corrupted */
2460             r != -ENODATA && /* truncated */
2461             r != -EHOSTDOWN && /* other machine */
2462             r != -EPROTONOSUPPORT && /* incompatible feature */
2463             r != -EBUSY && /* unclean shutdown */
2464             r != -ESHUTDOWN /* already archived */)
2465                 return r;
2466
2467         if ((flags & O_ACCMODE) == O_RDONLY)
2468                 return r;
2469
2470         if (!(flags & O_CREAT))
2471                 return r;
2472
2473         if (!endswith(fname, ".journal"))
2474                 return r;
2475
2476         /* The file is corrupted. Rotate it away and try it again (but only once) */
2477
2478         l = strlen(fname);
2479         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2480                      (int) (l-8), fname,
2481                      (unsigned long long) now(CLOCK_REALTIME),
2482                      random_ull()) < 0)
2483                 return -ENOMEM;
2484
2485         r = rename(fname, p);
2486         free(p);
2487         if (r < 0)
2488                 return -errno;
2489
2490         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2491
2492         return journal_file_open(fname, flags, mode, compress, seal,
2493                                  metrics, mmap_cache, template, ret);
2494 }
2495
2496
2497 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2498         uint64_t i, n;
2499         uint64_t q, xor_hash = 0;
2500         int r;
2501         EntryItem *items;
2502         dual_timestamp ts;
2503
2504         assert(from);
2505         assert(to);
2506         assert(o);
2507         assert(p);
2508
2509         if (!to->writable)
2510                 return -EPERM;
2511
2512         ts.monotonic = le64toh(o->entry.monotonic);
2513         ts.realtime = le64toh(o->entry.realtime);
2514
2515         if (to->tail_entry_monotonic_valid &&
2516             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2517                 return -EINVAL;
2518
2519         n = journal_file_entry_n_items(o);
2520         items = alloca(sizeof(EntryItem) * n);
2521
2522         for (i = 0; i < n; i++) {
2523                 uint64_t l, h;
2524                 le64_t le_hash;
2525                 size_t t;
2526                 void *data;
2527                 Object *u;
2528
2529                 q = le64toh(o->entry.items[i].object_offset);
2530                 le_hash = o->entry.items[i].hash;
2531
2532                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2533                 if (r < 0)
2534                         return r;
2535
2536                 if (le_hash != o->data.hash)
2537                         return -EBADMSG;
2538
2539                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2540                 t = (size_t) l;
2541
2542                 /* We hit the limit on 32bit machines */
2543                 if ((uint64_t) t != l)
2544                         return -E2BIG;
2545
2546                 if (o->object.flags & OBJECT_COMPRESSED) {
2547 #ifdef HAVE_XZ
2548                         uint64_t rsize;
2549
2550                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2551                                 return -EBADMSG;
2552
2553                         data = from->compress_buffer;
2554                         l = rsize;
2555 #else
2556                         return -EPROTONOSUPPORT;
2557 #endif
2558                 } else
2559                         data = o->data.payload;
2560
2561                 r = journal_file_append_data(to, data, l, &u, &h);
2562                 if (r < 0)
2563                         return r;
2564
2565                 xor_hash ^= le64toh(u->data.hash);
2566                 items[i].object_offset = htole64(h);
2567                 items[i].hash = u->data.hash;
2568
2569                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2570                 if (r < 0)
2571                         return r;
2572         }
2573
2574         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2575 }
2576
2577 void journal_default_metrics(JournalMetrics *m, int fd) {
2578         uint64_t fs_size = 0;
2579         struct statvfs ss;
2580         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2581
2582         assert(m);
2583         assert(fd >= 0);
2584
2585         if (fstatvfs(fd, &ss) >= 0)
2586                 fs_size = ss.f_frsize * ss.f_blocks;
2587
2588         if (m->max_use == (uint64_t) -1) {
2589
2590                 if (fs_size > 0) {
2591                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2592
2593                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2594                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2595
2596                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2597                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2598                 } else
2599                         m->max_use = DEFAULT_MAX_USE_LOWER;
2600         } else {
2601                 m->max_use = PAGE_ALIGN(m->max_use);
2602
2603                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2604                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2605         }
2606
2607         if (m->max_size == (uint64_t) -1) {
2608                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2609
2610                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2611                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2612         } else
2613                 m->max_size = PAGE_ALIGN(m->max_size);
2614
2615         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2616                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2617
2618         if (m->max_size*2 > m->max_use)
2619                 m->max_use = m->max_size*2;
2620
2621         if (m->min_size == (uint64_t) -1)
2622                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2623         else {
2624                 m->min_size = PAGE_ALIGN(m->min_size);
2625
2626                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2627                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2628
2629                 if (m->min_size > m->max_size)
2630                         m->max_size = m->min_size;
2631         }
2632
2633         if (m->keep_free == (uint64_t) -1) {
2634
2635                 if (fs_size > 0) {
2636                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2637
2638                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2639                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2640
2641                 } else
2642                         m->keep_free = DEFAULT_KEEP_FREE;
2643         }
2644
2645         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2646                   format_bytes(a, sizeof(a), m->max_use),
2647                   format_bytes(b, sizeof(b), m->max_size),
2648                   format_bytes(c, sizeof(c), m->min_size),
2649                   format_bytes(d, sizeof(d), m->keep_free));
2650 }
2651
2652 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2653         assert(f);
2654         assert(from || to);
2655
2656         if (from) {
2657                 if (f->header->head_entry_realtime == 0)
2658                         return -ENOENT;
2659
2660                 *from = le64toh(f->header->head_entry_realtime);
2661         }
2662
2663         if (to) {
2664                 if (f->header->tail_entry_realtime == 0)
2665                         return -ENOENT;
2666
2667                 *to = le64toh(f->header->tail_entry_realtime);
2668         }
2669
2670         return 1;
2671 }
2672
2673 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2674         char t[9+32+1] = "_BOOT_ID=";
2675         Object *o;
2676         uint64_t p;
2677         int r;
2678
2679         assert(f);
2680         assert(from || to);
2681
2682         sd_id128_to_string(boot_id, t + 9);
2683
2684         r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
2685         if (r <= 0)
2686                 return r;
2687
2688         if (le64toh(o->data.n_entries) <= 0)
2689                 return 0;
2690
2691         if (from) {
2692                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2693                 if (r < 0)
2694                         return r;
2695
2696                 *from = le64toh(o->entry.monotonic);
2697         }
2698
2699         if (to) {
2700                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2701                 if (r < 0)
2702                         return r;
2703
2704                 r = generic_array_get_plus_one(f,
2705                                                le64toh(o->data.entry_offset),
2706                                                le64toh(o->data.entry_array_offset),
2707                                                le64toh(o->data.n_entries)-1,
2708                                                &o, NULL);
2709                 if (r <= 0)
2710                         return r;
2711
2712                 *to = le64toh(o->entry.monotonic);
2713         }
2714
2715         return 1;
2716 }
2717
2718 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2719         assert(f);
2720
2721         /* If we gained new header fields we gained new features,
2722          * hence suggest a rotation */
2723         if (le64toh(f->header->header_size) < sizeof(Header)) {
2724                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2725                 return true;
2726         }
2727
2728         /* Let's check if the hash tables grew over a certain fill
2729          * level (75%, borrowing this value from Java's hash table
2730          * implementation), and if so suggest a rotation. To calculate
2731          * the fill level we need the n_data field, which only exists
2732          * in newer versions. */
2733
2734         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2735                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2736                         log_debug("Data hash table of %s has a fill level at %.1f (%llu of %llu items, %llu file size, %llu bytes per hash table item), suggesting rotation.",
2737                                   f->path,
2738                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2739                                   (unsigned long long) le64toh(f->header->n_data),
2740                                   (unsigned long long) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)),
2741                                   (unsigned long long) (f->last_stat.st_size),
2742                                   (unsigned long long) (f->last_stat.st_size / le64toh(f->header->n_data)));
2743                         return true;
2744                 }
2745
2746         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2747                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2748                         log_debug("Field hash table of %s has a fill level at %.1f (%llu of %llu items), suggesting rotation.",
2749                                   f->path,
2750                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2751                                   (unsigned long long) le64toh(f->header->n_fields),
2752                                   (unsigned long long) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)));
2753                         return true;
2754                 }
2755
2756         /* Are the data objects properly indexed by field objects? */
2757         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2758             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2759             le64toh(f->header->n_data) > 0 &&
2760             le64toh(f->header->n_fields) == 0)
2761                 return true;
2762
2763         if (max_file_usec > 0) {
2764                 usec_t t, h;
2765
2766                 h = le64toh(f->header->head_entry_realtime);
2767                 t = now(CLOCK_REALTIME);
2768
2769                 if (h > 0 && t > h + max_file_usec)
2770                         return true;
2771         }
2772
2773         return false;
2774 }