chiark / gitweb /
9e89cb96992250c748d5dd3f990e2213d8d1ac09
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #ifdef HAVE_XATTR
31 #include <attr/xattr.h>
32 #endif
33
34 #include "journal-def.h"
35 #include "journal-file.h"
36 #include "journal-authenticate.h"
37 #include "lookup3.h"
38 #include "compress.h"
39 #include "fsprg.h"
40
41 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
42 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
43
44 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
45
46 /* This is the minimum journal file size */
47 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL)           /* 4 MiB */
48
49 /* These are the lower and upper bounds if we deduce the max_use value
50  * from the file system size */
51 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
52 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
53
54 /* This is the upper bound if we deduce max_size from max_use */
55 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
56
57 /* This is the upper bound if we deduce the keep_free value from the
58  * file system size */
59 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
60
61 /* This is the keep_free value when we can't determine the system
62  * size */
63 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MB */
64
65 /* n_data was the first entry we added after the initial file format design */
66 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
67
68 /* How many entries to keep in the entry array chain cache at max */
69 #define CHAIN_CACHE_MAX 20
70
71 /* How much to increase the journal file size at once each time we allocate something new. */
72 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8MB */
73
74 static int journal_file_set_online(JournalFile *f) {
75         assert(f);
76
77         if (!f->writable)
78                 return -EPERM;
79
80         if (!(f->fd >= 0 && f->header))
81                 return -EINVAL;
82
83         switch(f->header->state) {
84                 case STATE_ONLINE:
85                         return 0;
86
87                 case STATE_OFFLINE:
88                         f->header->state = STATE_ONLINE;
89                         fsync(f->fd);
90                         return 0;
91
92                 default:
93                         return -EINVAL;
94         }
95 }
96
97 int journal_file_set_offline(JournalFile *f) {
98         assert(f);
99
100         if (!f->writable)
101                 return -EPERM;
102
103         if (!(f->fd >= 0 && f->header))
104                 return -EINVAL;
105
106         if (f->header->state != STATE_ONLINE)
107                 return 0;
108
109         fsync(f->fd);
110
111         f->header->state = STATE_OFFLINE;
112
113         fsync(f->fd);
114
115         return 0;
116 }
117
118 void journal_file_close(JournalFile *f) {
119         assert(f);
120
121 #ifdef HAVE_GCRYPT
122         /* Write the final tag */
123         if (f->seal && f->writable)
124                 journal_file_append_tag(f);
125 #endif
126
127         /* Sync everything to disk, before we mark the file offline */
128         if (f->mmap && f->fd >= 0)
129                 mmap_cache_close_fd(f->mmap, f->fd);
130
131         journal_file_set_offline(f);
132
133         if (f->header)
134                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
135
136         if (f->fd >= 0)
137                 close_nointr_nofail(f->fd);
138
139         free(f->path);
140
141         if (f->mmap)
142                 mmap_cache_unref(f->mmap);
143
144         hashmap_free_free(f->chain_cache);
145
146 #ifdef HAVE_XZ
147         free(f->compress_buffer);
148 #endif
149
150 #ifdef HAVE_GCRYPT
151         if (f->fss_file)
152                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
153         else if (f->fsprg_state)
154                 free(f->fsprg_state);
155
156         free(f->fsprg_seed);
157
158         if (f->hmac)
159                 gcry_md_close(f->hmac);
160 #endif
161
162         free(f);
163 }
164
165 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
166         Header h;
167         ssize_t k;
168         int r;
169
170         assert(f);
171
172         zero(h);
173         memcpy(h.signature, HEADER_SIGNATURE, 8);
174         h.header_size = htole64(ALIGN64(sizeof(h)));
175
176         h.incompatible_flags =
177                 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
178
179         h.compatible_flags =
180                 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
181
182         r = sd_id128_randomize(&h.file_id);
183         if (r < 0)
184                 return r;
185
186         if (template) {
187                 h.seqnum_id = template->header->seqnum_id;
188                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
189         } else
190                 h.seqnum_id = h.file_id;
191
192         k = pwrite(f->fd, &h, sizeof(h), 0);
193         if (k < 0)
194                 return -errno;
195
196         if (k != sizeof(h))
197                 return -EIO;
198
199         return 0;
200 }
201
202 static int journal_file_refresh_header(JournalFile *f) {
203         int r;
204         sd_id128_t boot_id;
205
206         assert(f);
207
208         r = sd_id128_get_machine(&f->header->machine_id);
209         if (r < 0)
210                 return r;
211
212         r = sd_id128_get_boot(&boot_id);
213         if (r < 0)
214                 return r;
215
216         if (sd_id128_equal(boot_id, f->header->boot_id))
217                 f->tail_entry_monotonic_valid = true;
218
219         f->header->boot_id = boot_id;
220
221         journal_file_set_online(f);
222
223         /* Sync the online state to disk */
224         fsync(f->fd);
225
226         return 0;
227 }
228
229 static int journal_file_verify_header(JournalFile *f) {
230         assert(f);
231
232         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
233                 return -EBADMSG;
234
235         /* In both read and write mode we refuse to open files with
236          * incompatible flags we don't know */
237 #ifdef HAVE_XZ
238         if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
239                 return -EPROTONOSUPPORT;
240 #else
241         if (f->header->incompatible_flags != 0)
242                 return -EPROTONOSUPPORT;
243 #endif
244
245         /* When open for writing we refuse to open files with
246          * compatible flags, too */
247         if (f->writable) {
248 #ifdef HAVE_GCRYPT
249                 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
250                         return -EPROTONOSUPPORT;
251 #else
252                 if (f->header->compatible_flags != 0)
253                         return -EPROTONOSUPPORT;
254 #endif
255         }
256
257         if (f->header->state >= _STATE_MAX)
258                 return -EBADMSG;
259
260         /* The first addition was n_data, so check that we are at least this large */
261         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
262                 return -EBADMSG;
263
264         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
265                 return -EBADMSG;
266
267         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
268                 return -ENODATA;
269
270         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
271                 return -ENODATA;
272
273         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
274             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
275             !VALID64(le64toh(f->header->tail_object_offset)) ||
276             !VALID64(le64toh(f->header->entry_array_offset)))
277                 return -ENODATA;
278
279         if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
280             le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
281             le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
282             le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
283                 return -ENODATA;
284
285         if (f->writable) {
286                 uint8_t state;
287                 sd_id128_t machine_id;
288                 int r;
289
290                 r = sd_id128_get_machine(&machine_id);
291                 if (r < 0)
292                         return r;
293
294                 if (!sd_id128_equal(machine_id, f->header->machine_id))
295                         return -EHOSTDOWN;
296
297                 state = f->header->state;
298
299                 if (state == STATE_ONLINE) {
300                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
301                         return -EBUSY;
302                 } else if (state == STATE_ARCHIVED)
303                         return -ESHUTDOWN;
304                 else if (state != STATE_OFFLINE) {
305                         log_debug("Journal file %s has unknown state %u.", f->path, state);
306                         return -EBUSY;
307                 }
308         }
309
310         f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
311
312         f->seal = JOURNAL_HEADER_SEALED(f->header);
313
314         return 0;
315 }
316
317 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
318         uint64_t old_size, new_size;
319         int r;
320
321         assert(f);
322
323         /* We assume that this file is not sparse, and we know that
324          * for sure, since we always call posix_fallocate()
325          * ourselves */
326
327         old_size =
328                 le64toh(f->header->header_size) +
329                 le64toh(f->header->arena_size);
330
331         new_size = PAGE_ALIGN(offset + size);
332         if (new_size < le64toh(f->header->header_size))
333                 new_size = le64toh(f->header->header_size);
334
335         if (new_size <= old_size)
336                 return 0;
337
338         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
339                 return -E2BIG;
340
341         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
342                 struct statvfs svfs;
343
344                 if (fstatvfs(f->fd, &svfs) >= 0) {
345                         uint64_t available;
346
347                         available = svfs.f_bfree * svfs.f_bsize;
348
349                         if (available >= f->metrics.keep_free)
350                                 available -= f->metrics.keep_free;
351                         else
352                                 available = 0;
353
354                         if (new_size - old_size > available)
355                                 return -E2BIG;
356                 }
357         }
358
359         /* Increase by larger blocks at once */
360         new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
361         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
362                 new_size = f->metrics.max_size;
363
364         /* Note that the glibc fallocate() fallback is very
365            inefficient, hence we try to minimize the allocation area
366            as we can. */
367         r = posix_fallocate(f->fd, old_size, new_size - old_size);
368         if (r != 0)
369                 return -r;
370
371         if (fstat(f->fd, &f->last_stat) < 0)
372                 return -errno;
373
374         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
375
376         return 0;
377 }
378
379 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
380         assert(f);
381         assert(ret);
382
383         if (size <= 0)
384                 return -EINVAL;
385
386         /* Avoid SIGBUS on invalid accesses */
387         if (offset + size > (uint64_t) f->last_stat.st_size) {
388                 /* Hmm, out of range? Let's refresh the fstat() data
389                  * first, before we trust that check. */
390
391                 if (fstat(f->fd, &f->last_stat) < 0 ||
392                     offset + size > (uint64_t) f->last_stat.st_size)
393                         return -EADDRNOTAVAIL;
394         }
395
396         return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
397 }
398
399 static uint64_t minimum_header_size(Object *o) {
400
401         static const uint64_t table[] = {
402                 [OBJECT_DATA] = sizeof(DataObject),
403                 [OBJECT_FIELD] = sizeof(FieldObject),
404                 [OBJECT_ENTRY] = sizeof(EntryObject),
405                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
406                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
407                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
408                 [OBJECT_TAG] = sizeof(TagObject),
409         };
410
411         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
412                 return sizeof(ObjectHeader);
413
414         return table[o->object.type];
415 }
416
417 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
418         int r;
419         void *t;
420         Object *o;
421         uint64_t s;
422         unsigned context;
423
424         assert(f);
425         assert(ret);
426
427         /* Objects may only be located at multiple of 64 bit */
428         if (!VALID64(offset))
429                 return -EFAULT;
430
431         /* One context for each type, plus one catch-all for the rest */
432         context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
433
434         r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
435         if (r < 0)
436                 return r;
437
438         o = (Object*) t;
439         s = le64toh(o->object.size);
440
441         if (s < sizeof(ObjectHeader))
442                 return -EBADMSG;
443
444         if (o->object.type <= OBJECT_UNUSED)
445                 return -EBADMSG;
446
447         if (s < minimum_header_size(o))
448                 return -EBADMSG;
449
450         if (type > 0 && o->object.type != type)
451                 return -EBADMSG;
452
453         if (s > sizeof(ObjectHeader)) {
454                 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
455                 if (r < 0)
456                         return r;
457
458                 o = (Object*) t;
459         }
460
461         *ret = o;
462         return 0;
463 }
464
465 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
466         uint64_t r;
467
468         assert(f);
469
470         r = le64toh(f->header->tail_entry_seqnum) + 1;
471
472         if (seqnum) {
473                 /* If an external seqnum counter was passed, we update
474                  * both the local and the external one, and set it to
475                  * the maximum of both */
476
477                 if (*seqnum + 1 > r)
478                         r = *seqnum + 1;
479
480                 *seqnum = r;
481         }
482
483         f->header->tail_entry_seqnum = htole64(r);
484
485         if (f->header->head_entry_seqnum == 0)
486                 f->header->head_entry_seqnum = htole64(r);
487
488         return r;
489 }
490
491 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
492         int r;
493         uint64_t p;
494         Object *tail, *o;
495         void *t;
496
497         assert(f);
498         assert(type > 0 && type < _OBJECT_TYPE_MAX);
499         assert(size >= sizeof(ObjectHeader));
500         assert(offset);
501         assert(ret);
502
503         r = journal_file_set_online(f);
504         if (r < 0)
505                 return r;
506
507         p = le64toh(f->header->tail_object_offset);
508         if (p == 0)
509                 p = le64toh(f->header->header_size);
510         else {
511                 r = journal_file_move_to_object(f, -1, p, &tail);
512                 if (r < 0)
513                         return r;
514
515                 p += ALIGN64(le64toh(tail->object.size));
516         }
517
518         r = journal_file_allocate(f, p, size);
519         if (r < 0)
520                 return r;
521
522         r = journal_file_move_to(f, type, false, p, size, &t);
523         if (r < 0)
524                 return r;
525
526         o = (Object*) t;
527
528         zero(o->object);
529         o->object.type = type;
530         o->object.size = htole64(size);
531
532         f->header->tail_object_offset = htole64(p);
533         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
534
535         *ret = o;
536         *offset = p;
537
538         return 0;
539 }
540
541 static int journal_file_setup_data_hash_table(JournalFile *f) {
542         uint64_t s, p;
543         Object *o;
544         int r;
545
546         assert(f);
547
548         /* We estimate that we need 1 hash table entry per 768 of
549            journal file and we want to make sure we never get beyond
550            75% fill level. Calculate the hash table size for the
551            maximum file size based on these metrics. */
552
553         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
554         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
555                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
556
557         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
558
559         r = journal_file_append_object(f,
560                                        OBJECT_DATA_HASH_TABLE,
561                                        offsetof(Object, hash_table.items) + s,
562                                        &o, &p);
563         if (r < 0)
564                 return r;
565
566         memset(o->hash_table.items, 0, s);
567
568         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
569         f->header->data_hash_table_size = htole64(s);
570
571         return 0;
572 }
573
574 static int journal_file_setup_field_hash_table(JournalFile *f) {
575         uint64_t s, p;
576         Object *o;
577         int r;
578
579         assert(f);
580
581         /* We use a fixed size hash table for the fields as this
582          * number should grow very slowly only */
583
584         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
585         r = journal_file_append_object(f,
586                                        OBJECT_FIELD_HASH_TABLE,
587                                        offsetof(Object, hash_table.items) + s,
588                                        &o, &p);
589         if (r < 0)
590                 return r;
591
592         memset(o->hash_table.items, 0, s);
593
594         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
595         f->header->field_hash_table_size = htole64(s);
596
597         return 0;
598 }
599
600 static int journal_file_map_data_hash_table(JournalFile *f) {
601         uint64_t s, p;
602         void *t;
603         int r;
604
605         assert(f);
606
607         p = le64toh(f->header->data_hash_table_offset);
608         s = le64toh(f->header->data_hash_table_size);
609
610         r = journal_file_move_to(f,
611                                  OBJECT_DATA_HASH_TABLE,
612                                  true,
613                                  p, s,
614                                  &t);
615         if (r < 0)
616                 return r;
617
618         f->data_hash_table = t;
619         return 0;
620 }
621
622 static int journal_file_map_field_hash_table(JournalFile *f) {
623         uint64_t s, p;
624         void *t;
625         int r;
626
627         assert(f);
628
629         p = le64toh(f->header->field_hash_table_offset);
630         s = le64toh(f->header->field_hash_table_size);
631
632         r = journal_file_move_to(f,
633                                  OBJECT_FIELD_HASH_TABLE,
634                                  true,
635                                  p, s,
636                                  &t);
637         if (r < 0)
638                 return r;
639
640         f->field_hash_table = t;
641         return 0;
642 }
643
644 static int journal_file_link_field(
645                 JournalFile *f,
646                 Object *o,
647                 uint64_t offset,
648                 uint64_t hash) {
649
650         uint64_t p, h;
651         int r;
652
653         assert(f);
654         assert(o);
655         assert(offset > 0);
656
657         if (o->object.type != OBJECT_FIELD)
658                 return -EINVAL;
659
660         /* This might alter the window we are looking at */
661
662         o->field.next_hash_offset = o->field.head_data_offset = 0;
663
664         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
665         p = le64toh(f->field_hash_table[h].tail_hash_offset);
666         if (p == 0)
667                 f->field_hash_table[h].head_hash_offset = htole64(offset);
668         else {
669                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
670                 if (r < 0)
671                         return r;
672
673                 o->field.next_hash_offset = htole64(offset);
674         }
675
676         f->field_hash_table[h].tail_hash_offset = htole64(offset);
677
678         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
679                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
680
681         return 0;
682 }
683
684 static int journal_file_link_data(
685                 JournalFile *f,
686                 Object *o,
687                 uint64_t offset,
688                 uint64_t hash) {
689
690         uint64_t p, h;
691         int r;
692
693         assert(f);
694         assert(o);
695         assert(offset > 0);
696
697         if (o->object.type != OBJECT_DATA)
698                 return -EINVAL;
699
700         /* This might alter the window we are looking at */
701
702         o->data.next_hash_offset = o->data.next_field_offset = 0;
703         o->data.entry_offset = o->data.entry_array_offset = 0;
704         o->data.n_entries = 0;
705
706         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
707         p = le64toh(f->data_hash_table[h].tail_hash_offset);
708         if (p == 0)
709                 /* Only entry in the hash table is easy */
710                 f->data_hash_table[h].head_hash_offset = htole64(offset);
711         else {
712                 /* Move back to the previous data object, to patch in
713                  * pointer */
714
715                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
716                 if (r < 0)
717                         return r;
718
719                 o->data.next_hash_offset = htole64(offset);
720         }
721
722         f->data_hash_table[h].tail_hash_offset = htole64(offset);
723
724         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
725                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
726
727         return 0;
728 }
729
730 int journal_file_find_field_object_with_hash(
731                 JournalFile *f,
732                 const void *field, uint64_t size, uint64_t hash,
733                 Object **ret, uint64_t *offset) {
734
735         uint64_t p, osize, h;
736         int r;
737
738         assert(f);
739         assert(field && size > 0);
740
741         osize = offsetof(Object, field.payload) + size;
742
743         if (f->header->field_hash_table_size == 0)
744                 return -EBADMSG;
745
746         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
747         p = le64toh(f->field_hash_table[h].head_hash_offset);
748
749         while (p > 0) {
750                 Object *o;
751
752                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
753                 if (r < 0)
754                         return r;
755
756                 if (le64toh(o->field.hash) == hash &&
757                     le64toh(o->object.size) == osize &&
758                     memcmp(o->field.payload, field, size) == 0) {
759
760                         if (ret)
761                                 *ret = o;
762                         if (offset)
763                                 *offset = p;
764
765                         return 1;
766                 }
767
768                 p = le64toh(o->field.next_hash_offset);
769         }
770
771         return 0;
772 }
773
774 int journal_file_find_field_object(
775                 JournalFile *f,
776                 const void *field, uint64_t size,
777                 Object **ret, uint64_t *offset) {
778
779         uint64_t hash;
780
781         assert(f);
782         assert(field && size > 0);
783
784         hash = hash64(field, size);
785
786         return journal_file_find_field_object_with_hash(f,
787                                                         field, size, hash,
788                                                         ret, offset);
789 }
790
791 int journal_file_find_data_object_with_hash(
792                 JournalFile *f,
793                 const void *data, uint64_t size, uint64_t hash,
794                 Object **ret, uint64_t *offset) {
795
796         uint64_t p, osize, h;
797         int r;
798
799         assert(f);
800         assert(data || size == 0);
801
802         osize = offsetof(Object, data.payload) + size;
803
804         if (f->header->data_hash_table_size == 0)
805                 return -EBADMSG;
806
807         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
808         p = le64toh(f->data_hash_table[h].head_hash_offset);
809
810         while (p > 0) {
811                 Object *o;
812
813                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
814                 if (r < 0)
815                         return r;
816
817                 if (le64toh(o->data.hash) != hash)
818                         goto next;
819
820                 if (o->object.flags & OBJECT_COMPRESSED) {
821 #ifdef HAVE_XZ
822                         uint64_t l, rsize;
823
824                         l = le64toh(o->object.size);
825                         if (l <= offsetof(Object, data.payload))
826                                 return -EBADMSG;
827
828                         l -= offsetof(Object, data.payload);
829
830                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0))
831                                 return -EBADMSG;
832
833                         if (rsize == size &&
834                             memcmp(f->compress_buffer, data, size) == 0) {
835
836                                 if (ret)
837                                         *ret = o;
838
839                                 if (offset)
840                                         *offset = p;
841
842                                 return 1;
843                         }
844 #else
845                         return -EPROTONOSUPPORT;
846 #endif
847
848                 } else if (le64toh(o->object.size) == osize &&
849                            memcmp(o->data.payload, data, size) == 0) {
850
851                         if (ret)
852                                 *ret = o;
853
854                         if (offset)
855                                 *offset = p;
856
857                         return 1;
858                 }
859
860         next:
861                 p = le64toh(o->data.next_hash_offset);
862         }
863
864         return 0;
865 }
866
867 int journal_file_find_data_object(
868                 JournalFile *f,
869                 const void *data, uint64_t size,
870                 Object **ret, uint64_t *offset) {
871
872         uint64_t hash;
873
874         assert(f);
875         assert(data || size == 0);
876
877         hash = hash64(data, size);
878
879         return journal_file_find_data_object_with_hash(f,
880                                                        data, size, hash,
881                                                        ret, offset);
882 }
883
884 static int journal_file_append_field(
885                 JournalFile *f,
886                 const void *field, uint64_t size,
887                 Object **ret, uint64_t *offset) {
888
889         uint64_t hash, p;
890         uint64_t osize;
891         Object *o;
892         int r;
893
894         assert(f);
895         assert(field && size > 0);
896
897         hash = hash64(field, size);
898
899         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
900         if (r < 0)
901                 return r;
902         else if (r > 0) {
903
904                 if (ret)
905                         *ret = o;
906
907                 if (offset)
908                         *offset = p;
909
910                 return 0;
911         }
912
913         osize = offsetof(Object, field.payload) + size;
914         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
915         if (r < 0)
916                 return r;
917
918         o->field.hash = htole64(hash);
919         memcpy(o->field.payload, field, size);
920
921         r = journal_file_link_field(f, o, p, hash);
922         if (r < 0)
923                 return r;
924
925         /* The linking might have altered the window, so let's
926          * refresh our pointer */
927         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
928         if (r < 0)
929                 return r;
930
931 #ifdef HAVE_GCRYPT
932         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
933         if (r < 0)
934                 return r;
935 #endif
936
937         if (ret)
938                 *ret = o;
939
940         if (offset)
941                 *offset = p;
942
943         return 0;
944 }
945
946 static int journal_file_append_data(
947                 JournalFile *f,
948                 const void *data, uint64_t size,
949                 Object **ret, uint64_t *offset) {
950
951         uint64_t hash, p;
952         uint64_t osize;
953         Object *o;
954         int r;
955         bool compressed = false;
956         const void *eq;
957
958         assert(f);
959         assert(data || size == 0);
960
961         hash = hash64(data, size);
962
963         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
964         if (r < 0)
965                 return r;
966         else if (r > 0) {
967
968                 if (ret)
969                         *ret = o;
970
971                 if (offset)
972                         *offset = p;
973
974                 return 0;
975         }
976
977         osize = offsetof(Object, data.payload) + size;
978         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
979         if (r < 0)
980                 return r;
981
982         o->data.hash = htole64(hash);
983
984 #ifdef HAVE_XZ
985         if (f->compress &&
986             size >= COMPRESSION_SIZE_THRESHOLD) {
987                 uint64_t rsize;
988
989                 compressed = compress_blob(data, size, o->data.payload, &rsize);
990
991                 if (compressed) {
992                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
993                         o->object.flags |= OBJECT_COMPRESSED;
994
995                         log_debug("Compressed data object %"PRIu64" -> %"PRIu64, size, rsize);
996                 }
997         }
998 #endif
999
1000         if (!compressed && size > 0)
1001                 memcpy(o->data.payload, data, size);
1002
1003         r = journal_file_link_data(f, o, p, hash);
1004         if (r < 0)
1005                 return r;
1006
1007         /* The linking might have altered the window, so let's
1008          * refresh our pointer */
1009         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1010         if (r < 0)
1011                 return r;
1012
1013         if (!data)
1014                 eq = NULL;
1015         else
1016                 eq = memchr(data, '=', size);
1017         if (eq && eq > data) {
1018                 uint64_t fp;
1019                 Object *fo;
1020
1021                 /* Create field object ... */
1022                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1023                 if (r < 0)
1024                         return r;
1025
1026                 /* ... and link it in. */
1027                 o->data.next_field_offset = fo->field.head_data_offset;
1028                 fo->field.head_data_offset = le64toh(p);
1029         }
1030
1031 #ifdef HAVE_GCRYPT
1032         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1033         if (r < 0)
1034                 return r;
1035 #endif
1036
1037         if (ret)
1038                 *ret = o;
1039
1040         if (offset)
1041                 *offset = p;
1042
1043         return 0;
1044 }
1045
1046 uint64_t journal_file_entry_n_items(Object *o) {
1047         assert(o);
1048
1049         if (o->object.type != OBJECT_ENTRY)
1050                 return 0;
1051
1052         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1053 }
1054
1055 uint64_t journal_file_entry_array_n_items(Object *o) {
1056         assert(o);
1057
1058         if (o->object.type != OBJECT_ENTRY_ARRAY)
1059                 return 0;
1060
1061         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1062 }
1063
1064 uint64_t journal_file_hash_table_n_items(Object *o) {
1065         assert(o);
1066
1067         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1068             o->object.type != OBJECT_FIELD_HASH_TABLE)
1069                 return 0;
1070
1071         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1072 }
1073
1074 static int link_entry_into_array(JournalFile *f,
1075                                  le64_t *first,
1076                                  le64_t *idx,
1077                                  uint64_t p) {
1078         int r;
1079         uint64_t n = 0, ap = 0, q, i, a, hidx;
1080         Object *o;
1081
1082         assert(f);
1083         assert(first);
1084         assert(idx);
1085         assert(p > 0);
1086
1087         a = le64toh(*first);
1088         i = hidx = le64toh(*idx);
1089         while (a > 0) {
1090
1091                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1092                 if (r < 0)
1093                         return r;
1094
1095                 n = journal_file_entry_array_n_items(o);
1096                 if (i < n) {
1097                         o->entry_array.items[i] = htole64(p);
1098                         *idx = htole64(hidx + 1);
1099                         return 0;
1100                 }
1101
1102                 i -= n;
1103                 ap = a;
1104                 a = le64toh(o->entry_array.next_entry_array_offset);
1105         }
1106
1107         if (hidx > n)
1108                 n = (hidx+1) * 2;
1109         else
1110                 n = n * 2;
1111
1112         if (n < 4)
1113                 n = 4;
1114
1115         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1116                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1117                                        &o, &q);
1118         if (r < 0)
1119                 return r;
1120
1121 #ifdef HAVE_GCRYPT
1122         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1123         if (r < 0)
1124                 return r;
1125 #endif
1126
1127         o->entry_array.items[i] = htole64(p);
1128
1129         if (ap == 0)
1130                 *first = htole64(q);
1131         else {
1132                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1133                 if (r < 0)
1134                         return r;
1135
1136                 o->entry_array.next_entry_array_offset = htole64(q);
1137         }
1138
1139         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1140                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1141
1142         *idx = htole64(hidx + 1);
1143
1144         return 0;
1145 }
1146
1147 static int link_entry_into_array_plus_one(JournalFile *f,
1148                                           le64_t *extra,
1149                                           le64_t *first,
1150                                           le64_t *idx,
1151                                           uint64_t p) {
1152
1153         int r;
1154
1155         assert(f);
1156         assert(extra);
1157         assert(first);
1158         assert(idx);
1159         assert(p > 0);
1160
1161         if (*idx == 0)
1162                 *extra = htole64(p);
1163         else {
1164                 le64_t i;
1165
1166                 i = htole64(le64toh(*idx) - 1);
1167                 r = link_entry_into_array(f, first, &i, p);
1168                 if (r < 0)
1169                         return r;
1170         }
1171
1172         *idx = htole64(le64toh(*idx) + 1);
1173         return 0;
1174 }
1175
1176 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1177         uint64_t p;
1178         int r;
1179         assert(f);
1180         assert(o);
1181         assert(offset > 0);
1182
1183         p = le64toh(o->entry.items[i].object_offset);
1184         if (p == 0)
1185                 return -EINVAL;
1186
1187         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1188         if (r < 0)
1189                 return r;
1190
1191         return link_entry_into_array_plus_one(f,
1192                                               &o->data.entry_offset,
1193                                               &o->data.entry_array_offset,
1194                                               &o->data.n_entries,
1195                                               offset);
1196 }
1197
1198 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1199         uint64_t n, i;
1200         int r;
1201
1202         assert(f);
1203         assert(o);
1204         assert(offset > 0);
1205
1206         if (o->object.type != OBJECT_ENTRY)
1207                 return -EINVAL;
1208
1209         __sync_synchronize();
1210
1211         /* Link up the entry itself */
1212         r = link_entry_into_array(f,
1213                                   &f->header->entry_array_offset,
1214                                   &f->header->n_entries,
1215                                   offset);
1216         if (r < 0)
1217                 return r;
1218
1219         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1220
1221         if (f->header->head_entry_realtime == 0)
1222                 f->header->head_entry_realtime = o->entry.realtime;
1223
1224         f->header->tail_entry_realtime = o->entry.realtime;
1225         f->header->tail_entry_monotonic = o->entry.monotonic;
1226
1227         f->tail_entry_monotonic_valid = true;
1228
1229         /* Link up the items */
1230         n = journal_file_entry_n_items(o);
1231         for (i = 0; i < n; i++) {
1232                 r = journal_file_link_entry_item(f, o, offset, i);
1233                 if (r < 0)
1234                         return r;
1235         }
1236
1237         return 0;
1238 }
1239
1240 static int journal_file_append_entry_internal(
1241                 JournalFile *f,
1242                 const dual_timestamp *ts,
1243                 uint64_t xor_hash,
1244                 const EntryItem items[], unsigned n_items,
1245                 uint64_t *seqnum,
1246                 Object **ret, uint64_t *offset) {
1247         uint64_t np;
1248         uint64_t osize;
1249         Object *o;
1250         int r;
1251
1252         assert(f);
1253         assert(items || n_items == 0);
1254         assert(ts);
1255
1256         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1257
1258         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1259         if (r < 0)
1260                 return r;
1261
1262         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1263         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1264         o->entry.realtime = htole64(ts->realtime);
1265         o->entry.monotonic = htole64(ts->monotonic);
1266         o->entry.xor_hash = htole64(xor_hash);
1267         o->entry.boot_id = f->header->boot_id;
1268
1269 #ifdef HAVE_GCRYPT
1270         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1271         if (r < 0)
1272                 return r;
1273 #endif
1274
1275         r = journal_file_link_entry(f, o, np);
1276         if (r < 0)
1277                 return r;
1278
1279         if (ret)
1280                 *ret = o;
1281
1282         if (offset)
1283                 *offset = np;
1284
1285         return 0;
1286 }
1287
1288 void journal_file_post_change(JournalFile *f) {
1289         assert(f);
1290
1291         /* inotify() does not receive IN_MODIFY events from file
1292          * accesses done via mmap(). After each access we hence
1293          * trigger IN_MODIFY by truncating the journal file to its
1294          * current size which triggers IN_MODIFY. */
1295
1296         __sync_synchronize();
1297
1298         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1299                 log_error("Failed to truncate file to its own size: %m");
1300 }
1301
1302 static int entry_item_cmp(const void *_a, const void *_b) {
1303         const EntryItem *a = _a, *b = _b;
1304
1305         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1306                 return -1;
1307         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1308                 return 1;
1309         return 0;
1310 }
1311
1312 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1313         unsigned i;
1314         EntryItem *items;
1315         int r;
1316         uint64_t xor_hash = 0;
1317         struct dual_timestamp _ts;
1318
1319         assert(f);
1320         assert(iovec || n_iovec == 0);
1321
1322         if (!ts) {
1323                 dual_timestamp_get(&_ts);
1324                 ts = &_ts;
1325         }
1326
1327         if (f->tail_entry_monotonic_valid &&
1328             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1329                 return -EINVAL;
1330
1331 #ifdef HAVE_GCRYPT
1332         r = journal_file_maybe_append_tag(f, ts->realtime);
1333         if (r < 0)
1334                 return r;
1335 #endif
1336
1337         /* alloca() can't take 0, hence let's allocate at least one */
1338         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1339
1340         for (i = 0; i < n_iovec; i++) {
1341                 uint64_t p;
1342                 Object *o;
1343
1344                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1345                 if (r < 0)
1346                         return r;
1347
1348                 xor_hash ^= le64toh(o->data.hash);
1349                 items[i].object_offset = htole64(p);
1350                 items[i].hash = o->data.hash;
1351         }
1352
1353         /* Order by the position on disk, in order to improve seek
1354          * times for rotating media. */
1355         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1356
1357         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1358
1359         journal_file_post_change(f);
1360
1361         return r;
1362 }
1363
1364 typedef struct ChainCacheItem {
1365         uint64_t first; /* the array at the begin of the chain */
1366         uint64_t array; /* the cached array */
1367         uint64_t begin; /* the first item in the cached array */
1368         uint64_t total; /* the total number of items in all arrays before this one in the chain */
1369         uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
1370 } ChainCacheItem;
1371
1372 static void chain_cache_put(
1373                 Hashmap *h,
1374                 ChainCacheItem *ci,
1375                 uint64_t first,
1376                 uint64_t array,
1377                 uint64_t begin,
1378                 uint64_t total,
1379                 uint64_t last_index) {
1380
1381         if (!ci) {
1382                 /* If the chain item to cache for this chain is the
1383                  * first one it's not worth caching anything */
1384                 if (array == first)
1385                         return;
1386
1387                 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1388                         ci = hashmap_steal_first(h);
1389                 else {
1390                         ci = new(ChainCacheItem, 1);
1391                         if (!ci)
1392                                 return;
1393                 }
1394
1395                 ci->first = first;
1396
1397                 if (hashmap_put(h, &ci->first, ci) < 0) {
1398                         free(ci);
1399                         return;
1400                 }
1401         } else
1402                 assert(ci->first == first);
1403
1404         ci->array = array;
1405         ci->begin = begin;
1406         ci->total = total;
1407         ci->last_index = last_index;
1408 }
1409
1410 static int generic_array_get(
1411                 JournalFile *f,
1412                 uint64_t first,
1413                 uint64_t i,
1414                 Object **ret, uint64_t *offset) {
1415
1416         Object *o;
1417         uint64_t p = 0, a, t = 0;
1418         int r;
1419         ChainCacheItem *ci;
1420
1421         assert(f);
1422
1423         a = first;
1424
1425         /* Try the chain cache first */
1426         ci = hashmap_get(f->chain_cache, &first);
1427         if (ci && i > ci->total) {
1428                 a = ci->array;
1429                 i -= ci->total;
1430                 t = ci->total;
1431         }
1432
1433         while (a > 0) {
1434                 uint64_t k;
1435
1436                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1437                 if (r < 0)
1438                         return r;
1439
1440                 k = journal_file_entry_array_n_items(o);
1441                 if (i < k) {
1442                         p = le64toh(o->entry_array.items[i]);
1443                         goto found;
1444                 }
1445
1446                 i -= k;
1447                 t += k;
1448                 a = le64toh(o->entry_array.next_entry_array_offset);
1449         }
1450
1451         return 0;
1452
1453 found:
1454         /* Let's cache this item for the next invocation */
1455         chain_cache_put(f->chain_cache, ci, first, a, o->entry_array.items[0], t, i);
1456
1457         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1458         if (r < 0)
1459                 return r;
1460
1461         if (ret)
1462                 *ret = o;
1463
1464         if (offset)
1465                 *offset = p;
1466
1467         return 1;
1468 }
1469
1470 static int generic_array_get_plus_one(
1471                 JournalFile *f,
1472                 uint64_t extra,
1473                 uint64_t first,
1474                 uint64_t i,
1475                 Object **ret, uint64_t *offset) {
1476
1477         Object *o;
1478
1479         assert(f);
1480
1481         if (i == 0) {
1482                 int r;
1483
1484                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1485                 if (r < 0)
1486                         return r;
1487
1488                 if (ret)
1489                         *ret = o;
1490
1491                 if (offset)
1492                         *offset = extra;
1493
1494                 return 1;
1495         }
1496
1497         return generic_array_get(f, first, i-1, ret, offset);
1498 }
1499
1500 enum {
1501         TEST_FOUND,
1502         TEST_LEFT,
1503         TEST_RIGHT
1504 };
1505
1506 static int generic_array_bisect(
1507                 JournalFile *f,
1508                 uint64_t first,
1509                 uint64_t n,
1510                 uint64_t needle,
1511                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1512                 direction_t direction,
1513                 Object **ret,
1514                 uint64_t *offset,
1515                 uint64_t *idx) {
1516
1517         uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1518         bool subtract_one = false;
1519         Object *o, *array = NULL;
1520         int r;
1521         ChainCacheItem *ci;
1522
1523         assert(f);
1524         assert(test_object);
1525
1526         /* Start with the first array in the chain */
1527         a = first;
1528
1529         ci = hashmap_get(f->chain_cache, &first);
1530         if (ci && n > ci->total) {
1531                 /* Ah, we have iterated this bisection array chain
1532                  * previously! Let's see if we can skip ahead in the
1533                  * chain, as far as the last time. But we can't jump
1534                  * backwards in the chain, so let's check that
1535                  * first. */
1536
1537                 r = test_object(f, ci->begin, needle);
1538                 if (r < 0)
1539                         return r;
1540
1541                 if (r == TEST_LEFT) {
1542                         /* OK, what we are looking for is right of the
1543                          * begin of this EntryArray, so let's jump
1544                          * straight to previously cached array in the
1545                          * chain */
1546
1547                         a = ci->array;
1548                         n -= ci->total;
1549                         t = ci->total;
1550                         last_index = ci->last_index;
1551                 }
1552         }
1553
1554         while (a > 0) {
1555                 uint64_t left, right, k, lp;
1556
1557                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1558                 if (r < 0)
1559                         return r;
1560
1561                 k = journal_file_entry_array_n_items(array);
1562                 right = MIN(k, n);
1563                 if (right <= 0)
1564                         return 0;
1565
1566                 i = right - 1;
1567                 lp = p = le64toh(array->entry_array.items[i]);
1568                 if (p <= 0)
1569                         return -EBADMSG;
1570
1571                 r = test_object(f, p, needle);
1572                 if (r < 0)
1573                         return r;
1574
1575                 if (r == TEST_FOUND)
1576                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1577
1578                 if (r == TEST_RIGHT) {
1579                         left = 0;
1580                         right -= 1;
1581
1582                         if (last_index != (uint64_t) -1) {
1583                                 assert(last_index <= right);
1584
1585                                 /* If we cached the last index we
1586                                  * looked at, let's try to not to jump
1587                                  * too wildly around and see if we can
1588                                  * limit the range to look at early to
1589                                  * the immediate neighbors of the last
1590                                  * index we looked at. */
1591
1592                                 if (last_index > 0) {
1593                                         uint64_t x = last_index - 1;
1594
1595                                         p = le64toh(array->entry_array.items[x]);
1596                                         if (p <= 0)
1597                                                 return -EBADMSG;
1598
1599                                         r = test_object(f, p, needle);
1600                                         if (r < 0)
1601                                                 return r;
1602
1603                                         if (r == TEST_FOUND)
1604                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1605
1606                                         if (r == TEST_RIGHT)
1607                                                 right = x;
1608                                         else
1609                                                 left = x + 1;
1610                                 }
1611
1612                                 if (last_index < right) {
1613                                         uint64_t y = last_index + 1;
1614
1615                                         p = le64toh(array->entry_array.items[y]);
1616                                         if (p <= 0)
1617                                                 return -EBADMSG;
1618
1619                                         r = test_object(f, p, needle);
1620                                         if (r < 0)
1621                                                 return r;
1622
1623                                         if (r == TEST_FOUND)
1624                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1625
1626                                         if (r == TEST_RIGHT)
1627                                                 right = y;
1628                                         else
1629                                                 left = y + 1;
1630                                 }
1631
1632                                 last_index = (uint64_t) -1;
1633                         }
1634
1635                         for (;;) {
1636                                 if (left == right) {
1637                                         if (direction == DIRECTION_UP)
1638                                                 subtract_one = true;
1639
1640                                         i = left;
1641                                         goto found;
1642                                 }
1643
1644                                 assert(left < right);
1645                                 i = (left + right) / 2;
1646
1647                                 p = le64toh(array->entry_array.items[i]);
1648                                 if (p <= 0)
1649                                         return -EBADMSG;
1650
1651                                 r = test_object(f, p, needle);
1652                                 if (r < 0)
1653                                         return r;
1654
1655                                 if (r == TEST_FOUND)
1656                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1657
1658                                 if (r == TEST_RIGHT)
1659                                         right = i;
1660                                 else
1661                                         left = i + 1;
1662                         }
1663                 }
1664
1665                 if (k > n) {
1666                         if (direction == DIRECTION_UP) {
1667                                 i = n;
1668                                 subtract_one = true;
1669                                 goto found;
1670                         }
1671
1672                         return 0;
1673                 }
1674
1675                 last_p = lp;
1676
1677                 n -= k;
1678                 t += k;
1679                 last_index = (uint64_t) -1;
1680                 a = le64toh(array->entry_array.next_entry_array_offset);
1681         }
1682
1683         return 0;
1684
1685 found:
1686         if (subtract_one && t == 0 && i == 0)
1687                 return 0;
1688
1689         /* Let's cache this item for the next invocation */
1690         chain_cache_put(f->chain_cache, ci, first, a, array->entry_array.items[0], t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1691
1692         if (subtract_one && i == 0)
1693                 p = last_p;
1694         else if (subtract_one)
1695                 p = le64toh(array->entry_array.items[i-1]);
1696         else
1697                 p = le64toh(array->entry_array.items[i]);
1698
1699         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1700         if (r < 0)
1701                 return r;
1702
1703         if (ret)
1704                 *ret = o;
1705
1706         if (offset)
1707                 *offset = p;
1708
1709         if (idx)
1710                 *idx = t + i + (subtract_one ? -1 : 0);
1711
1712         return 1;
1713 }
1714
1715
1716 static int generic_array_bisect_plus_one(
1717                 JournalFile *f,
1718                 uint64_t extra,
1719                 uint64_t first,
1720                 uint64_t n,
1721                 uint64_t needle,
1722                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1723                 direction_t direction,
1724                 Object **ret,
1725                 uint64_t *offset,
1726                 uint64_t *idx) {
1727
1728         int r;
1729         bool step_back = false;
1730         Object *o;
1731
1732         assert(f);
1733         assert(test_object);
1734
1735         if (n <= 0)
1736                 return 0;
1737
1738         /* This bisects the array in object 'first', but first checks
1739          * an extra  */
1740         r = test_object(f, extra, needle);
1741         if (r < 0)
1742                 return r;
1743
1744         if (r == TEST_FOUND)
1745                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1746
1747         /* if we are looking with DIRECTION_UP then we need to first
1748            see if in the actual array there is a matching entry, and
1749            return the last one of that. But if there isn't any we need
1750            to return this one. Hence remember this, and return it
1751            below. */
1752         if (r == TEST_LEFT)
1753                 step_back = direction == DIRECTION_UP;
1754
1755         if (r == TEST_RIGHT) {
1756                 if (direction == DIRECTION_DOWN)
1757                         goto found;
1758                 else
1759                         return 0;
1760         }
1761
1762         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1763
1764         if (r == 0 && step_back)
1765                 goto found;
1766
1767         if (r > 0 && idx)
1768                 (*idx) ++;
1769
1770         return r;
1771
1772 found:
1773         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1774         if (r < 0)
1775                 return r;
1776
1777         if (ret)
1778                 *ret = o;
1779
1780         if (offset)
1781                 *offset = extra;
1782
1783         if (idx)
1784                 *idx = 0;
1785
1786         return 1;
1787 }
1788
1789 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1790         assert(f);
1791         assert(p > 0);
1792
1793         if (p == needle)
1794                 return TEST_FOUND;
1795         else if (p < needle)
1796                 return TEST_LEFT;
1797         else
1798                 return TEST_RIGHT;
1799 }
1800
1801 int journal_file_move_to_entry_by_offset(
1802                 JournalFile *f,
1803                 uint64_t p,
1804                 direction_t direction,
1805                 Object **ret,
1806                 uint64_t *offset) {
1807
1808         return generic_array_bisect(f,
1809                                     le64toh(f->header->entry_array_offset),
1810                                     le64toh(f->header->n_entries),
1811                                     p,
1812                                     test_object_offset,
1813                                     direction,
1814                                     ret, offset, NULL);
1815 }
1816
1817
1818 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1819         Object *o;
1820         int r;
1821
1822         assert(f);
1823         assert(p > 0);
1824
1825         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1826         if (r < 0)
1827                 return r;
1828
1829         if (le64toh(o->entry.seqnum) == needle)
1830                 return TEST_FOUND;
1831         else if (le64toh(o->entry.seqnum) < needle)
1832                 return TEST_LEFT;
1833         else
1834                 return TEST_RIGHT;
1835 }
1836
1837 int journal_file_move_to_entry_by_seqnum(
1838                 JournalFile *f,
1839                 uint64_t seqnum,
1840                 direction_t direction,
1841                 Object **ret,
1842                 uint64_t *offset) {
1843
1844         return generic_array_bisect(f,
1845                                     le64toh(f->header->entry_array_offset),
1846                                     le64toh(f->header->n_entries),
1847                                     seqnum,
1848                                     test_object_seqnum,
1849                                     direction,
1850                                     ret, offset, NULL);
1851 }
1852
1853 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1854         Object *o;
1855         int r;
1856
1857         assert(f);
1858         assert(p > 0);
1859
1860         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1861         if (r < 0)
1862                 return r;
1863
1864         if (le64toh(o->entry.realtime) == needle)
1865                 return TEST_FOUND;
1866         else if (le64toh(o->entry.realtime) < needle)
1867                 return TEST_LEFT;
1868         else
1869                 return TEST_RIGHT;
1870 }
1871
1872 int journal_file_move_to_entry_by_realtime(
1873                 JournalFile *f,
1874                 uint64_t realtime,
1875                 direction_t direction,
1876                 Object **ret,
1877                 uint64_t *offset) {
1878
1879         return generic_array_bisect(f,
1880                                     le64toh(f->header->entry_array_offset),
1881                                     le64toh(f->header->n_entries),
1882                                     realtime,
1883                                     test_object_realtime,
1884                                     direction,
1885                                     ret, offset, NULL);
1886 }
1887
1888 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1889         Object *o;
1890         int r;
1891
1892         assert(f);
1893         assert(p > 0);
1894
1895         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1896         if (r < 0)
1897                 return r;
1898
1899         if (le64toh(o->entry.monotonic) == needle)
1900                 return TEST_FOUND;
1901         else if (le64toh(o->entry.monotonic) < needle)
1902                 return TEST_LEFT;
1903         else
1904                 return TEST_RIGHT;
1905 }
1906
1907 static inline int find_data_object_by_boot_id(
1908                 JournalFile *f,
1909                 sd_id128_t boot_id,
1910                 Object **o,
1911                 uint64_t *b) {
1912         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1913
1914         sd_id128_to_string(boot_id, t + 9);
1915         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1916 }
1917
1918 int journal_file_move_to_entry_by_monotonic(
1919                 JournalFile *f,
1920                 sd_id128_t boot_id,
1921                 uint64_t monotonic,
1922                 direction_t direction,
1923                 Object **ret,
1924                 uint64_t *offset) {
1925
1926         Object *o;
1927         int r;
1928
1929         assert(f);
1930
1931         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1932         if (r < 0)
1933                 return r;
1934         if (r == 0)
1935                 return -ENOENT;
1936
1937         return generic_array_bisect_plus_one(f,
1938                                              le64toh(o->data.entry_offset),
1939                                              le64toh(o->data.entry_array_offset),
1940                                              le64toh(o->data.n_entries),
1941                                              monotonic,
1942                                              test_object_monotonic,
1943                                              direction,
1944                                              ret, offset, NULL);
1945 }
1946
1947 int journal_file_next_entry(
1948                 JournalFile *f,
1949                 Object *o, uint64_t p,
1950                 direction_t direction,
1951                 Object **ret, uint64_t *offset) {
1952
1953         uint64_t i, n;
1954         int r;
1955
1956         assert(f);
1957         assert(p > 0 || !o);
1958
1959         n = le64toh(f->header->n_entries);
1960         if (n <= 0)
1961                 return 0;
1962
1963         if (!o)
1964                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1965         else {
1966                 if (o->object.type != OBJECT_ENTRY)
1967                         return -EINVAL;
1968
1969                 r = generic_array_bisect(f,
1970                                          le64toh(f->header->entry_array_offset),
1971                                          le64toh(f->header->n_entries),
1972                                          p,
1973                                          test_object_offset,
1974                                          DIRECTION_DOWN,
1975                                          NULL, NULL,
1976                                          &i);
1977                 if (r <= 0)
1978                         return r;
1979
1980                 if (direction == DIRECTION_DOWN) {
1981                         if (i >= n - 1)
1982                                 return 0;
1983
1984                         i++;
1985                 } else {
1986                         if (i <= 0)
1987                                 return 0;
1988
1989                         i--;
1990                 }
1991         }
1992
1993         /* And jump to it */
1994         return generic_array_get(f,
1995                                  le64toh(f->header->entry_array_offset),
1996                                  i,
1997                                  ret, offset);
1998 }
1999
2000 int journal_file_skip_entry(
2001                 JournalFile *f,
2002                 Object *o, uint64_t p,
2003                 int64_t skip,
2004                 Object **ret, uint64_t *offset) {
2005
2006         uint64_t i, n;
2007         int r;
2008
2009         assert(f);
2010         assert(o);
2011         assert(p > 0);
2012
2013         if (o->object.type != OBJECT_ENTRY)
2014                 return -EINVAL;
2015
2016         r = generic_array_bisect(f,
2017                                  le64toh(f->header->entry_array_offset),
2018                                  le64toh(f->header->n_entries),
2019                                  p,
2020                                  test_object_offset,
2021                                  DIRECTION_DOWN,
2022                                  NULL, NULL,
2023                                  &i);
2024         if (r <= 0)
2025                 return r;
2026
2027         /* Calculate new index */
2028         if (skip < 0) {
2029                 if ((uint64_t) -skip >= i)
2030                         i = 0;
2031                 else
2032                         i = i - (uint64_t) -skip;
2033         } else
2034                 i  += (uint64_t) skip;
2035
2036         n = le64toh(f->header->n_entries);
2037         if (n <= 0)
2038                 return -EBADMSG;
2039
2040         if (i >= n)
2041                 i = n-1;
2042
2043         return generic_array_get(f,
2044                                  le64toh(f->header->entry_array_offset),
2045                                  i,
2046                                  ret, offset);
2047 }
2048
2049 int journal_file_next_entry_for_data(
2050                 JournalFile *f,
2051                 Object *o, uint64_t p,
2052                 uint64_t data_offset,
2053                 direction_t direction,
2054                 Object **ret, uint64_t *offset) {
2055
2056         uint64_t n, i;
2057         int r;
2058         Object *d;
2059
2060         assert(f);
2061         assert(p > 0 || !o);
2062
2063         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2064         if (r < 0)
2065                 return r;
2066
2067         n = le64toh(d->data.n_entries);
2068         if (n <= 0)
2069                 return n;
2070
2071         if (!o)
2072                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2073         else {
2074                 if (o->object.type != OBJECT_ENTRY)
2075                         return -EINVAL;
2076
2077                 r = generic_array_bisect_plus_one(f,
2078                                                   le64toh(d->data.entry_offset),
2079                                                   le64toh(d->data.entry_array_offset),
2080                                                   le64toh(d->data.n_entries),
2081                                                   p,
2082                                                   test_object_offset,
2083                                                   DIRECTION_DOWN,
2084                                                   NULL, NULL,
2085                                                   &i);
2086
2087                 if (r <= 0)
2088                         return r;
2089
2090                 if (direction == DIRECTION_DOWN) {
2091                         if (i >= n - 1)
2092                                 return 0;
2093
2094                         i++;
2095                 } else {
2096                         if (i <= 0)
2097                                 return 0;
2098
2099                         i--;
2100                 }
2101
2102         }
2103
2104         return generic_array_get_plus_one(f,
2105                                           le64toh(d->data.entry_offset),
2106                                           le64toh(d->data.entry_array_offset),
2107                                           i,
2108                                           ret, offset);
2109 }
2110
2111 int journal_file_move_to_entry_by_offset_for_data(
2112                 JournalFile *f,
2113                 uint64_t data_offset,
2114                 uint64_t p,
2115                 direction_t direction,
2116                 Object **ret, uint64_t *offset) {
2117
2118         int r;
2119         Object *d;
2120
2121         assert(f);
2122
2123         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2124         if (r < 0)
2125                 return r;
2126
2127         return generic_array_bisect_plus_one(f,
2128                                              le64toh(d->data.entry_offset),
2129                                              le64toh(d->data.entry_array_offset),
2130                                              le64toh(d->data.n_entries),
2131                                              p,
2132                                              test_object_offset,
2133                                              direction,
2134                                              ret, offset, NULL);
2135 }
2136
2137 int journal_file_move_to_entry_by_monotonic_for_data(
2138                 JournalFile *f,
2139                 uint64_t data_offset,
2140                 sd_id128_t boot_id,
2141                 uint64_t monotonic,
2142                 direction_t direction,
2143                 Object **ret, uint64_t *offset) {
2144
2145         Object *o, *d;
2146         int r;
2147         uint64_t b, z;
2148
2149         assert(f);
2150
2151         /* First, seek by time */
2152         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2153         if (r < 0)
2154                 return r;
2155         if (r == 0)
2156                 return -ENOENT;
2157
2158         r = generic_array_bisect_plus_one(f,
2159                                           le64toh(o->data.entry_offset),
2160                                           le64toh(o->data.entry_array_offset),
2161                                           le64toh(o->data.n_entries),
2162                                           monotonic,
2163                                           test_object_monotonic,
2164                                           direction,
2165                                           NULL, &z, NULL);
2166         if (r <= 0)
2167                 return r;
2168
2169         /* And now, continue seeking until we find an entry that
2170          * exists in both bisection arrays */
2171
2172         for (;;) {
2173                 Object *qo;
2174                 uint64_t p, q;
2175
2176                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2177                 if (r < 0)
2178                         return r;
2179
2180                 r = generic_array_bisect_plus_one(f,
2181                                                   le64toh(d->data.entry_offset),
2182                                                   le64toh(d->data.entry_array_offset),
2183                                                   le64toh(d->data.n_entries),
2184                                                   z,
2185                                                   test_object_offset,
2186                                                   direction,
2187                                                   NULL, &p, NULL);
2188                 if (r <= 0)
2189                         return r;
2190
2191                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2192                 if (r < 0)
2193                         return r;
2194
2195                 r = generic_array_bisect_plus_one(f,
2196                                                   le64toh(o->data.entry_offset),
2197                                                   le64toh(o->data.entry_array_offset),
2198                                                   le64toh(o->data.n_entries),
2199                                                   p,
2200                                                   test_object_offset,
2201                                                   direction,
2202                                                   &qo, &q, NULL);
2203
2204                 if (r <= 0)
2205                         return r;
2206
2207                 if (p == q) {
2208                         if (ret)
2209                                 *ret = qo;
2210                         if (offset)
2211                                 *offset = q;
2212
2213                         return 1;
2214                 }
2215
2216                 z = q;
2217         }
2218
2219         return 0;
2220 }
2221
2222 int journal_file_move_to_entry_by_seqnum_for_data(
2223                 JournalFile *f,
2224                 uint64_t data_offset,
2225                 uint64_t seqnum,
2226                 direction_t direction,
2227                 Object **ret, uint64_t *offset) {
2228
2229         Object *d;
2230         int r;
2231
2232         assert(f);
2233
2234         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2235         if (r < 0)
2236                 return r;
2237
2238         return generic_array_bisect_plus_one(f,
2239                                              le64toh(d->data.entry_offset),
2240                                              le64toh(d->data.entry_array_offset),
2241                                              le64toh(d->data.n_entries),
2242                                              seqnum,
2243                                              test_object_seqnum,
2244                                              direction,
2245                                              ret, offset, NULL);
2246 }
2247
2248 int journal_file_move_to_entry_by_realtime_for_data(
2249                 JournalFile *f,
2250                 uint64_t data_offset,
2251                 uint64_t realtime,
2252                 direction_t direction,
2253                 Object **ret, uint64_t *offset) {
2254
2255         Object *d;
2256         int r;
2257
2258         assert(f);
2259
2260         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2261         if (r < 0)
2262                 return r;
2263
2264         return generic_array_bisect_plus_one(f,
2265                                              le64toh(d->data.entry_offset),
2266                                              le64toh(d->data.entry_array_offset),
2267                                              le64toh(d->data.n_entries),
2268                                              realtime,
2269                                              test_object_realtime,
2270                                              direction,
2271                                              ret, offset, NULL);
2272 }
2273
2274 void journal_file_dump(JournalFile *f) {
2275         Object *o;
2276         int r;
2277         uint64_t p;
2278
2279         assert(f);
2280
2281         journal_file_print_header(f);
2282
2283         p = le64toh(f->header->header_size);
2284         while (p != 0) {
2285                 r = journal_file_move_to_object(f, -1, p, &o);
2286                 if (r < 0)
2287                         goto fail;
2288
2289                 switch (o->object.type) {
2290
2291                 case OBJECT_UNUSED:
2292                         printf("Type: OBJECT_UNUSED\n");
2293                         break;
2294
2295                 case OBJECT_DATA:
2296                         printf("Type: OBJECT_DATA\n");
2297                         break;
2298
2299                 case OBJECT_FIELD:
2300                         printf("Type: OBJECT_FIELD\n");
2301                         break;
2302
2303                 case OBJECT_ENTRY:
2304                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2305                                le64toh(o->entry.seqnum),
2306                                le64toh(o->entry.monotonic),
2307                                le64toh(o->entry.realtime));
2308                         break;
2309
2310                 case OBJECT_FIELD_HASH_TABLE:
2311                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2312                         break;
2313
2314                 case OBJECT_DATA_HASH_TABLE:
2315                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2316                         break;
2317
2318                 case OBJECT_ENTRY_ARRAY:
2319                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2320                         break;
2321
2322                 case OBJECT_TAG:
2323                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2324                                le64toh(o->tag.seqnum),
2325                                le64toh(o->tag.epoch));
2326                         break;
2327
2328                 default:
2329                         printf("Type: unknown (%u)\n", o->object.type);
2330                         break;
2331                 }
2332
2333                 if (o->object.flags & OBJECT_COMPRESSED)
2334                         printf("Flags: COMPRESSED\n");
2335
2336                 if (p == le64toh(f->header->tail_object_offset))
2337                         p = 0;
2338                 else
2339                         p = p + ALIGN64(le64toh(o->object.size));
2340         }
2341
2342         return;
2343 fail:
2344         log_error("File corrupt");
2345 }
2346
2347 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2348         const char *x;
2349
2350         x = format_timestamp(buf, l, t);
2351         if (x)
2352                 return x;
2353         return " --- ";
2354 }
2355
2356 void journal_file_print_header(JournalFile *f) {
2357         char a[33], b[33], c[33], d[33];
2358         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2359         struct stat st;
2360         char bytes[FORMAT_BYTES_MAX];
2361
2362         assert(f);
2363
2364         printf("File Path: %s\n"
2365                "File ID: %s\n"
2366                "Machine ID: %s\n"
2367                "Boot ID: %s\n"
2368                "Sequential Number ID: %s\n"
2369                "State: %s\n"
2370                "Compatible Flags:%s%s\n"
2371                "Incompatible Flags:%s%s\n"
2372                "Header size: %"PRIu64"\n"
2373                "Arena size: %"PRIu64"\n"
2374                "Data Hash Table Size: %"PRIu64"\n"
2375                "Field Hash Table Size: %"PRIu64"\n"
2376                "Rotate Suggested: %s\n"
2377                "Head Sequential Number: %"PRIu64"\n"
2378                "Tail Sequential Number: %"PRIu64"\n"
2379                "Head Realtime Timestamp: %s\n"
2380                "Tail Realtime Timestamp: %s\n"
2381                "Tail Monotonic Timestamp: %s\n"
2382                "Objects: %"PRIu64"\n"
2383                "Entry Objects: %"PRIu64"\n",
2384                f->path,
2385                sd_id128_to_string(f->header->file_id, a),
2386                sd_id128_to_string(f->header->machine_id, b),
2387                sd_id128_to_string(f->header->boot_id, c),
2388                sd_id128_to_string(f->header->seqnum_id, d),
2389                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2390                f->header->state == STATE_ONLINE ? "ONLINE" :
2391                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2392                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2393                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2394                JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2395                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2396                le64toh(f->header->header_size),
2397                le64toh(f->header->arena_size),
2398                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2399                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2400                yes_no(journal_file_rotate_suggested(f, 0)),
2401                le64toh(f->header->head_entry_seqnum),
2402                le64toh(f->header->tail_entry_seqnum),
2403                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2404                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2405                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2406                le64toh(f->header->n_objects),
2407                le64toh(f->header->n_entries));
2408
2409         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2410                 printf("Data Objects: %"PRIu64"\n"
2411                        "Data Hash Table Fill: %.1f%%\n",
2412                        le64toh(f->header->n_data),
2413                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2414
2415         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2416                 printf("Field Objects: %"PRIu64"\n"
2417                        "Field Hash Table Fill: %.1f%%\n",
2418                        le64toh(f->header->n_fields),
2419                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2420
2421         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2422                 printf("Tag Objects: %"PRIu64"\n",
2423                        le64toh(f->header->n_tags));
2424         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2425                 printf("Entry Array Objects: %"PRIu64"\n",
2426                        le64toh(f->header->n_entry_arrays));
2427
2428         if (fstat(f->fd, &st) >= 0)
2429                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2430 }
2431
2432 int journal_file_open(
2433                 const char *fname,
2434                 int flags,
2435                 mode_t mode,
2436                 bool compress,
2437                 bool seal,
2438                 JournalMetrics *metrics,
2439                 MMapCache *mmap_cache,
2440                 JournalFile *template,
2441                 JournalFile **ret) {
2442
2443         JournalFile *f;
2444         int r;
2445         bool newly_created = false;
2446
2447         assert(fname);
2448         assert(ret);
2449
2450         if ((flags & O_ACCMODE) != O_RDONLY &&
2451             (flags & O_ACCMODE) != O_RDWR)
2452                 return -EINVAL;
2453
2454         if (!endswith(fname, ".journal") &&
2455             !endswith(fname, ".journal~"))
2456                 return -EINVAL;
2457
2458         f = new0(JournalFile, 1);
2459         if (!f)
2460                 return -ENOMEM;
2461
2462         f->fd = -1;
2463         f->mode = mode;
2464
2465         f->flags = flags;
2466         f->prot = prot_from_flags(flags);
2467         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2468 #ifdef HAVE_XZ
2469         f->compress = compress;
2470 #endif
2471 #ifdef HAVE_GCRYPT
2472         f->seal = seal;
2473 #endif
2474
2475         if (mmap_cache)
2476                 f->mmap = mmap_cache_ref(mmap_cache);
2477         else {
2478                 f->mmap = mmap_cache_new();
2479                 if (!f->mmap) {
2480                         r = -ENOMEM;
2481                         goto fail;
2482                 }
2483         }
2484
2485         f->path = strdup(fname);
2486         if (!f->path) {
2487                 r = -ENOMEM;
2488                 goto fail;
2489         }
2490
2491         f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2492         if (!f->chain_cache) {
2493                 r = -ENOMEM;
2494                 goto fail;
2495         }
2496
2497         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2498         if (f->fd < 0) {
2499                 r = -errno;
2500                 goto fail;
2501         }
2502
2503         if (fstat(f->fd, &f->last_stat) < 0) {
2504                 r = -errno;
2505                 goto fail;
2506         }
2507
2508         if (f->last_stat.st_size == 0 && f->writable) {
2509 #ifdef HAVE_XATTR
2510                 uint64_t crtime;
2511
2512                 /* Let's attach the creation time to the journal file,
2513                  * so that the vacuuming code knows the age of this
2514                  * file even if the file might end up corrupted one
2515                  * day... Ideally we'd just use the creation time many
2516                  * file systems maintain for each file, but there is
2517                  * currently no usable API to query this, hence let's
2518                  * emulate this via extended attributes. If extended
2519                  * attributes are not supported we'll just skip this,
2520                  * and rely solely on mtime/atime/ctime of the file.*/
2521
2522                 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2523                 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2524 #endif
2525
2526 #ifdef HAVE_GCRYPT
2527                 /* Try to load the FSPRG state, and if we can't, then
2528                  * just don't do sealing */
2529                 if (f->seal) {
2530                         r = journal_file_fss_load(f);
2531                         if (r < 0)
2532                                 f->seal = false;
2533                 }
2534 #endif
2535
2536                 r = journal_file_init_header(f, template);
2537                 if (r < 0)
2538                         goto fail;
2539
2540                 if (fstat(f->fd, &f->last_stat) < 0) {
2541                         r = -errno;
2542                         goto fail;
2543                 }
2544
2545                 newly_created = true;
2546         }
2547
2548         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2549                 r = -EIO;
2550                 goto fail;
2551         }
2552
2553         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2554         if (f->header == MAP_FAILED) {
2555                 f->header = NULL;
2556                 r = -errno;
2557                 goto fail;
2558         }
2559
2560         if (!newly_created) {
2561                 r = journal_file_verify_header(f);
2562                 if (r < 0)
2563                         goto fail;
2564         }
2565
2566 #ifdef HAVE_GCRYPT
2567         if (!newly_created && f->writable) {
2568                 r = journal_file_fss_load(f);
2569                 if (r < 0)
2570                         goto fail;
2571         }
2572 #endif
2573
2574         if (f->writable) {
2575                 if (metrics) {
2576                         journal_default_metrics(metrics, f->fd);
2577                         f->metrics = *metrics;
2578                 } else if (template)
2579                         f->metrics = template->metrics;
2580
2581                 r = journal_file_refresh_header(f);
2582                 if (r < 0)
2583                         goto fail;
2584         }
2585
2586 #ifdef HAVE_GCRYPT
2587         r = journal_file_hmac_setup(f);
2588         if (r < 0)
2589                 goto fail;
2590 #endif
2591
2592         if (newly_created) {
2593                 r = journal_file_setup_field_hash_table(f);
2594                 if (r < 0)
2595                         goto fail;
2596
2597                 r = journal_file_setup_data_hash_table(f);
2598                 if (r < 0)
2599                         goto fail;
2600
2601 #ifdef HAVE_GCRYPT
2602                 r = journal_file_append_first_tag(f);
2603                 if (r < 0)
2604                         goto fail;
2605 #endif
2606         }
2607
2608         r = journal_file_map_field_hash_table(f);
2609         if (r < 0)
2610                 goto fail;
2611
2612         r = journal_file_map_data_hash_table(f);
2613         if (r < 0)
2614                 goto fail;
2615
2616         *ret = f;
2617         return 0;
2618
2619 fail:
2620         journal_file_close(f);
2621
2622         return r;
2623 }
2624
2625 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2626         _cleanup_free_ char *p = NULL;
2627         size_t l;
2628         JournalFile *old_file, *new_file = NULL;
2629         int r;
2630
2631         assert(f);
2632         assert(*f);
2633
2634         old_file = *f;
2635
2636         if (!old_file->writable)
2637                 return -EINVAL;
2638
2639         if (!endswith(old_file->path, ".journal"))
2640                 return -EINVAL;
2641
2642         l = strlen(old_file->path);
2643         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2644                      (int) l - 8, old_file->path,
2645                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2646                      le64toh((*f)->header->head_entry_seqnum),
2647                      le64toh((*f)->header->head_entry_realtime));
2648         if (r < 0)
2649                 return -ENOMEM;
2650
2651         r = rename(old_file->path, p);
2652         if (r < 0)
2653                 return -errno;
2654
2655         old_file->header->state = STATE_ARCHIVED;
2656
2657         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2658         journal_file_close(old_file);
2659
2660         *f = new_file;
2661         return r;
2662 }
2663
2664 int journal_file_open_reliably(
2665                 const char *fname,
2666                 int flags,
2667                 mode_t mode,
2668                 bool compress,
2669                 bool seal,
2670                 JournalMetrics *metrics,
2671                 MMapCache *mmap_cache,
2672                 JournalFile *template,
2673                 JournalFile **ret) {
2674
2675         int r;
2676         size_t l;
2677         _cleanup_free_ char *p = NULL;
2678
2679         r = journal_file_open(fname, flags, mode, compress, seal,
2680                               metrics, mmap_cache, template, ret);
2681         if (r != -EBADMSG && /* corrupted */
2682             r != -ENODATA && /* truncated */
2683             r != -EHOSTDOWN && /* other machine */
2684             r != -EPROTONOSUPPORT && /* incompatible feature */
2685             r != -EBUSY && /* unclean shutdown */
2686             r != -ESHUTDOWN /* already archived */)
2687                 return r;
2688
2689         if ((flags & O_ACCMODE) == O_RDONLY)
2690                 return r;
2691
2692         if (!(flags & O_CREAT))
2693                 return r;
2694
2695         if (!endswith(fname, ".journal"))
2696                 return r;
2697
2698         /* The file is corrupted. Rotate it away and try it again (but only once) */
2699
2700         l = strlen(fname);
2701         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2702                      (int) l - 8, fname,
2703                      (unsigned long long) now(CLOCK_REALTIME),
2704                      random_ull()) < 0)
2705                 return -ENOMEM;
2706
2707         r = rename(fname, p);
2708         if (r < 0)
2709                 return -errno;
2710
2711         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2712
2713         return journal_file_open(fname, flags, mode, compress, seal,
2714                                  metrics, mmap_cache, template, ret);
2715 }
2716
2717 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2718         uint64_t i, n;
2719         uint64_t q, xor_hash = 0;
2720         int r;
2721         EntryItem *items;
2722         dual_timestamp ts;
2723
2724         assert(from);
2725         assert(to);
2726         assert(o);
2727         assert(p);
2728
2729         if (!to->writable)
2730                 return -EPERM;
2731
2732         ts.monotonic = le64toh(o->entry.monotonic);
2733         ts.realtime = le64toh(o->entry.realtime);
2734
2735         n = journal_file_entry_n_items(o);
2736         /* alloca() can't take 0, hence let's allocate at least one */
2737         items = alloca(sizeof(EntryItem) * MAX(1u, n));
2738
2739         for (i = 0; i < n; i++) {
2740                 uint64_t l, h;
2741                 le64_t le_hash;
2742                 size_t t;
2743                 void *data;
2744                 Object *u;
2745
2746                 q = le64toh(o->entry.items[i].object_offset);
2747                 le_hash = o->entry.items[i].hash;
2748
2749                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2750                 if (r < 0)
2751                         return r;
2752
2753                 if (le_hash != o->data.hash)
2754                         return -EBADMSG;
2755
2756                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2757                 t = (size_t) l;
2758
2759                 /* We hit the limit on 32bit machines */
2760                 if ((uint64_t) t != l)
2761                         return -E2BIG;
2762
2763                 if (o->object.flags & OBJECT_COMPRESSED) {
2764 #ifdef HAVE_XZ
2765                         uint64_t rsize;
2766
2767                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0))
2768                                 return -EBADMSG;
2769
2770                         data = from->compress_buffer;
2771                         l = rsize;
2772 #else
2773                         return -EPROTONOSUPPORT;
2774 #endif
2775                 } else
2776                         data = o->data.payload;
2777
2778                 r = journal_file_append_data(to, data, l, &u, &h);
2779                 if (r < 0)
2780                         return r;
2781
2782                 xor_hash ^= le64toh(u->data.hash);
2783                 items[i].object_offset = htole64(h);
2784                 items[i].hash = u->data.hash;
2785
2786                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2787                 if (r < 0)
2788                         return r;
2789         }
2790
2791         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2792 }
2793
2794 void journal_default_metrics(JournalMetrics *m, int fd) {
2795         uint64_t fs_size = 0;
2796         struct statvfs ss;
2797         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2798
2799         assert(m);
2800         assert(fd >= 0);
2801
2802         if (fstatvfs(fd, &ss) >= 0)
2803                 fs_size = ss.f_frsize * ss.f_blocks;
2804
2805         if (m->max_use == (uint64_t) -1) {
2806
2807                 if (fs_size > 0) {
2808                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2809
2810                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2811                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2812
2813                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2814                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2815                 } else
2816                         m->max_use = DEFAULT_MAX_USE_LOWER;
2817         } else {
2818                 m->max_use = PAGE_ALIGN(m->max_use);
2819
2820                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2821                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2822         }
2823
2824         if (m->max_size == (uint64_t) -1) {
2825                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2826
2827                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2828                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2829         } else
2830                 m->max_size = PAGE_ALIGN(m->max_size);
2831
2832         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2833                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2834
2835         if (m->max_size*2 > m->max_use)
2836                 m->max_use = m->max_size*2;
2837
2838         if (m->min_size == (uint64_t) -1)
2839                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2840         else {
2841                 m->min_size = PAGE_ALIGN(m->min_size);
2842
2843                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2844                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2845
2846                 if (m->min_size > m->max_size)
2847                         m->max_size = m->min_size;
2848         }
2849
2850         if (m->keep_free == (uint64_t) -1) {
2851
2852                 if (fs_size > 0) {
2853                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2854
2855                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2856                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2857
2858                 } else
2859                         m->keep_free = DEFAULT_KEEP_FREE;
2860         }
2861
2862         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2863                   format_bytes(a, sizeof(a), m->max_use),
2864                   format_bytes(b, sizeof(b), m->max_size),
2865                   format_bytes(c, sizeof(c), m->min_size),
2866                   format_bytes(d, sizeof(d), m->keep_free));
2867 }
2868
2869 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2870         assert(f);
2871         assert(from || to);
2872
2873         if (from) {
2874                 if (f->header->head_entry_realtime == 0)
2875                         return -ENOENT;
2876
2877                 *from = le64toh(f->header->head_entry_realtime);
2878         }
2879
2880         if (to) {
2881                 if (f->header->tail_entry_realtime == 0)
2882                         return -ENOENT;
2883
2884                 *to = le64toh(f->header->tail_entry_realtime);
2885         }
2886
2887         return 1;
2888 }
2889
2890 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2891         Object *o;
2892         uint64_t p;
2893         int r;
2894
2895         assert(f);
2896         assert(from || to);
2897
2898         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2899         if (r <= 0)
2900                 return r;
2901
2902         if (le64toh(o->data.n_entries) <= 0)
2903                 return 0;
2904
2905         if (from) {
2906                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2907                 if (r < 0)
2908                         return r;
2909
2910                 *from = le64toh(o->entry.monotonic);
2911         }
2912
2913         if (to) {
2914                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2915                 if (r < 0)
2916                         return r;
2917
2918                 r = generic_array_get_plus_one(f,
2919                                                le64toh(o->data.entry_offset),
2920                                                le64toh(o->data.entry_array_offset),
2921                                                le64toh(o->data.n_entries)-1,
2922                                                &o, NULL);
2923                 if (r <= 0)
2924                         return r;
2925
2926                 *to = le64toh(o->entry.monotonic);
2927         }
2928
2929         return 1;
2930 }
2931
2932 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2933         assert(f);
2934
2935         /* If we gained new header fields we gained new features,
2936          * hence suggest a rotation */
2937         if (le64toh(f->header->header_size) < sizeof(Header)) {
2938                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2939                 return true;
2940         }
2941
2942         /* Let's check if the hash tables grew over a certain fill
2943          * level (75%, borrowing this value from Java's hash table
2944          * implementation), and if so suggest a rotation. To calculate
2945          * the fill level we need the n_data field, which only exists
2946          * in newer versions. */
2947
2948         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2949                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2950                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2951                                   f->path,
2952                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2953                                   le64toh(f->header->n_data),
2954                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2955                                   (unsigned long long) f->last_stat.st_size,
2956                                   f->last_stat.st_size / le64toh(f->header->n_data));
2957                         return true;
2958                 }
2959
2960         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2961                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2962                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2963                                   f->path,
2964                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2965                                   le64toh(f->header->n_fields),
2966                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2967                         return true;
2968                 }
2969
2970         /* Are the data objects properly indexed by field objects? */
2971         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2972             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2973             le64toh(f->header->n_data) > 0 &&
2974             le64toh(f->header->n_fields) == 0)
2975                 return true;
2976
2977         if (max_file_usec > 0) {
2978                 usec_t t, h;
2979
2980                 h = le64toh(f->header->head_entry_realtime);
2981                 t = now(CLOCK_REALTIME);
2982
2983                 if (h > 0 && t > h + max_file_usec)
2984                         return true;
2985         }
2986
2987         return false;
2988 }