chiark / gitweb /
build-sys: add sd_j_open_container manpage alias
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #ifdef HAVE_XATTR
31 #include <attr/xattr.h>
32 #endif
33
34 #include "journal-def.h"
35 #include "journal-file.h"
36 #include "journal-authenticate.h"
37 #include "lookup3.h"
38 #include "compress.h"
39 #include "fsprg.h"
40
41 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
42 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
43
44 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
45
46 /* This is the minimum journal file size */
47 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL)           /* 4 MiB */
48
49 /* These are the lower and upper bounds if we deduce the max_use value
50  * from the file system size */
51 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
52 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
53
54 /* This is the upper bound if we deduce max_size from max_use */
55 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
56
57 /* This is the upper bound if we deduce the keep_free value from the
58  * file system size */
59 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
60
61 /* This is the keep_free value when we can't determine the system
62  * size */
63 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MB */
64
65 /* n_data was the first entry we added after the initial file format design */
66 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
67
68 /* How many entries to keep in the entry array chain cache at max */
69 #define CHAIN_CACHE_MAX 20
70
71 /* How much to increase the journal file size at once each time we allocate something new. */
72 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8MB */
73
74 static int journal_file_set_online(JournalFile *f) {
75         assert(f);
76
77         if (!f->writable)
78                 return -EPERM;
79
80         if (!(f->fd >= 0 && f->header))
81                 return -EINVAL;
82
83         switch(f->header->state) {
84                 case STATE_ONLINE:
85                         return 0;
86
87                 case STATE_OFFLINE:
88                         f->header->state = STATE_ONLINE;
89                         fsync(f->fd);
90                         return 0;
91
92                 default:
93                         return -EINVAL;
94         }
95 }
96
97 int journal_file_set_offline(JournalFile *f) {
98         assert(f);
99
100         if (!f->writable)
101                 return -EPERM;
102
103         if (!(f->fd >= 0 && f->header))
104                 return -EINVAL;
105
106         if (f->header->state != STATE_ONLINE)
107                 return 0;
108
109         fsync(f->fd);
110
111         f->header->state = STATE_OFFLINE;
112
113         fsync(f->fd);
114
115         return 0;
116 }
117
118 void journal_file_close(JournalFile *f) {
119         assert(f);
120
121 #ifdef HAVE_GCRYPT
122         /* Write the final tag */
123         if (f->seal && f->writable)
124                 journal_file_append_tag(f);
125 #endif
126
127         /* Sync everything to disk, before we mark the file offline */
128         if (f->mmap && f->fd >= 0)
129                 mmap_cache_close_fd(f->mmap, f->fd);
130
131         journal_file_set_offline(f);
132
133         if (f->header)
134                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
135
136         if (f->fd >= 0)
137                 close_nointr_nofail(f->fd);
138
139         free(f->path);
140
141         if (f->mmap)
142                 mmap_cache_unref(f->mmap);
143
144         hashmap_free_free(f->chain_cache);
145
146 #ifdef HAVE_XZ
147         free(f->compress_buffer);
148 #endif
149
150 #ifdef HAVE_GCRYPT
151         if (f->fss_file)
152                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
153         else if (f->fsprg_state)
154                 free(f->fsprg_state);
155
156         free(f->fsprg_seed);
157
158         if (f->hmac)
159                 gcry_md_close(f->hmac);
160 #endif
161
162         free(f);
163 }
164
165 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
166         Header h;
167         ssize_t k;
168         int r;
169
170         assert(f);
171
172         zero(h);
173         memcpy(h.signature, HEADER_SIGNATURE, 8);
174         h.header_size = htole64(ALIGN64(sizeof(h)));
175
176         h.incompatible_flags =
177                 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
178
179         h.compatible_flags =
180                 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
181
182         r = sd_id128_randomize(&h.file_id);
183         if (r < 0)
184                 return r;
185
186         if (template) {
187                 h.seqnum_id = template->header->seqnum_id;
188                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
189         } else
190                 h.seqnum_id = h.file_id;
191
192         k = pwrite(f->fd, &h, sizeof(h), 0);
193         if (k < 0)
194                 return -errno;
195
196         if (k != sizeof(h))
197                 return -EIO;
198
199         return 0;
200 }
201
202 static int journal_file_refresh_header(JournalFile *f) {
203         int r;
204         sd_id128_t boot_id;
205
206         assert(f);
207
208         r = sd_id128_get_machine(&f->header->machine_id);
209         if (r < 0)
210                 return r;
211
212         r = sd_id128_get_boot(&boot_id);
213         if (r < 0)
214                 return r;
215
216         if (sd_id128_equal(boot_id, f->header->boot_id))
217                 f->tail_entry_monotonic_valid = true;
218
219         f->header->boot_id = boot_id;
220
221         journal_file_set_online(f);
222
223         /* Sync the online state to disk */
224         fsync(f->fd);
225
226         return 0;
227 }
228
229 static int journal_file_verify_header(JournalFile *f) {
230         assert(f);
231
232         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
233                 return -EBADMSG;
234
235         /* In both read and write mode we refuse to open files with
236          * incompatible flags we don't know */
237 #ifdef HAVE_XZ
238         if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
239                 return -EPROTONOSUPPORT;
240 #else
241         if (f->header->incompatible_flags != 0)
242                 return -EPROTONOSUPPORT;
243 #endif
244
245         /* When open for writing we refuse to open files with
246          * compatible flags, too */
247         if (f->writable) {
248 #ifdef HAVE_GCRYPT
249                 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
250                         return -EPROTONOSUPPORT;
251 #else
252                 if (f->header->compatible_flags != 0)
253                         return -EPROTONOSUPPORT;
254 #endif
255         }
256
257         if (f->header->state >= _STATE_MAX)
258                 return -EBADMSG;
259
260         /* The first addition was n_data, so check that we are at least this large */
261         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
262                 return -EBADMSG;
263
264         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
265                 return -EBADMSG;
266
267         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
268                 return -ENODATA;
269
270         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
271                 return -ENODATA;
272
273         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
274             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
275             !VALID64(le64toh(f->header->tail_object_offset)) ||
276             !VALID64(le64toh(f->header->entry_array_offset)))
277                 return -ENODATA;
278
279         if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
280             le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
281             le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
282             le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
283                 return -ENODATA;
284
285         if (f->writable) {
286                 uint8_t state;
287                 sd_id128_t machine_id;
288                 int r;
289
290                 r = sd_id128_get_machine(&machine_id);
291                 if (r < 0)
292                         return r;
293
294                 if (!sd_id128_equal(machine_id, f->header->machine_id))
295                         return -EHOSTDOWN;
296
297                 state = f->header->state;
298
299                 if (state == STATE_ONLINE) {
300                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
301                         return -EBUSY;
302                 } else if (state == STATE_ARCHIVED)
303                         return -ESHUTDOWN;
304                 else if (state != STATE_OFFLINE) {
305                         log_debug("Journal file %s has unknown state %u.", f->path, state);
306                         return -EBUSY;
307                 }
308         }
309
310         f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
311
312         f->seal = JOURNAL_HEADER_SEALED(f->header);
313
314         return 0;
315 }
316
317 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
318         uint64_t old_size, new_size;
319         int r;
320
321         assert(f);
322
323         /* We assume that this file is not sparse, and we know that
324          * for sure, since we always call posix_fallocate()
325          * ourselves */
326
327         old_size =
328                 le64toh(f->header->header_size) +
329                 le64toh(f->header->arena_size);
330
331         new_size = PAGE_ALIGN(offset + size);
332         if (new_size < le64toh(f->header->header_size))
333                 new_size = le64toh(f->header->header_size);
334
335         if (new_size <= old_size)
336                 return 0;
337
338         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
339                 return -E2BIG;
340
341         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
342                 struct statvfs svfs;
343
344                 if (fstatvfs(f->fd, &svfs) >= 0) {
345                         uint64_t available;
346
347                         available = svfs.f_bfree * svfs.f_bsize;
348
349                         if (available >= f->metrics.keep_free)
350                                 available -= f->metrics.keep_free;
351                         else
352                                 available = 0;
353
354                         if (new_size - old_size > available)
355                                 return -E2BIG;
356                 }
357         }
358
359         /* Increase by larger blocks at once */
360         new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
361         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
362                 new_size = f->metrics.max_size;
363
364         /* Note that the glibc fallocate() fallback is very
365            inefficient, hence we try to minimize the allocation area
366            as we can. */
367         r = posix_fallocate(f->fd, old_size, new_size - old_size);
368         if (r != 0)
369                 return -r;
370
371         if (fstat(f->fd, &f->last_stat) < 0)
372                 return -errno;
373
374         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
375
376         return 0;
377 }
378
379 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
380         assert(f);
381         assert(ret);
382
383         if (size <= 0)
384                 return -EINVAL;
385
386         /* Avoid SIGBUS on invalid accesses */
387         if (offset + size > (uint64_t) f->last_stat.st_size) {
388                 /* Hmm, out of range? Let's refresh the fstat() data
389                  * first, before we trust that check. */
390
391                 if (fstat(f->fd, &f->last_stat) < 0 ||
392                     offset + size > (uint64_t) f->last_stat.st_size)
393                         return -EADDRNOTAVAIL;
394         }
395
396         return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
397 }
398
399 static uint64_t minimum_header_size(Object *o) {
400
401         static const uint64_t table[] = {
402                 [OBJECT_DATA] = sizeof(DataObject),
403                 [OBJECT_FIELD] = sizeof(FieldObject),
404                 [OBJECT_ENTRY] = sizeof(EntryObject),
405                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
406                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
407                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
408                 [OBJECT_TAG] = sizeof(TagObject),
409         };
410
411         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
412                 return sizeof(ObjectHeader);
413
414         return table[o->object.type];
415 }
416
417 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
418         int r;
419         void *t;
420         Object *o;
421         uint64_t s;
422         unsigned context;
423
424         assert(f);
425         assert(ret);
426
427         /* Objects may only be located at multiple of 64 bit */
428         if (!VALID64(offset))
429                 return -EFAULT;
430
431         /* One context for each type, plus one catch-all for the rest */
432         context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
433
434         r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
435         if (r < 0)
436                 return r;
437
438         o = (Object*) t;
439         s = le64toh(o->object.size);
440
441         if (s < sizeof(ObjectHeader))
442                 return -EBADMSG;
443
444         if (o->object.type <= OBJECT_UNUSED)
445                 return -EBADMSG;
446
447         if (s < minimum_header_size(o))
448                 return -EBADMSG;
449
450         if (type > 0 && o->object.type != type)
451                 return -EBADMSG;
452
453         if (s > sizeof(ObjectHeader)) {
454                 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
455                 if (r < 0)
456                         return r;
457
458                 o = (Object*) t;
459         }
460
461         *ret = o;
462         return 0;
463 }
464
465 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
466         uint64_t r;
467
468         assert(f);
469
470         r = le64toh(f->header->tail_entry_seqnum) + 1;
471
472         if (seqnum) {
473                 /* If an external seqnum counter was passed, we update
474                  * both the local and the external one, and set it to
475                  * the maximum of both */
476
477                 if (*seqnum + 1 > r)
478                         r = *seqnum + 1;
479
480                 *seqnum = r;
481         }
482
483         f->header->tail_entry_seqnum = htole64(r);
484
485         if (f->header->head_entry_seqnum == 0)
486                 f->header->head_entry_seqnum = htole64(r);
487
488         return r;
489 }
490
491 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
492         int r;
493         uint64_t p;
494         Object *tail, *o;
495         void *t;
496
497         assert(f);
498         assert(type > 0 && type < _OBJECT_TYPE_MAX);
499         assert(size >= sizeof(ObjectHeader));
500         assert(offset);
501         assert(ret);
502
503         r = journal_file_set_online(f);
504         if (r < 0)
505                 return r;
506
507         p = le64toh(f->header->tail_object_offset);
508         if (p == 0)
509                 p = le64toh(f->header->header_size);
510         else {
511                 r = journal_file_move_to_object(f, -1, p, &tail);
512                 if (r < 0)
513                         return r;
514
515                 p += ALIGN64(le64toh(tail->object.size));
516         }
517
518         r = journal_file_allocate(f, p, size);
519         if (r < 0)
520                 return r;
521
522         r = journal_file_move_to(f, type, false, p, size, &t);
523         if (r < 0)
524                 return r;
525
526         o = (Object*) t;
527
528         zero(o->object);
529         o->object.type = type;
530         o->object.size = htole64(size);
531
532         f->header->tail_object_offset = htole64(p);
533         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
534
535         *ret = o;
536         *offset = p;
537
538         return 0;
539 }
540
541 static int journal_file_setup_data_hash_table(JournalFile *f) {
542         uint64_t s, p;
543         Object *o;
544         int r;
545
546         assert(f);
547
548         /* We estimate that we need 1 hash table entry per 768 of
549            journal file and we want to make sure we never get beyond
550            75% fill level. Calculate the hash table size for the
551            maximum file size based on these metrics. */
552
553         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
554         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
555                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
556
557         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
558
559         r = journal_file_append_object(f,
560                                        OBJECT_DATA_HASH_TABLE,
561                                        offsetof(Object, hash_table.items) + s,
562                                        &o, &p);
563         if (r < 0)
564                 return r;
565
566         memset(o->hash_table.items, 0, s);
567
568         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
569         f->header->data_hash_table_size = htole64(s);
570
571         return 0;
572 }
573
574 static int journal_file_setup_field_hash_table(JournalFile *f) {
575         uint64_t s, p;
576         Object *o;
577         int r;
578
579         assert(f);
580
581         /* We use a fixed size hash table for the fields as this
582          * number should grow very slowly only */
583
584         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
585         r = journal_file_append_object(f,
586                                        OBJECT_FIELD_HASH_TABLE,
587                                        offsetof(Object, hash_table.items) + s,
588                                        &o, &p);
589         if (r < 0)
590                 return r;
591
592         memset(o->hash_table.items, 0, s);
593
594         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
595         f->header->field_hash_table_size = htole64(s);
596
597         return 0;
598 }
599
600 static int journal_file_map_data_hash_table(JournalFile *f) {
601         uint64_t s, p;
602         void *t;
603         int r;
604
605         assert(f);
606
607         p = le64toh(f->header->data_hash_table_offset);
608         s = le64toh(f->header->data_hash_table_size);
609
610         r = journal_file_move_to(f,
611                                  OBJECT_DATA_HASH_TABLE,
612                                  true,
613                                  p, s,
614                                  &t);
615         if (r < 0)
616                 return r;
617
618         f->data_hash_table = t;
619         return 0;
620 }
621
622 static int journal_file_map_field_hash_table(JournalFile *f) {
623         uint64_t s, p;
624         void *t;
625         int r;
626
627         assert(f);
628
629         p = le64toh(f->header->field_hash_table_offset);
630         s = le64toh(f->header->field_hash_table_size);
631
632         r = journal_file_move_to(f,
633                                  OBJECT_FIELD_HASH_TABLE,
634                                  true,
635                                  p, s,
636                                  &t);
637         if (r < 0)
638                 return r;
639
640         f->field_hash_table = t;
641         return 0;
642 }
643
644 static int journal_file_link_field(
645                 JournalFile *f,
646                 Object *o,
647                 uint64_t offset,
648                 uint64_t hash) {
649
650         uint64_t p, h;
651         int r;
652
653         assert(f);
654         assert(o);
655         assert(offset > 0);
656
657         if (o->object.type != OBJECT_FIELD)
658                 return -EINVAL;
659
660         /* This might alter the window we are looking at */
661
662         o->field.next_hash_offset = o->field.head_data_offset = 0;
663
664         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
665         p = le64toh(f->field_hash_table[h].tail_hash_offset);
666         if (p == 0)
667                 f->field_hash_table[h].head_hash_offset = htole64(offset);
668         else {
669                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
670                 if (r < 0)
671                         return r;
672
673                 o->field.next_hash_offset = htole64(offset);
674         }
675
676         f->field_hash_table[h].tail_hash_offset = htole64(offset);
677
678         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
679                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
680
681         return 0;
682 }
683
684 static int journal_file_link_data(
685                 JournalFile *f,
686                 Object *o,
687                 uint64_t offset,
688                 uint64_t hash) {
689
690         uint64_t p, h;
691         int r;
692
693         assert(f);
694         assert(o);
695         assert(offset > 0);
696
697         if (o->object.type != OBJECT_DATA)
698                 return -EINVAL;
699
700         /* This might alter the window we are looking at */
701
702         o->data.next_hash_offset = o->data.next_field_offset = 0;
703         o->data.entry_offset = o->data.entry_array_offset = 0;
704         o->data.n_entries = 0;
705
706         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
707         p = le64toh(f->data_hash_table[h].tail_hash_offset);
708         if (p == 0)
709                 /* Only entry in the hash table is easy */
710                 f->data_hash_table[h].head_hash_offset = htole64(offset);
711         else {
712                 /* Move back to the previous data object, to patch in
713                  * pointer */
714
715                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
716                 if (r < 0)
717                         return r;
718
719                 o->data.next_hash_offset = htole64(offset);
720         }
721
722         f->data_hash_table[h].tail_hash_offset = htole64(offset);
723
724         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
725                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
726
727         return 0;
728 }
729
730 int journal_file_find_field_object_with_hash(
731                 JournalFile *f,
732                 const void *field, uint64_t size, uint64_t hash,
733                 Object **ret, uint64_t *offset) {
734
735         uint64_t p, osize, h;
736         int r;
737
738         assert(f);
739         assert(field && size > 0);
740
741         osize = offsetof(Object, field.payload) + size;
742
743         if (f->header->field_hash_table_size == 0)
744                 return -EBADMSG;
745
746         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
747         p = le64toh(f->field_hash_table[h].head_hash_offset);
748
749         while (p > 0) {
750                 Object *o;
751
752                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
753                 if (r < 0)
754                         return r;
755
756                 if (le64toh(o->field.hash) == hash &&
757                     le64toh(o->object.size) == osize &&
758                     memcmp(o->field.payload, field, size) == 0) {
759
760                         if (ret)
761                                 *ret = o;
762                         if (offset)
763                                 *offset = p;
764
765                         return 1;
766                 }
767
768                 p = le64toh(o->field.next_hash_offset);
769         }
770
771         return 0;
772 }
773
774 int journal_file_find_field_object(
775                 JournalFile *f,
776                 const void *field, uint64_t size,
777                 Object **ret, uint64_t *offset) {
778
779         uint64_t hash;
780
781         assert(f);
782         assert(field && size > 0);
783
784         hash = hash64(field, size);
785
786         return journal_file_find_field_object_with_hash(f,
787                                                         field, size, hash,
788                                                         ret, offset);
789 }
790
791 int journal_file_find_data_object_with_hash(
792                 JournalFile *f,
793                 const void *data, uint64_t size, uint64_t hash,
794                 Object **ret, uint64_t *offset) {
795
796         uint64_t p, osize, h;
797         int r;
798
799         assert(f);
800         assert(data || size == 0);
801
802         osize = offsetof(Object, data.payload) + size;
803
804         if (f->header->data_hash_table_size == 0)
805                 return -EBADMSG;
806
807         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
808         p = le64toh(f->data_hash_table[h].head_hash_offset);
809
810         while (p > 0) {
811                 Object *o;
812
813                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
814                 if (r < 0)
815                         return r;
816
817                 if (le64toh(o->data.hash) != hash)
818                         goto next;
819
820                 if (o->object.flags & OBJECT_COMPRESSED) {
821 #ifdef HAVE_XZ
822                         uint64_t l, rsize;
823
824                         l = le64toh(o->object.size);
825                         if (l <= offsetof(Object, data.payload))
826                                 return -EBADMSG;
827
828                         l -= offsetof(Object, data.payload);
829
830                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0))
831                                 return -EBADMSG;
832
833                         if (rsize == size &&
834                             memcmp(f->compress_buffer, data, size) == 0) {
835
836                                 if (ret)
837                                         *ret = o;
838
839                                 if (offset)
840                                         *offset = p;
841
842                                 return 1;
843                         }
844 #else
845                         return -EPROTONOSUPPORT;
846 #endif
847
848                 } else if (le64toh(o->object.size) == osize &&
849                            memcmp(o->data.payload, data, size) == 0) {
850
851                         if (ret)
852                                 *ret = o;
853
854                         if (offset)
855                                 *offset = p;
856
857                         return 1;
858                 }
859
860         next:
861                 p = le64toh(o->data.next_hash_offset);
862         }
863
864         return 0;
865 }
866
867 int journal_file_find_data_object(
868                 JournalFile *f,
869                 const void *data, uint64_t size,
870                 Object **ret, uint64_t *offset) {
871
872         uint64_t hash;
873
874         assert(f);
875         assert(data || size == 0);
876
877         hash = hash64(data, size);
878
879         return journal_file_find_data_object_with_hash(f,
880                                                        data, size, hash,
881                                                        ret, offset);
882 }
883
884 static int journal_file_append_field(
885                 JournalFile *f,
886                 const void *field, uint64_t size,
887                 Object **ret, uint64_t *offset) {
888
889         uint64_t hash, p;
890         uint64_t osize;
891         Object *o;
892         int r;
893
894         assert(f);
895         assert(field && size > 0);
896
897         hash = hash64(field, size);
898
899         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
900         if (r < 0)
901                 return r;
902         else if (r > 0) {
903
904                 if (ret)
905                         *ret = o;
906
907                 if (offset)
908                         *offset = p;
909
910                 return 0;
911         }
912
913         osize = offsetof(Object, field.payload) + size;
914         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
915         if (r < 0)
916                 return r;
917
918         o->field.hash = htole64(hash);
919         memcpy(o->field.payload, field, size);
920
921         r = journal_file_link_field(f, o, p, hash);
922         if (r < 0)
923                 return r;
924
925         /* The linking might have altered the window, so let's
926          * refresh our pointer */
927         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
928         if (r < 0)
929                 return r;
930
931 #ifdef HAVE_GCRYPT
932         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
933         if (r < 0)
934                 return r;
935 #endif
936
937         if (ret)
938                 *ret = o;
939
940         if (offset)
941                 *offset = p;
942
943         return 0;
944 }
945
946 static int journal_file_append_data(
947                 JournalFile *f,
948                 const void *data, uint64_t size,
949                 Object **ret, uint64_t *offset) {
950
951         uint64_t hash, p;
952         uint64_t osize;
953         Object *o;
954         int r;
955         bool compressed = false;
956         const void *eq;
957
958         assert(f);
959         assert(data || size == 0);
960
961         hash = hash64(data, size);
962
963         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
964         if (r < 0)
965                 return r;
966         else if (r > 0) {
967
968                 if (ret)
969                         *ret = o;
970
971                 if (offset)
972                         *offset = p;
973
974                 return 0;
975         }
976
977         osize = offsetof(Object, data.payload) + size;
978         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
979         if (r < 0)
980                 return r;
981
982         o->data.hash = htole64(hash);
983
984 #ifdef HAVE_XZ
985         if (f->compress &&
986             size >= COMPRESSION_SIZE_THRESHOLD) {
987                 uint64_t rsize;
988
989                 compressed = compress_blob(data, size, o->data.payload, &rsize);
990
991                 if (compressed) {
992                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
993                         o->object.flags |= OBJECT_COMPRESSED;
994
995                         log_debug("Compressed data object %"PRIu64" -> %"PRIu64, size, rsize);
996                 }
997         }
998 #endif
999
1000         if (!compressed && size > 0)
1001                 memcpy(o->data.payload, data, size);
1002
1003         r = journal_file_link_data(f, o, p, hash);
1004         if (r < 0)
1005                 return r;
1006
1007         /* The linking might have altered the window, so let's
1008          * refresh our pointer */
1009         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1010         if (r < 0)
1011                 return r;
1012
1013         if (!data)
1014                 eq = NULL;
1015         else
1016                 eq = memchr(data, '=', size);
1017         if (eq && eq > data) {
1018                 uint64_t fp;
1019                 Object *fo;
1020
1021                 /* Create field object ... */
1022                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1023                 if (r < 0)
1024                         return r;
1025
1026                 /* ... and link it in. */
1027                 o->data.next_field_offset = fo->field.head_data_offset;
1028                 fo->field.head_data_offset = le64toh(p);
1029         }
1030
1031 #ifdef HAVE_GCRYPT
1032         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1033         if (r < 0)
1034                 return r;
1035 #endif
1036
1037         if (ret)
1038                 *ret = o;
1039
1040         if (offset)
1041                 *offset = p;
1042
1043         return 0;
1044 }
1045
1046 uint64_t journal_file_entry_n_items(Object *o) {
1047         assert(o);
1048
1049         if (o->object.type != OBJECT_ENTRY)
1050                 return 0;
1051
1052         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1053 }
1054
1055 uint64_t journal_file_entry_array_n_items(Object *o) {
1056         assert(o);
1057
1058         if (o->object.type != OBJECT_ENTRY_ARRAY)
1059                 return 0;
1060
1061         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1062 }
1063
1064 uint64_t journal_file_hash_table_n_items(Object *o) {
1065         assert(o);
1066
1067         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1068             o->object.type != OBJECT_FIELD_HASH_TABLE)
1069                 return 0;
1070
1071         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1072 }
1073
1074 static int link_entry_into_array(JournalFile *f,
1075                                  le64_t *first,
1076                                  le64_t *idx,
1077                                  uint64_t p) {
1078         int r;
1079         uint64_t n = 0, ap = 0, q, i, a, hidx;
1080         Object *o;
1081
1082         assert(f);
1083         assert(first);
1084         assert(idx);
1085         assert(p > 0);
1086
1087         a = le64toh(*first);
1088         i = hidx = le64toh(*idx);
1089         while (a > 0) {
1090
1091                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1092                 if (r < 0)
1093                         return r;
1094
1095                 n = journal_file_entry_array_n_items(o);
1096                 if (i < n) {
1097                         o->entry_array.items[i] = htole64(p);
1098                         *idx = htole64(hidx + 1);
1099                         return 0;
1100                 }
1101
1102                 i -= n;
1103                 ap = a;
1104                 a = le64toh(o->entry_array.next_entry_array_offset);
1105         }
1106
1107         if (hidx > n)
1108                 n = (hidx+1) * 2;
1109         else
1110                 n = n * 2;
1111
1112         if (n < 4)
1113                 n = 4;
1114
1115         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1116                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1117                                        &o, &q);
1118         if (r < 0)
1119                 return r;
1120
1121 #ifdef HAVE_GCRYPT
1122         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1123         if (r < 0)
1124                 return r;
1125 #endif
1126
1127         o->entry_array.items[i] = htole64(p);
1128
1129         if (ap == 0)
1130                 *first = htole64(q);
1131         else {
1132                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1133                 if (r < 0)
1134                         return r;
1135
1136                 o->entry_array.next_entry_array_offset = htole64(q);
1137         }
1138
1139         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1140                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1141
1142         *idx = htole64(hidx + 1);
1143
1144         return 0;
1145 }
1146
1147 static int link_entry_into_array_plus_one(JournalFile *f,
1148                                           le64_t *extra,
1149                                           le64_t *first,
1150                                           le64_t *idx,
1151                                           uint64_t p) {
1152
1153         int r;
1154
1155         assert(f);
1156         assert(extra);
1157         assert(first);
1158         assert(idx);
1159         assert(p > 0);
1160
1161         if (*idx == 0)
1162                 *extra = htole64(p);
1163         else {
1164                 le64_t i;
1165
1166                 i = htole64(le64toh(*idx) - 1);
1167                 r = link_entry_into_array(f, first, &i, p);
1168                 if (r < 0)
1169                         return r;
1170         }
1171
1172         *idx = htole64(le64toh(*idx) + 1);
1173         return 0;
1174 }
1175
1176 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1177         uint64_t p;
1178         int r;
1179         assert(f);
1180         assert(o);
1181         assert(offset > 0);
1182
1183         p = le64toh(o->entry.items[i].object_offset);
1184         if (p == 0)
1185                 return -EINVAL;
1186
1187         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1188         if (r < 0)
1189                 return r;
1190
1191         return link_entry_into_array_plus_one(f,
1192                                               &o->data.entry_offset,
1193                                               &o->data.entry_array_offset,
1194                                               &o->data.n_entries,
1195                                               offset);
1196 }
1197
1198 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1199         uint64_t n, i;
1200         int r;
1201
1202         assert(f);
1203         assert(o);
1204         assert(offset > 0);
1205
1206         if (o->object.type != OBJECT_ENTRY)
1207                 return -EINVAL;
1208
1209         __sync_synchronize();
1210
1211         /* Link up the entry itself */
1212         r = link_entry_into_array(f,
1213                                   &f->header->entry_array_offset,
1214                                   &f->header->n_entries,
1215                                   offset);
1216         if (r < 0)
1217                 return r;
1218
1219         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1220
1221         if (f->header->head_entry_realtime == 0)
1222                 f->header->head_entry_realtime = o->entry.realtime;
1223
1224         f->header->tail_entry_realtime = o->entry.realtime;
1225         f->header->tail_entry_monotonic = o->entry.monotonic;
1226
1227         f->tail_entry_monotonic_valid = true;
1228
1229         /* Link up the items */
1230         n = journal_file_entry_n_items(o);
1231         for (i = 0; i < n; i++) {
1232                 r = journal_file_link_entry_item(f, o, offset, i);
1233                 if (r < 0)
1234                         return r;
1235         }
1236
1237         return 0;
1238 }
1239
1240 static int journal_file_append_entry_internal(
1241                 JournalFile *f,
1242                 const dual_timestamp *ts,
1243                 uint64_t xor_hash,
1244                 const EntryItem items[], unsigned n_items,
1245                 uint64_t *seqnum,
1246                 Object **ret, uint64_t *offset) {
1247         uint64_t np;
1248         uint64_t osize;
1249         Object *o;
1250         int r;
1251
1252         assert(f);
1253         assert(items || n_items == 0);
1254         assert(ts);
1255
1256         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1257
1258         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1259         if (r < 0)
1260                 return r;
1261
1262         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1263         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1264         o->entry.realtime = htole64(ts->realtime);
1265         o->entry.monotonic = htole64(ts->monotonic);
1266         o->entry.xor_hash = htole64(xor_hash);
1267         o->entry.boot_id = f->header->boot_id;
1268
1269 #ifdef HAVE_GCRYPT
1270         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1271         if (r < 0)
1272                 return r;
1273 #endif
1274
1275         r = journal_file_link_entry(f, o, np);
1276         if (r < 0)
1277                 return r;
1278
1279         if (ret)
1280                 *ret = o;
1281
1282         if (offset)
1283                 *offset = np;
1284
1285         return 0;
1286 }
1287
1288 void journal_file_post_change(JournalFile *f) {
1289         assert(f);
1290
1291         /* inotify() does not receive IN_MODIFY events from file
1292          * accesses done via mmap(). After each access we hence
1293          * trigger IN_MODIFY by truncating the journal file to its
1294          * current size which triggers IN_MODIFY. */
1295
1296         __sync_synchronize();
1297
1298         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1299                 log_error("Failed to truncate file to its own size: %m");
1300 }
1301
1302 static int entry_item_cmp(const void *_a, const void *_b) {
1303         const EntryItem *a = _a, *b = _b;
1304
1305         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1306                 return -1;
1307         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1308                 return 1;
1309         return 0;
1310 }
1311
1312 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1313         unsigned i;
1314         EntryItem *items;
1315         int r;
1316         uint64_t xor_hash = 0;
1317         struct dual_timestamp _ts;
1318
1319         assert(f);
1320         assert(iovec || n_iovec == 0);
1321
1322         if (!ts) {
1323                 dual_timestamp_get(&_ts);
1324                 ts = &_ts;
1325         }
1326
1327         if (f->tail_entry_monotonic_valid &&
1328             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1329                 return -EINVAL;
1330
1331 #ifdef HAVE_GCRYPT
1332         r = journal_file_maybe_append_tag(f, ts->realtime);
1333         if (r < 0)
1334                 return r;
1335 #endif
1336
1337         /* alloca() can't take 0, hence let's allocate at least one */
1338         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1339
1340         for (i = 0; i < n_iovec; i++) {
1341                 uint64_t p;
1342                 Object *o;
1343
1344                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1345                 if (r < 0)
1346                         return r;
1347
1348                 xor_hash ^= le64toh(o->data.hash);
1349                 items[i].object_offset = htole64(p);
1350                 items[i].hash = o->data.hash;
1351         }
1352
1353         /* Order by the position on disk, in order to improve seek
1354          * times for rotating media. */
1355         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1356
1357         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1358
1359         journal_file_post_change(f);
1360
1361         return r;
1362 }
1363
1364 typedef struct ChainCacheItem {
1365         uint64_t first; /* the array at the begin of the chain */
1366         uint64_t array; /* the cached array */
1367         uint64_t begin; /* the first item in the cached array */
1368         uint64_t total; /* the total number of items in all arrays before this one in the chain */
1369         uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
1370 } ChainCacheItem;
1371
1372 static void chain_cache_put(
1373                 Hashmap *h,
1374                 ChainCacheItem *ci,
1375                 uint64_t first,
1376                 uint64_t array,
1377                 uint64_t begin,
1378                 uint64_t total,
1379                 uint64_t last_index) {
1380
1381         if (!ci) {
1382                 /* If the chain item to cache for this chain is the
1383                  * first one it's not worth caching anything */
1384                 if (array == first)
1385                         return;
1386
1387                 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1388                         ci = hashmap_steal_first(h);
1389                 else {
1390                         ci = new(ChainCacheItem, 1);
1391                         if (!ci)
1392                                 return;
1393                 }
1394
1395                 ci->first = first;
1396
1397                 if (hashmap_put(h, &ci->first, ci) < 0) {
1398                         free(ci);
1399                         return;
1400                 }
1401         } else
1402                 assert(ci->first == first);
1403
1404         ci->array = array;
1405         ci->begin = begin;
1406         ci->total = total;
1407         ci->last_index = last_index;
1408 }
1409
1410 static int generic_array_get(
1411                 JournalFile *f,
1412                 uint64_t first,
1413                 uint64_t i,
1414                 Object **ret, uint64_t *offset) {
1415
1416         Object *o;
1417         uint64_t p = 0, a, t = 0;
1418         int r;
1419         ChainCacheItem *ci;
1420
1421         assert(f);
1422
1423         a = first;
1424
1425         /* Try the chain cache first */
1426         ci = hashmap_get(f->chain_cache, &first);
1427         if (ci && i > ci->total) {
1428                 a = ci->array;
1429                 i -= ci->total;
1430                 t = ci->total;
1431         }
1432
1433         while (a > 0) {
1434                 uint64_t k;
1435
1436                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1437                 if (r < 0)
1438                         return r;
1439
1440                 k = journal_file_entry_array_n_items(o);
1441                 if (i < k) {
1442                         p = le64toh(o->entry_array.items[i]);
1443                         goto found;
1444                 }
1445
1446                 i -= k;
1447                 t += k;
1448                 a = le64toh(o->entry_array.next_entry_array_offset);
1449         }
1450
1451         return 0;
1452
1453 found:
1454         /* Let's cache this item for the next invocation */
1455         chain_cache_put(f->chain_cache, ci, first, a, o->entry_array.items[0], t, i);
1456
1457         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1458         if (r < 0)
1459                 return r;
1460
1461         if (ret)
1462                 *ret = o;
1463
1464         if (offset)
1465                 *offset = p;
1466
1467         return 1;
1468 }
1469
1470 static int generic_array_get_plus_one(
1471                 JournalFile *f,
1472                 uint64_t extra,
1473                 uint64_t first,
1474                 uint64_t i,
1475                 Object **ret, uint64_t *offset) {
1476
1477         Object *o;
1478
1479         assert(f);
1480
1481         if (i == 0) {
1482                 int r;
1483
1484                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1485                 if (r < 0)
1486                         return r;
1487
1488                 if (ret)
1489                         *ret = o;
1490
1491                 if (offset)
1492                         *offset = extra;
1493
1494                 return 1;
1495         }
1496
1497         return generic_array_get(f, first, i-1, ret, offset);
1498 }
1499
1500 enum {
1501         TEST_FOUND,
1502         TEST_LEFT,
1503         TEST_RIGHT
1504 };
1505
1506 static int generic_array_bisect(
1507                 JournalFile *f,
1508                 uint64_t first,
1509                 uint64_t n,
1510                 uint64_t needle,
1511                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1512                 direction_t direction,
1513                 Object **ret,
1514                 uint64_t *offset,
1515                 uint64_t *idx) {
1516
1517         uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1518         bool subtract_one = false;
1519         Object *o, *array = NULL;
1520         int r;
1521         ChainCacheItem *ci;
1522
1523         assert(f);
1524         assert(test_object);
1525
1526         /* Start with the first array in the chain */
1527         a = first;
1528
1529         ci = hashmap_get(f->chain_cache, &first);
1530         if (ci && n > ci->total) {
1531                 /* Ah, we have iterated this bisection array chain
1532                  * previously! Let's see if we can skip ahead in the
1533                  * chain, as far as the last time. But we can't jump
1534                  * backwards in the chain, so let's check that
1535                  * first. */
1536
1537                 r = test_object(f, ci->begin, needle);
1538                 if (r < 0)
1539                         return r;
1540
1541                 if (r == TEST_LEFT) {
1542                         /* OK, what we are looking for is right of the
1543                          * begin of this EntryArray, so let's jump
1544                          * straight to previously cached array in the
1545                          * chain */
1546
1547                         a = ci->array;
1548                         n -= ci->total;
1549                         t = ci->total;
1550                         last_index = ci->last_index;
1551                 }
1552         }
1553
1554         while (a > 0) {
1555                 uint64_t left, right, k, lp;
1556
1557                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1558                 if (r < 0)
1559                         return r;
1560
1561                 k = journal_file_entry_array_n_items(array);
1562                 right = MIN(k, n);
1563                 if (right <= 0)
1564                         return 0;
1565
1566                 i = right - 1;
1567                 lp = p = le64toh(array->entry_array.items[i]);
1568                 if (p <= 0)
1569                         return -EBADMSG;
1570
1571                 r = test_object(f, p, needle);
1572                 if (r < 0)
1573                         return r;
1574
1575                 if (r == TEST_FOUND)
1576                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1577
1578                 if (r == TEST_RIGHT) {
1579                         left = 0;
1580                         right -= 1;
1581
1582                         if (last_index != (uint64_t) -1) {
1583                                 assert(last_index <= right);
1584
1585                                 /* If we cached the last index we
1586                                  * looked at, let's try to not to jump
1587                                  * too wildly around and see if we can
1588                                  * limit the range to look at early to
1589                                  * the immediate neighbors of the last
1590                                  * index we looked at. */
1591
1592                                 if (last_index > 0) {
1593                                         uint64_t x = last_index - 1;
1594
1595                                         p = le64toh(array->entry_array.items[x]);
1596                                         if (p <= 0)
1597                                                 return -EBADMSG;
1598
1599                                         r = test_object(f, p, needle);
1600                                         if (r < 0)
1601                                                 return r;
1602
1603                                         if (r == TEST_FOUND)
1604                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1605
1606                                         if (r == TEST_RIGHT)
1607                                                 right = x;
1608                                         else
1609                                                 left = x + 1;
1610                                 }
1611
1612                                 if (last_index < right) {
1613                                         uint64_t y = last_index + 1;
1614
1615                                         p = le64toh(array->entry_array.items[y]);
1616                                         if (p <= 0)
1617                                                 return -EBADMSG;
1618
1619                                         r = test_object(f, p, needle);
1620                                         if (r < 0)
1621                                                 return r;
1622
1623                                         if (r == TEST_FOUND)
1624                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1625
1626                                         if (r == TEST_RIGHT)
1627                                                 right = y;
1628                                         else
1629                                                 left = y + 1;
1630                                 }
1631                         }
1632
1633                         for (;;) {
1634                                 if (left == right) {
1635                                         if (direction == DIRECTION_UP)
1636                                                 subtract_one = true;
1637
1638                                         i = left;
1639                                         goto found;
1640                                 }
1641
1642                                 assert(left < right);
1643                                 i = (left + right) / 2;
1644
1645                                 p = le64toh(array->entry_array.items[i]);
1646                                 if (p <= 0)
1647                                         return -EBADMSG;
1648
1649                                 r = test_object(f, p, needle);
1650                                 if (r < 0)
1651                                         return r;
1652
1653                                 if (r == TEST_FOUND)
1654                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1655
1656                                 if (r == TEST_RIGHT)
1657                                         right = i;
1658                                 else
1659                                         left = i + 1;
1660                         }
1661                 }
1662
1663                 if (k > n) {
1664                         if (direction == DIRECTION_UP) {
1665                                 i = n;
1666                                 subtract_one = true;
1667                                 goto found;
1668                         }
1669
1670                         return 0;
1671                 }
1672
1673                 last_p = lp;
1674
1675                 n -= k;
1676                 t += k;
1677                 last_index = (uint64_t) -1;
1678                 a = le64toh(array->entry_array.next_entry_array_offset);
1679         }
1680
1681         return 0;
1682
1683 found:
1684         if (subtract_one && t == 0 && i == 0)
1685                 return 0;
1686
1687         /* Let's cache this item for the next invocation */
1688         chain_cache_put(f->chain_cache, ci, first, a, array->entry_array.items[0], t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1689
1690         if (subtract_one && i == 0)
1691                 p = last_p;
1692         else if (subtract_one)
1693                 p = le64toh(array->entry_array.items[i-1]);
1694         else
1695                 p = le64toh(array->entry_array.items[i]);
1696
1697         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1698         if (r < 0)
1699                 return r;
1700
1701         if (ret)
1702                 *ret = o;
1703
1704         if (offset)
1705                 *offset = p;
1706
1707         if (idx)
1708                 *idx = t + i + (subtract_one ? -1 : 0);
1709
1710         return 1;
1711 }
1712
1713
1714 static int generic_array_bisect_plus_one(
1715                 JournalFile *f,
1716                 uint64_t extra,
1717                 uint64_t first,
1718                 uint64_t n,
1719                 uint64_t needle,
1720                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1721                 direction_t direction,
1722                 Object **ret,
1723                 uint64_t *offset,
1724                 uint64_t *idx) {
1725
1726         int r;
1727         bool step_back = false;
1728         Object *o;
1729
1730         assert(f);
1731         assert(test_object);
1732
1733         if (n <= 0)
1734                 return 0;
1735
1736         /* This bisects the array in object 'first', but first checks
1737          * an extra  */
1738         r = test_object(f, extra, needle);
1739         if (r < 0)
1740                 return r;
1741
1742         if (r == TEST_FOUND)
1743                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1744
1745         /* if we are looking with DIRECTION_UP then we need to first
1746            see if in the actual array there is a matching entry, and
1747            return the last one of that. But if there isn't any we need
1748            to return this one. Hence remember this, and return it
1749            below. */
1750         if (r == TEST_LEFT)
1751                 step_back = direction == DIRECTION_UP;
1752
1753         if (r == TEST_RIGHT) {
1754                 if (direction == DIRECTION_DOWN)
1755                         goto found;
1756                 else
1757                         return 0;
1758         }
1759
1760         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1761
1762         if (r == 0 && step_back)
1763                 goto found;
1764
1765         if (r > 0 && idx)
1766                 (*idx) ++;
1767
1768         return r;
1769
1770 found:
1771         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1772         if (r < 0)
1773                 return r;
1774
1775         if (ret)
1776                 *ret = o;
1777
1778         if (offset)
1779                 *offset = extra;
1780
1781         if (idx)
1782                 *idx = 0;
1783
1784         return 1;
1785 }
1786
1787 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1788         assert(f);
1789         assert(p > 0);
1790
1791         if (p == needle)
1792                 return TEST_FOUND;
1793         else if (p < needle)
1794                 return TEST_LEFT;
1795         else
1796                 return TEST_RIGHT;
1797 }
1798
1799 int journal_file_move_to_entry_by_offset(
1800                 JournalFile *f,
1801                 uint64_t p,
1802                 direction_t direction,
1803                 Object **ret,
1804                 uint64_t *offset) {
1805
1806         return generic_array_bisect(f,
1807                                     le64toh(f->header->entry_array_offset),
1808                                     le64toh(f->header->n_entries),
1809                                     p,
1810                                     test_object_offset,
1811                                     direction,
1812                                     ret, offset, NULL);
1813 }
1814
1815
1816 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1817         Object *o;
1818         int r;
1819
1820         assert(f);
1821         assert(p > 0);
1822
1823         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1824         if (r < 0)
1825                 return r;
1826
1827         if (le64toh(o->entry.seqnum) == needle)
1828                 return TEST_FOUND;
1829         else if (le64toh(o->entry.seqnum) < needle)
1830                 return TEST_LEFT;
1831         else
1832                 return TEST_RIGHT;
1833 }
1834
1835 int journal_file_move_to_entry_by_seqnum(
1836                 JournalFile *f,
1837                 uint64_t seqnum,
1838                 direction_t direction,
1839                 Object **ret,
1840                 uint64_t *offset) {
1841
1842         return generic_array_bisect(f,
1843                                     le64toh(f->header->entry_array_offset),
1844                                     le64toh(f->header->n_entries),
1845                                     seqnum,
1846                                     test_object_seqnum,
1847                                     direction,
1848                                     ret, offset, NULL);
1849 }
1850
1851 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1852         Object *o;
1853         int r;
1854
1855         assert(f);
1856         assert(p > 0);
1857
1858         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1859         if (r < 0)
1860                 return r;
1861
1862         if (le64toh(o->entry.realtime) == needle)
1863                 return TEST_FOUND;
1864         else if (le64toh(o->entry.realtime) < needle)
1865                 return TEST_LEFT;
1866         else
1867                 return TEST_RIGHT;
1868 }
1869
1870 int journal_file_move_to_entry_by_realtime(
1871                 JournalFile *f,
1872                 uint64_t realtime,
1873                 direction_t direction,
1874                 Object **ret,
1875                 uint64_t *offset) {
1876
1877         return generic_array_bisect(f,
1878                                     le64toh(f->header->entry_array_offset),
1879                                     le64toh(f->header->n_entries),
1880                                     realtime,
1881                                     test_object_realtime,
1882                                     direction,
1883                                     ret, offset, NULL);
1884 }
1885
1886 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1887         Object *o;
1888         int r;
1889
1890         assert(f);
1891         assert(p > 0);
1892
1893         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1894         if (r < 0)
1895                 return r;
1896
1897         if (le64toh(o->entry.monotonic) == needle)
1898                 return TEST_FOUND;
1899         else if (le64toh(o->entry.monotonic) < needle)
1900                 return TEST_LEFT;
1901         else
1902                 return TEST_RIGHT;
1903 }
1904
1905 static inline int find_data_object_by_boot_id(
1906                 JournalFile *f,
1907                 sd_id128_t boot_id,
1908                 Object **o,
1909                 uint64_t *b) {
1910         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1911
1912         sd_id128_to_string(boot_id, t + 9);
1913         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1914 }
1915
1916 int journal_file_move_to_entry_by_monotonic(
1917                 JournalFile *f,
1918                 sd_id128_t boot_id,
1919                 uint64_t monotonic,
1920                 direction_t direction,
1921                 Object **ret,
1922                 uint64_t *offset) {
1923
1924         Object *o;
1925         int r;
1926
1927         assert(f);
1928
1929         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1930         if (r < 0)
1931                 return r;
1932         if (r == 0)
1933                 return -ENOENT;
1934
1935         return generic_array_bisect_plus_one(f,
1936                                              le64toh(o->data.entry_offset),
1937                                              le64toh(o->data.entry_array_offset),
1938                                              le64toh(o->data.n_entries),
1939                                              monotonic,
1940                                              test_object_monotonic,
1941                                              direction,
1942                                              ret, offset, NULL);
1943 }
1944
1945 int journal_file_next_entry(
1946                 JournalFile *f,
1947                 Object *o, uint64_t p,
1948                 direction_t direction,
1949                 Object **ret, uint64_t *offset) {
1950
1951         uint64_t i, n;
1952         int r;
1953
1954         assert(f);
1955         assert(p > 0 || !o);
1956
1957         n = le64toh(f->header->n_entries);
1958         if (n <= 0)
1959                 return 0;
1960
1961         if (!o)
1962                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1963         else {
1964                 if (o->object.type != OBJECT_ENTRY)
1965                         return -EINVAL;
1966
1967                 r = generic_array_bisect(f,
1968                                          le64toh(f->header->entry_array_offset),
1969                                          le64toh(f->header->n_entries),
1970                                          p,
1971                                          test_object_offset,
1972                                          DIRECTION_DOWN,
1973                                          NULL, NULL,
1974                                          &i);
1975                 if (r <= 0)
1976                         return r;
1977
1978                 if (direction == DIRECTION_DOWN) {
1979                         if (i >= n - 1)
1980                                 return 0;
1981
1982                         i++;
1983                 } else {
1984                         if (i <= 0)
1985                                 return 0;
1986
1987                         i--;
1988                 }
1989         }
1990
1991         /* And jump to it */
1992         return generic_array_get(f,
1993                                  le64toh(f->header->entry_array_offset),
1994                                  i,
1995                                  ret, offset);
1996 }
1997
1998 int journal_file_skip_entry(
1999                 JournalFile *f,
2000                 Object *o, uint64_t p,
2001                 int64_t skip,
2002                 Object **ret, uint64_t *offset) {
2003
2004         uint64_t i, n;
2005         int r;
2006
2007         assert(f);
2008         assert(o);
2009         assert(p > 0);
2010
2011         if (o->object.type != OBJECT_ENTRY)
2012                 return -EINVAL;
2013
2014         r = generic_array_bisect(f,
2015                                  le64toh(f->header->entry_array_offset),
2016                                  le64toh(f->header->n_entries),
2017                                  p,
2018                                  test_object_offset,
2019                                  DIRECTION_DOWN,
2020                                  NULL, NULL,
2021                                  &i);
2022         if (r <= 0)
2023                 return r;
2024
2025         /* Calculate new index */
2026         if (skip < 0) {
2027                 if ((uint64_t) -skip >= i)
2028                         i = 0;
2029                 else
2030                         i = i - (uint64_t) -skip;
2031         } else
2032                 i  += (uint64_t) skip;
2033
2034         n = le64toh(f->header->n_entries);
2035         if (n <= 0)
2036                 return -EBADMSG;
2037
2038         if (i >= n)
2039                 i = n-1;
2040
2041         return generic_array_get(f,
2042                                  le64toh(f->header->entry_array_offset),
2043                                  i,
2044                                  ret, offset);
2045 }
2046
2047 int journal_file_next_entry_for_data(
2048                 JournalFile *f,
2049                 Object *o, uint64_t p,
2050                 uint64_t data_offset,
2051                 direction_t direction,
2052                 Object **ret, uint64_t *offset) {
2053
2054         uint64_t n, i;
2055         int r;
2056         Object *d;
2057
2058         assert(f);
2059         assert(p > 0 || !o);
2060
2061         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2062         if (r < 0)
2063                 return r;
2064
2065         n = le64toh(d->data.n_entries);
2066         if (n <= 0)
2067                 return n;
2068
2069         if (!o)
2070                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2071         else {
2072                 if (o->object.type != OBJECT_ENTRY)
2073                         return -EINVAL;
2074
2075                 r = generic_array_bisect_plus_one(f,
2076                                                   le64toh(d->data.entry_offset),
2077                                                   le64toh(d->data.entry_array_offset),
2078                                                   le64toh(d->data.n_entries),
2079                                                   p,
2080                                                   test_object_offset,
2081                                                   DIRECTION_DOWN,
2082                                                   NULL, NULL,
2083                                                   &i);
2084
2085                 if (r <= 0)
2086                         return r;
2087
2088                 if (direction == DIRECTION_DOWN) {
2089                         if (i >= n - 1)
2090                                 return 0;
2091
2092                         i++;
2093                 } else {
2094                         if (i <= 0)
2095                                 return 0;
2096
2097                         i--;
2098                 }
2099
2100         }
2101
2102         return generic_array_get_plus_one(f,
2103                                           le64toh(d->data.entry_offset),
2104                                           le64toh(d->data.entry_array_offset),
2105                                           i,
2106                                           ret, offset);
2107 }
2108
2109 int journal_file_move_to_entry_by_offset_for_data(
2110                 JournalFile *f,
2111                 uint64_t data_offset,
2112                 uint64_t p,
2113                 direction_t direction,
2114                 Object **ret, uint64_t *offset) {
2115
2116         int r;
2117         Object *d;
2118
2119         assert(f);
2120
2121         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2122         if (r < 0)
2123                 return r;
2124
2125         return generic_array_bisect_plus_one(f,
2126                                              le64toh(d->data.entry_offset),
2127                                              le64toh(d->data.entry_array_offset),
2128                                              le64toh(d->data.n_entries),
2129                                              p,
2130                                              test_object_offset,
2131                                              direction,
2132                                              ret, offset, NULL);
2133 }
2134
2135 int journal_file_move_to_entry_by_monotonic_for_data(
2136                 JournalFile *f,
2137                 uint64_t data_offset,
2138                 sd_id128_t boot_id,
2139                 uint64_t monotonic,
2140                 direction_t direction,
2141                 Object **ret, uint64_t *offset) {
2142
2143         Object *o, *d;
2144         int r;
2145         uint64_t b, z;
2146
2147         assert(f);
2148
2149         /* First, seek by time */
2150         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2151         if (r < 0)
2152                 return r;
2153         if (r == 0)
2154                 return -ENOENT;
2155
2156         r = generic_array_bisect_plus_one(f,
2157                                           le64toh(o->data.entry_offset),
2158                                           le64toh(o->data.entry_array_offset),
2159                                           le64toh(o->data.n_entries),
2160                                           monotonic,
2161                                           test_object_monotonic,
2162                                           direction,
2163                                           NULL, &z, NULL);
2164         if (r <= 0)
2165                 return r;
2166
2167         /* And now, continue seeking until we find an entry that
2168          * exists in both bisection arrays */
2169
2170         for (;;) {
2171                 Object *qo;
2172                 uint64_t p, q;
2173
2174                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2175                 if (r < 0)
2176                         return r;
2177
2178                 r = generic_array_bisect_plus_one(f,
2179                                                   le64toh(d->data.entry_offset),
2180                                                   le64toh(d->data.entry_array_offset),
2181                                                   le64toh(d->data.n_entries),
2182                                                   z,
2183                                                   test_object_offset,
2184                                                   direction,
2185                                                   NULL, &p, NULL);
2186                 if (r <= 0)
2187                         return r;
2188
2189                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2190                 if (r < 0)
2191                         return r;
2192
2193                 r = generic_array_bisect_plus_one(f,
2194                                                   le64toh(o->data.entry_offset),
2195                                                   le64toh(o->data.entry_array_offset),
2196                                                   le64toh(o->data.n_entries),
2197                                                   p,
2198                                                   test_object_offset,
2199                                                   direction,
2200                                                   &qo, &q, NULL);
2201
2202                 if (r <= 0)
2203                         return r;
2204
2205                 if (p == q) {
2206                         if (ret)
2207                                 *ret = qo;
2208                         if (offset)
2209                                 *offset = q;
2210
2211                         return 1;
2212                 }
2213
2214                 z = q;
2215         }
2216
2217         return 0;
2218 }
2219
2220 int journal_file_move_to_entry_by_seqnum_for_data(
2221                 JournalFile *f,
2222                 uint64_t data_offset,
2223                 uint64_t seqnum,
2224                 direction_t direction,
2225                 Object **ret, uint64_t *offset) {
2226
2227         Object *d;
2228         int r;
2229
2230         assert(f);
2231
2232         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2233         if (r < 0)
2234                 return r;
2235
2236         return generic_array_bisect_plus_one(f,
2237                                              le64toh(d->data.entry_offset),
2238                                              le64toh(d->data.entry_array_offset),
2239                                              le64toh(d->data.n_entries),
2240                                              seqnum,
2241                                              test_object_seqnum,
2242                                              direction,
2243                                              ret, offset, NULL);
2244 }
2245
2246 int journal_file_move_to_entry_by_realtime_for_data(
2247                 JournalFile *f,
2248                 uint64_t data_offset,
2249                 uint64_t realtime,
2250                 direction_t direction,
2251                 Object **ret, uint64_t *offset) {
2252
2253         Object *d;
2254         int r;
2255
2256         assert(f);
2257
2258         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2259         if (r < 0)
2260                 return r;
2261
2262         return generic_array_bisect_plus_one(f,
2263                                              le64toh(d->data.entry_offset),
2264                                              le64toh(d->data.entry_array_offset),
2265                                              le64toh(d->data.n_entries),
2266                                              realtime,
2267                                              test_object_realtime,
2268                                              direction,
2269                                              ret, offset, NULL);
2270 }
2271
2272 void journal_file_dump(JournalFile *f) {
2273         Object *o;
2274         int r;
2275         uint64_t p;
2276
2277         assert(f);
2278
2279         journal_file_print_header(f);
2280
2281         p = le64toh(f->header->header_size);
2282         while (p != 0) {
2283                 r = journal_file_move_to_object(f, -1, p, &o);
2284                 if (r < 0)
2285                         goto fail;
2286
2287                 switch (o->object.type) {
2288
2289                 case OBJECT_UNUSED:
2290                         printf("Type: OBJECT_UNUSED\n");
2291                         break;
2292
2293                 case OBJECT_DATA:
2294                         printf("Type: OBJECT_DATA\n");
2295                         break;
2296
2297                 case OBJECT_FIELD:
2298                         printf("Type: OBJECT_FIELD\n");
2299                         break;
2300
2301                 case OBJECT_ENTRY:
2302                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2303                                le64toh(o->entry.seqnum),
2304                                le64toh(o->entry.monotonic),
2305                                le64toh(o->entry.realtime));
2306                         break;
2307
2308                 case OBJECT_FIELD_HASH_TABLE:
2309                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2310                         break;
2311
2312                 case OBJECT_DATA_HASH_TABLE:
2313                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2314                         break;
2315
2316                 case OBJECT_ENTRY_ARRAY:
2317                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2318                         break;
2319
2320                 case OBJECT_TAG:
2321                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2322                                le64toh(o->tag.seqnum),
2323                                le64toh(o->tag.epoch));
2324                         break;
2325
2326                 default:
2327                         printf("Type: unknown (%u)\n", o->object.type);
2328                         break;
2329                 }
2330
2331                 if (o->object.flags & OBJECT_COMPRESSED)
2332                         printf("Flags: COMPRESSED\n");
2333
2334                 if (p == le64toh(f->header->tail_object_offset))
2335                         p = 0;
2336                 else
2337                         p = p + ALIGN64(le64toh(o->object.size));
2338         }
2339
2340         return;
2341 fail:
2342         log_error("File corrupt");
2343 }
2344
2345 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2346         const char *x;
2347
2348         x = format_timestamp(buf, l, t);
2349         if (x)
2350                 return x;
2351         return " --- ";
2352 }
2353
2354 void journal_file_print_header(JournalFile *f) {
2355         char a[33], b[33], c[33], d[33];
2356         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2357         struct stat st;
2358         char bytes[FORMAT_BYTES_MAX];
2359
2360         assert(f);
2361
2362         printf("File Path: %s\n"
2363                "File ID: %s\n"
2364                "Machine ID: %s\n"
2365                "Boot ID: %s\n"
2366                "Sequential Number ID: %s\n"
2367                "State: %s\n"
2368                "Compatible Flags:%s%s\n"
2369                "Incompatible Flags:%s%s\n"
2370                "Header size: %"PRIu64"\n"
2371                "Arena size: %"PRIu64"\n"
2372                "Data Hash Table Size: %"PRIu64"\n"
2373                "Field Hash Table Size: %"PRIu64"\n"
2374                "Rotate Suggested: %s\n"
2375                "Head Sequential Number: %"PRIu64"\n"
2376                "Tail Sequential Number: %"PRIu64"\n"
2377                "Head Realtime Timestamp: %s\n"
2378                "Tail Realtime Timestamp: %s\n"
2379                "Tail Monotonic Timestamp: %s\n"
2380                "Objects: %"PRIu64"\n"
2381                "Entry Objects: %"PRIu64"\n",
2382                f->path,
2383                sd_id128_to_string(f->header->file_id, a),
2384                sd_id128_to_string(f->header->machine_id, b),
2385                sd_id128_to_string(f->header->boot_id, c),
2386                sd_id128_to_string(f->header->seqnum_id, d),
2387                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2388                f->header->state == STATE_ONLINE ? "ONLINE" :
2389                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2390                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2391                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2392                JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2393                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2394                le64toh(f->header->header_size),
2395                le64toh(f->header->arena_size),
2396                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2397                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2398                yes_no(journal_file_rotate_suggested(f, 0)),
2399                le64toh(f->header->head_entry_seqnum),
2400                le64toh(f->header->tail_entry_seqnum),
2401                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2402                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2403                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2404                le64toh(f->header->n_objects),
2405                le64toh(f->header->n_entries));
2406
2407         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2408                 printf("Data Objects: %"PRIu64"\n"
2409                        "Data Hash Table Fill: %.1f%%\n",
2410                        le64toh(f->header->n_data),
2411                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2412
2413         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2414                 printf("Field Objects: %"PRIu64"\n"
2415                        "Field Hash Table Fill: %.1f%%\n",
2416                        le64toh(f->header->n_fields),
2417                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2418
2419         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2420                 printf("Tag Objects: %"PRIu64"\n",
2421                        le64toh(f->header->n_tags));
2422         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2423                 printf("Entry Array Objects: %"PRIu64"\n",
2424                        le64toh(f->header->n_entry_arrays));
2425
2426         if (fstat(f->fd, &st) >= 0)
2427                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2428 }
2429
2430 int journal_file_open(
2431                 const char *fname,
2432                 int flags,
2433                 mode_t mode,
2434                 bool compress,
2435                 bool seal,
2436                 JournalMetrics *metrics,
2437                 MMapCache *mmap_cache,
2438                 JournalFile *template,
2439                 JournalFile **ret) {
2440
2441         JournalFile *f;
2442         int r;
2443         bool newly_created = false;
2444
2445         assert(fname);
2446         assert(ret);
2447
2448         if ((flags & O_ACCMODE) != O_RDONLY &&
2449             (flags & O_ACCMODE) != O_RDWR)
2450                 return -EINVAL;
2451
2452         if (!endswith(fname, ".journal") &&
2453             !endswith(fname, ".journal~"))
2454                 return -EINVAL;
2455
2456         f = new0(JournalFile, 1);
2457         if (!f)
2458                 return -ENOMEM;
2459
2460         f->fd = -1;
2461         f->mode = mode;
2462
2463         f->flags = flags;
2464         f->prot = prot_from_flags(flags);
2465         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2466 #ifdef HAVE_XZ
2467         f->compress = compress;
2468 #endif
2469 #ifdef HAVE_GCRYPT
2470         f->seal = seal;
2471 #endif
2472
2473         if (mmap_cache)
2474                 f->mmap = mmap_cache_ref(mmap_cache);
2475         else {
2476                 f->mmap = mmap_cache_new();
2477                 if (!f->mmap) {
2478                         r = -ENOMEM;
2479                         goto fail;
2480                 }
2481         }
2482
2483         f->path = strdup(fname);
2484         if (!f->path) {
2485                 r = -ENOMEM;
2486                 goto fail;
2487         }
2488
2489         f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2490         if (!f->chain_cache) {
2491                 r = -ENOMEM;
2492                 goto fail;
2493         }
2494
2495         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2496         if (f->fd < 0) {
2497                 r = -errno;
2498                 goto fail;
2499         }
2500
2501         if (fstat(f->fd, &f->last_stat) < 0) {
2502                 r = -errno;
2503                 goto fail;
2504         }
2505
2506         if (f->last_stat.st_size == 0 && f->writable) {
2507 #ifdef HAVE_XATTR
2508                 uint64_t crtime;
2509
2510                 /* Let's attach the creation time to the journal file,
2511                  * so that the vacuuming code knows the age of this
2512                  * file even if the file might end up corrupted one
2513                  * day... Ideally we'd just use the creation time many
2514                  * file systems maintain for each file, but there is
2515                  * currently no usable API to query this, hence let's
2516                  * emulate this via extended attributes. If extended
2517                  * attributes are not supported we'll just skip this,
2518                  * and rely solely on mtime/atime/ctime of the file.*/
2519
2520                 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2521                 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2522 #endif
2523
2524 #ifdef HAVE_GCRYPT
2525                 /* Try to load the FSPRG state, and if we can't, then
2526                  * just don't do sealing */
2527                 if (f->seal) {
2528                         r = journal_file_fss_load(f);
2529                         if (r < 0)
2530                                 f->seal = false;
2531                 }
2532 #endif
2533
2534                 r = journal_file_init_header(f, template);
2535                 if (r < 0)
2536                         goto fail;
2537
2538                 if (fstat(f->fd, &f->last_stat) < 0) {
2539                         r = -errno;
2540                         goto fail;
2541                 }
2542
2543                 newly_created = true;
2544         }
2545
2546         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2547                 r = -EIO;
2548                 goto fail;
2549         }
2550
2551         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2552         if (f->header == MAP_FAILED) {
2553                 f->header = NULL;
2554                 r = -errno;
2555                 goto fail;
2556         }
2557
2558         if (!newly_created) {
2559                 r = journal_file_verify_header(f);
2560                 if (r < 0)
2561                         goto fail;
2562         }
2563
2564 #ifdef HAVE_GCRYPT
2565         if (!newly_created && f->writable) {
2566                 r = journal_file_fss_load(f);
2567                 if (r < 0)
2568                         goto fail;
2569         }
2570 #endif
2571
2572         if (f->writable) {
2573                 if (metrics) {
2574                         journal_default_metrics(metrics, f->fd);
2575                         f->metrics = *metrics;
2576                 } else if (template)
2577                         f->metrics = template->metrics;
2578
2579                 r = journal_file_refresh_header(f);
2580                 if (r < 0)
2581                         goto fail;
2582         }
2583
2584 #ifdef HAVE_GCRYPT
2585         r = journal_file_hmac_setup(f);
2586         if (r < 0)
2587                 goto fail;
2588 #endif
2589
2590         if (newly_created) {
2591                 r = journal_file_setup_field_hash_table(f);
2592                 if (r < 0)
2593                         goto fail;
2594
2595                 r = journal_file_setup_data_hash_table(f);
2596                 if (r < 0)
2597                         goto fail;
2598
2599 #ifdef HAVE_GCRYPT
2600                 r = journal_file_append_first_tag(f);
2601                 if (r < 0)
2602                         goto fail;
2603 #endif
2604         }
2605
2606         r = journal_file_map_field_hash_table(f);
2607         if (r < 0)
2608                 goto fail;
2609
2610         r = journal_file_map_data_hash_table(f);
2611         if (r < 0)
2612                 goto fail;
2613
2614         *ret = f;
2615         return 0;
2616
2617 fail:
2618         journal_file_close(f);
2619
2620         return r;
2621 }
2622
2623 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2624         _cleanup_free_ char *p = NULL;
2625         size_t l;
2626         JournalFile *old_file, *new_file = NULL;
2627         int r;
2628
2629         assert(f);
2630         assert(*f);
2631
2632         old_file = *f;
2633
2634         if (!old_file->writable)
2635                 return -EINVAL;
2636
2637         if (!endswith(old_file->path, ".journal"))
2638                 return -EINVAL;
2639
2640         l = strlen(old_file->path);
2641         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2642                      (int) l - 8, old_file->path,
2643                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2644                      le64toh((*f)->header->head_entry_seqnum),
2645                      le64toh((*f)->header->head_entry_realtime));
2646         if (r < 0)
2647                 return -ENOMEM;
2648
2649         r = rename(old_file->path, p);
2650         if (r < 0)
2651                 return -errno;
2652
2653         old_file->header->state = STATE_ARCHIVED;
2654
2655         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2656         journal_file_close(old_file);
2657
2658         *f = new_file;
2659         return r;
2660 }
2661
2662 int journal_file_open_reliably(
2663                 const char *fname,
2664                 int flags,
2665                 mode_t mode,
2666                 bool compress,
2667                 bool seal,
2668                 JournalMetrics *metrics,
2669                 MMapCache *mmap_cache,
2670                 JournalFile *template,
2671                 JournalFile **ret) {
2672
2673         int r;
2674         size_t l;
2675         _cleanup_free_ char *p = NULL;
2676
2677         r = journal_file_open(fname, flags, mode, compress, seal,
2678                               metrics, mmap_cache, template, ret);
2679         if (r != -EBADMSG && /* corrupted */
2680             r != -ENODATA && /* truncated */
2681             r != -EHOSTDOWN && /* other machine */
2682             r != -EPROTONOSUPPORT && /* incompatible feature */
2683             r != -EBUSY && /* unclean shutdown */
2684             r != -ESHUTDOWN /* already archived */)
2685                 return r;
2686
2687         if ((flags & O_ACCMODE) == O_RDONLY)
2688                 return r;
2689
2690         if (!(flags & O_CREAT))
2691                 return r;
2692
2693         if (!endswith(fname, ".journal"))
2694                 return r;
2695
2696         /* The file is corrupted. Rotate it away and try it again (but only once) */
2697
2698         l = strlen(fname);
2699         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2700                      (int) l - 8, fname,
2701                      (unsigned long long) now(CLOCK_REALTIME),
2702                      random_ull()) < 0)
2703                 return -ENOMEM;
2704
2705         r = rename(fname, p);
2706         if (r < 0)
2707                 return -errno;
2708
2709         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2710
2711         return journal_file_open(fname, flags, mode, compress, seal,
2712                                  metrics, mmap_cache, template, ret);
2713 }
2714
2715 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2716         uint64_t i, n;
2717         uint64_t q, xor_hash = 0;
2718         int r;
2719         EntryItem *items;
2720         dual_timestamp ts;
2721
2722         assert(from);
2723         assert(to);
2724         assert(o);
2725         assert(p);
2726
2727         if (!to->writable)
2728                 return -EPERM;
2729
2730         ts.monotonic = le64toh(o->entry.monotonic);
2731         ts.realtime = le64toh(o->entry.realtime);
2732
2733         n = journal_file_entry_n_items(o);
2734         /* alloca() can't take 0, hence let's allocate at least one */
2735         items = alloca(sizeof(EntryItem) * MAX(1u, n));
2736
2737         for (i = 0; i < n; i++) {
2738                 uint64_t l, h;
2739                 le64_t le_hash;
2740                 size_t t;
2741                 void *data;
2742                 Object *u;
2743
2744                 q = le64toh(o->entry.items[i].object_offset);
2745                 le_hash = o->entry.items[i].hash;
2746
2747                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2748                 if (r < 0)
2749                         return r;
2750
2751                 if (le_hash != o->data.hash)
2752                         return -EBADMSG;
2753
2754                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2755                 t = (size_t) l;
2756
2757                 /* We hit the limit on 32bit machines */
2758                 if ((uint64_t) t != l)
2759                         return -E2BIG;
2760
2761                 if (o->object.flags & OBJECT_COMPRESSED) {
2762 #ifdef HAVE_XZ
2763                         uint64_t rsize;
2764
2765                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0))
2766                                 return -EBADMSG;
2767
2768                         data = from->compress_buffer;
2769                         l = rsize;
2770 #else
2771                         return -EPROTONOSUPPORT;
2772 #endif
2773                 } else
2774                         data = o->data.payload;
2775
2776                 r = journal_file_append_data(to, data, l, &u, &h);
2777                 if (r < 0)
2778                         return r;
2779
2780                 xor_hash ^= le64toh(u->data.hash);
2781                 items[i].object_offset = htole64(h);
2782                 items[i].hash = u->data.hash;
2783
2784                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2785                 if (r < 0)
2786                         return r;
2787         }
2788
2789         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2790 }
2791
2792 void journal_default_metrics(JournalMetrics *m, int fd) {
2793         uint64_t fs_size = 0;
2794         struct statvfs ss;
2795         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2796
2797         assert(m);
2798         assert(fd >= 0);
2799
2800         if (fstatvfs(fd, &ss) >= 0)
2801                 fs_size = ss.f_frsize * ss.f_blocks;
2802
2803         if (m->max_use == (uint64_t) -1) {
2804
2805                 if (fs_size > 0) {
2806                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2807
2808                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2809                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2810
2811                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2812                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2813                 } else
2814                         m->max_use = DEFAULT_MAX_USE_LOWER;
2815         } else {
2816                 m->max_use = PAGE_ALIGN(m->max_use);
2817
2818                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2819                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2820         }
2821
2822         if (m->max_size == (uint64_t) -1) {
2823                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2824
2825                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2826                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2827         } else
2828                 m->max_size = PAGE_ALIGN(m->max_size);
2829
2830         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2831                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2832
2833         if (m->max_size*2 > m->max_use)
2834                 m->max_use = m->max_size*2;
2835
2836         if (m->min_size == (uint64_t) -1)
2837                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2838         else {
2839                 m->min_size = PAGE_ALIGN(m->min_size);
2840
2841                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2842                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2843
2844                 if (m->min_size > m->max_size)
2845                         m->max_size = m->min_size;
2846         }
2847
2848         if (m->keep_free == (uint64_t) -1) {
2849
2850                 if (fs_size > 0) {
2851                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2852
2853                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2854                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2855
2856                 } else
2857                         m->keep_free = DEFAULT_KEEP_FREE;
2858         }
2859
2860         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2861                   format_bytes(a, sizeof(a), m->max_use),
2862                   format_bytes(b, sizeof(b), m->max_size),
2863                   format_bytes(c, sizeof(c), m->min_size),
2864                   format_bytes(d, sizeof(d), m->keep_free));
2865 }
2866
2867 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2868         assert(f);
2869         assert(from || to);
2870
2871         if (from) {
2872                 if (f->header->head_entry_realtime == 0)
2873                         return -ENOENT;
2874
2875                 *from = le64toh(f->header->head_entry_realtime);
2876         }
2877
2878         if (to) {
2879                 if (f->header->tail_entry_realtime == 0)
2880                         return -ENOENT;
2881
2882                 *to = le64toh(f->header->tail_entry_realtime);
2883         }
2884
2885         return 1;
2886 }
2887
2888 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2889         Object *o;
2890         uint64_t p;
2891         int r;
2892
2893         assert(f);
2894         assert(from || to);
2895
2896         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2897         if (r <= 0)
2898                 return r;
2899
2900         if (le64toh(o->data.n_entries) <= 0)
2901                 return 0;
2902
2903         if (from) {
2904                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2905                 if (r < 0)
2906                         return r;
2907
2908                 *from = le64toh(o->entry.monotonic);
2909         }
2910
2911         if (to) {
2912                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2913                 if (r < 0)
2914                         return r;
2915
2916                 r = generic_array_get_plus_one(f,
2917                                                le64toh(o->data.entry_offset),
2918                                                le64toh(o->data.entry_array_offset),
2919                                                le64toh(o->data.n_entries)-1,
2920                                                &o, NULL);
2921                 if (r <= 0)
2922                         return r;
2923
2924                 *to = le64toh(o->entry.monotonic);
2925         }
2926
2927         return 1;
2928 }
2929
2930 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2931         assert(f);
2932
2933         /* If we gained new header fields we gained new features,
2934          * hence suggest a rotation */
2935         if (le64toh(f->header->header_size) < sizeof(Header)) {
2936                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2937                 return true;
2938         }
2939
2940         /* Let's check if the hash tables grew over a certain fill
2941          * level (75%, borrowing this value from Java's hash table
2942          * implementation), and if so suggest a rotation. To calculate
2943          * the fill level we need the n_data field, which only exists
2944          * in newer versions. */
2945
2946         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2947                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2948                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2949                                   f->path,
2950                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2951                                   le64toh(f->header->n_data),
2952                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2953                                   (unsigned long long) f->last_stat.st_size,
2954                                   f->last_stat.st_size / le64toh(f->header->n_data));
2955                         return true;
2956                 }
2957
2958         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2959                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2960                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2961                                   f->path,
2962                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2963                                   le64toh(f->header->n_fields),
2964                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2965                         return true;
2966                 }
2967
2968         /* Are the data objects properly indexed by field objects? */
2969         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2970             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2971             le64toh(f->header->n_data) > 0 &&
2972             le64toh(f->header->n_fields) == 0)
2973                 return true;
2974
2975         if (max_file_usec > 0) {
2976                 usec_t t, h;
2977
2978                 h = le64toh(f->header->head_entry_realtime);
2979                 t = now(CLOCK_REALTIME);
2980
2981                 if (h > 0 && t > h + max_file_usec)
2982                         return true;
2983         }
2984
2985         return false;
2986 }