chiark / gitweb /
986e94de392ac12011c27e16352e79638a79a2b1
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29 #include <sys/xattr.h>
30
31 #include "journal-def.h"
32 #include "journal-file.h"
33 #include "journal-authenticate.h"
34 #include "lookup3.h"
35 #include "compress.h"
36 #include "fsprg.h"
37
38 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
39 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
40
41 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
42
43 /* This is the minimum journal file size */
44 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL)           /* 4 MiB */
45
46 /* These are the lower and upper bounds if we deduce the max_use value
47  * from the file system size */
48 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
49 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
50
51 /* This is the upper bound if we deduce max_size from max_use */
52 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
53
54 /* This is the upper bound if we deduce the keep_free value from the
55  * file system size */
56 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
57
58 /* This is the keep_free value when we can't determine the system
59  * size */
60 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MB */
61
62 /* n_data was the first entry we added after the initial file format design */
63 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
64
65 /* How many entries to keep in the entry array chain cache at max */
66 #define CHAIN_CACHE_MAX 20
67
68 /* How much to increase the journal file size at once each time we allocate something new. */
69 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8MB */
70
71 static int journal_file_set_online(JournalFile *f) {
72         assert(f);
73
74         if (!f->writable)
75                 return -EPERM;
76
77         if (!(f->fd >= 0 && f->header))
78                 return -EINVAL;
79
80         switch(f->header->state) {
81                 case STATE_ONLINE:
82                         return 0;
83
84                 case STATE_OFFLINE:
85                         f->header->state = STATE_ONLINE;
86                         fsync(f->fd);
87                         return 0;
88
89                 default:
90                         return -EINVAL;
91         }
92 }
93
94 int journal_file_set_offline(JournalFile *f) {
95         assert(f);
96
97         if (!f->writable)
98                 return -EPERM;
99
100         if (!(f->fd >= 0 && f->header))
101                 return -EINVAL;
102
103         if (f->header->state != STATE_ONLINE)
104                 return 0;
105
106         fsync(f->fd);
107
108         f->header->state = STATE_OFFLINE;
109
110         fsync(f->fd);
111
112         return 0;
113 }
114
115 void journal_file_close(JournalFile *f) {
116         assert(f);
117
118 #ifdef HAVE_GCRYPT
119         /* Write the final tag */
120         if (f->seal && f->writable)
121                 journal_file_append_tag(f);
122 #endif
123
124         /* Sync everything to disk, before we mark the file offline */
125         if (f->mmap && f->fd >= 0)
126                 mmap_cache_close_fd(f->mmap, f->fd);
127
128         journal_file_set_offline(f);
129
130         if (f->header)
131                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
132
133         safe_close(f->fd);
134         free(f->path);
135
136         if (f->mmap)
137                 mmap_cache_unref(f->mmap);
138
139         hashmap_free_free(f->chain_cache);
140
141 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
142         free(f->compress_buffer);
143 #endif
144
145 #ifdef HAVE_GCRYPT
146         if (f->fss_file)
147                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
148         else if (f->fsprg_state)
149                 free(f->fsprg_state);
150
151         free(f->fsprg_seed);
152
153         if (f->hmac)
154                 gcry_md_close(f->hmac);
155 #endif
156
157         free(f);
158 }
159
160 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
161         Header h = {};
162         ssize_t k;
163         int r;
164
165         assert(f);
166
167         memcpy(h.signature, HEADER_SIGNATURE, 8);
168         h.header_size = htole64(ALIGN64(sizeof(h)));
169
170         h.incompatible_flags |= htole32(
171                 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
172                 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
173
174         h.compatible_flags = htole32(
175                 f->seal * HEADER_COMPATIBLE_SEALED);
176
177         r = sd_id128_randomize(&h.file_id);
178         if (r < 0)
179                 return r;
180
181         if (template) {
182                 h.seqnum_id = template->header->seqnum_id;
183                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
184         } else
185                 h.seqnum_id = h.file_id;
186
187         k = pwrite(f->fd, &h, sizeof(h), 0);
188         if (k < 0)
189                 return -errno;
190
191         if (k != sizeof(h))
192                 return -EIO;
193
194         return 0;
195 }
196
197 static int journal_file_refresh_header(JournalFile *f) {
198         int r;
199         sd_id128_t boot_id;
200
201         assert(f);
202
203         r = sd_id128_get_machine(&f->header->machine_id);
204         if (r < 0)
205                 return r;
206
207         r = sd_id128_get_boot(&boot_id);
208         if (r < 0)
209                 return r;
210
211         if (sd_id128_equal(boot_id, f->header->boot_id))
212                 f->tail_entry_monotonic_valid = true;
213
214         f->header->boot_id = boot_id;
215
216         journal_file_set_online(f);
217
218         /* Sync the online state to disk */
219         fsync(f->fd);
220
221         return 0;
222 }
223
224 static int journal_file_verify_header(JournalFile *f) {
225         uint32_t flags;
226
227         assert(f);
228
229         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
230                 return -EBADMSG;
231
232         /* In both read and write mode we refuse to open files with
233          * incompatible flags we don't know */
234         flags = le32toh(f->header->incompatible_flags);
235         if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
236                 if (flags & ~HEADER_INCOMPATIBLE_ANY)
237                         log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
238                                   f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
239                 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
240                 if (flags)
241                         log_debug("Journal file %s uses incompatible flags %"PRIx32
242                                   " disabled at compilation time.", f->path, flags);
243                 return -EPROTONOSUPPORT;
244         }
245
246         /* When open for writing we refuse to open files with
247          * compatible flags, too */
248         flags = le32toh(f->header->compatible_flags);
249         if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
250                 if (flags & ~HEADER_COMPATIBLE_ANY)
251                         log_debug("Journal file %s has unknown compatible flags %"PRIx32,
252                                   f->path, flags & ~HEADER_COMPATIBLE_ANY);
253                 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
254                 if (flags)
255                         log_debug("Journal file %s uses compatible flags %"PRIx32
256                                   " disabled at compilation time.", f->path, flags);
257                 return -EPROTONOSUPPORT;
258         }
259
260         if (f->header->state >= _STATE_MAX)
261                 return -EBADMSG;
262
263         /* The first addition was n_data, so check that we are at least this large */
264         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
265                 return -EBADMSG;
266
267         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
268                 return -EBADMSG;
269
270         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
271                 return -ENODATA;
272
273         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
274                 return -ENODATA;
275
276         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
277             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
278             !VALID64(le64toh(f->header->tail_object_offset)) ||
279             !VALID64(le64toh(f->header->entry_array_offset)))
280                 return -ENODATA;
281
282         if (f->writable) {
283                 uint8_t state;
284                 sd_id128_t machine_id;
285                 int r;
286
287                 r = sd_id128_get_machine(&machine_id);
288                 if (r < 0)
289                         return r;
290
291                 if (!sd_id128_equal(machine_id, f->header->machine_id))
292                         return -EHOSTDOWN;
293
294                 state = f->header->state;
295
296                 if (state == STATE_ONLINE) {
297                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
298                         return -EBUSY;
299                 } else if (state == STATE_ARCHIVED)
300                         return -ESHUTDOWN;
301                 else if (state != STATE_OFFLINE) {
302                         log_debug("Journal file %s has unknown state %u.", f->path, state);
303                         return -EBUSY;
304                 }
305         }
306
307         f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
308         f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
309
310         f->seal = JOURNAL_HEADER_SEALED(f->header);
311
312         return 0;
313 }
314
315 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
316         uint64_t old_size, new_size;
317         int r;
318
319         assert(f);
320
321         /* We assume that this file is not sparse, and we know that
322          * for sure, since we always call posix_fallocate()
323          * ourselves */
324
325         old_size =
326                 le64toh(f->header->header_size) +
327                 le64toh(f->header->arena_size);
328
329         new_size = PAGE_ALIGN(offset + size);
330         if (new_size < le64toh(f->header->header_size))
331                 new_size = le64toh(f->header->header_size);
332
333         if (new_size <= old_size)
334                 return 0;
335
336         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
337                 return -E2BIG;
338
339         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
340                 struct statvfs svfs;
341
342                 if (fstatvfs(f->fd, &svfs) >= 0) {
343                         uint64_t available;
344
345                         available = svfs.f_bfree * svfs.f_bsize;
346
347                         if (available >= f->metrics.keep_free)
348                                 available -= f->metrics.keep_free;
349                         else
350                                 available = 0;
351
352                         if (new_size - old_size > available)
353                                 return -E2BIG;
354                 }
355         }
356
357         /* Increase by larger blocks at once */
358         new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
359         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
360                 new_size = f->metrics.max_size;
361
362         /* Note that the glibc fallocate() fallback is very
363            inefficient, hence we try to minimize the allocation area
364            as we can. */
365         r = posix_fallocate(f->fd, old_size, new_size - old_size);
366         if (r != 0)
367                 return -r;
368
369         if (fstat(f->fd, &f->last_stat) < 0)
370                 return -errno;
371
372         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
373
374         return 0;
375 }
376
377 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
378         assert(f);
379         assert(ret);
380
381         if (size <= 0)
382                 return -EINVAL;
383
384         /* Avoid SIGBUS on invalid accesses */
385         if (offset + size > (uint64_t) f->last_stat.st_size) {
386                 /* Hmm, out of range? Let's refresh the fstat() data
387                  * first, before we trust that check. */
388
389                 if (fstat(f->fd, &f->last_stat) < 0 ||
390                     offset + size > (uint64_t) f->last_stat.st_size)
391                         return -EADDRNOTAVAIL;
392         }
393
394         return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
395 }
396
397 static uint64_t minimum_header_size(Object *o) {
398
399         static const uint64_t table[] = {
400                 [OBJECT_DATA] = sizeof(DataObject),
401                 [OBJECT_FIELD] = sizeof(FieldObject),
402                 [OBJECT_ENTRY] = sizeof(EntryObject),
403                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
404                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
405                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
406                 [OBJECT_TAG] = sizeof(TagObject),
407         };
408
409         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
410                 return sizeof(ObjectHeader);
411
412         return table[o->object.type];
413 }
414
415 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
416         int r;
417         void *t;
418         Object *o;
419         uint64_t s;
420
421         assert(f);
422         assert(ret);
423
424         /* Objects may only be located at multiple of 64 bit */
425         if (!VALID64(offset))
426                 return -EFAULT;
427
428
429         r = journal_file_move_to(f, type_to_context(type), false, offset, sizeof(ObjectHeader), &t);
430         if (r < 0)
431                 return r;
432
433         o = (Object*) t;
434         s = le64toh(o->object.size);
435
436         if (s < sizeof(ObjectHeader))
437                 return -EBADMSG;
438
439         if (o->object.type <= OBJECT_UNUSED)
440                 return -EBADMSG;
441
442         if (s < minimum_header_size(o))
443                 return -EBADMSG;
444
445         if (type > 0 && o->object.type != type)
446                 return -EBADMSG;
447
448         if (s > sizeof(ObjectHeader)) {
449                 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
450                 if (r < 0)
451                         return r;
452
453                 o = (Object*) t;
454         }
455
456         *ret = o;
457         return 0;
458 }
459
460 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
461         uint64_t r;
462
463         assert(f);
464
465         r = le64toh(f->header->tail_entry_seqnum) + 1;
466
467         if (seqnum) {
468                 /* If an external seqnum counter was passed, we update
469                  * both the local and the external one, and set it to
470                  * the maximum of both */
471
472                 if (*seqnum + 1 > r)
473                         r = *seqnum + 1;
474
475                 *seqnum = r;
476         }
477
478         f->header->tail_entry_seqnum = htole64(r);
479
480         if (f->header->head_entry_seqnum == 0)
481                 f->header->head_entry_seqnum = htole64(r);
482
483         return r;
484 }
485
486 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
487         int r;
488         uint64_t p;
489         Object *tail, *o;
490         void *t;
491
492         assert(f);
493         assert(type > 0 && type < _OBJECT_TYPE_MAX);
494         assert(size >= sizeof(ObjectHeader));
495         assert(offset);
496         assert(ret);
497
498         r = journal_file_set_online(f);
499         if (r < 0)
500                 return r;
501
502         p = le64toh(f->header->tail_object_offset);
503         if (p == 0)
504                 p = le64toh(f->header->header_size);
505         else {
506                 r = journal_file_move_to_object(f, -1, p, &tail);
507                 if (r < 0)
508                         return r;
509
510                 p += ALIGN64(le64toh(tail->object.size));
511         }
512
513         r = journal_file_allocate(f, p, size);
514         if (r < 0)
515                 return r;
516
517         r = journal_file_move_to(f, type, false, p, size, &t);
518         if (r < 0)
519                 return r;
520
521         o = (Object*) t;
522
523         zero(o->object);
524         o->object.type = type;
525         o->object.size = htole64(size);
526
527         f->header->tail_object_offset = htole64(p);
528         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
529
530         *ret = o;
531         *offset = p;
532
533         return 0;
534 }
535
536 static int journal_file_setup_data_hash_table(JournalFile *f) {
537         uint64_t s, p;
538         Object *o;
539         int r;
540
541         assert(f);
542
543         /* We estimate that we need 1 hash table entry per 768 of
544            journal file and we want to make sure we never get beyond
545            75% fill level. Calculate the hash table size for the
546            maximum file size based on these metrics. */
547
548         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
549         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
550                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
551
552         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
553
554         r = journal_file_append_object(f,
555                                        OBJECT_DATA_HASH_TABLE,
556                                        offsetof(Object, hash_table.items) + s,
557                                        &o, &p);
558         if (r < 0)
559                 return r;
560
561         memzero(o->hash_table.items, s);
562
563         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
564         f->header->data_hash_table_size = htole64(s);
565
566         return 0;
567 }
568
569 static int journal_file_setup_field_hash_table(JournalFile *f) {
570         uint64_t s, p;
571         Object *o;
572         int r;
573
574         assert(f);
575
576         /* We use a fixed size hash table for the fields as this
577          * number should grow very slowly only */
578
579         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
580         r = journal_file_append_object(f,
581                                        OBJECT_FIELD_HASH_TABLE,
582                                        offsetof(Object, hash_table.items) + s,
583                                        &o, &p);
584         if (r < 0)
585                 return r;
586
587         memzero(o->hash_table.items, s);
588
589         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
590         f->header->field_hash_table_size = htole64(s);
591
592         return 0;
593 }
594
595 static int journal_file_map_data_hash_table(JournalFile *f) {
596         uint64_t s, p;
597         void *t;
598         int r;
599
600         assert(f);
601
602         p = le64toh(f->header->data_hash_table_offset);
603         s = le64toh(f->header->data_hash_table_size);
604
605         r = journal_file_move_to(f,
606                                  OBJECT_DATA_HASH_TABLE,
607                                  true,
608                                  p, s,
609                                  &t);
610         if (r < 0)
611                 return r;
612
613         f->data_hash_table = t;
614         return 0;
615 }
616
617 static int journal_file_map_field_hash_table(JournalFile *f) {
618         uint64_t s, p;
619         void *t;
620         int r;
621
622         assert(f);
623
624         p = le64toh(f->header->field_hash_table_offset);
625         s = le64toh(f->header->field_hash_table_size);
626
627         r = journal_file_move_to(f,
628                                  OBJECT_FIELD_HASH_TABLE,
629                                  true,
630                                  p, s,
631                                  &t);
632         if (r < 0)
633                 return r;
634
635         f->field_hash_table = t;
636         return 0;
637 }
638
639 static int journal_file_link_field(
640                 JournalFile *f,
641                 Object *o,
642                 uint64_t offset,
643                 uint64_t hash) {
644
645         uint64_t p, h;
646         int r;
647
648         assert(f);
649         assert(o);
650         assert(offset > 0);
651
652         if (o->object.type != OBJECT_FIELD)
653                 return -EINVAL;
654
655         /* This might alter the window we are looking at */
656
657         o->field.next_hash_offset = o->field.head_data_offset = 0;
658
659         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
660         p = le64toh(f->field_hash_table[h].tail_hash_offset);
661         if (p == 0)
662                 f->field_hash_table[h].head_hash_offset = htole64(offset);
663         else {
664                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
665                 if (r < 0)
666                         return r;
667
668                 o->field.next_hash_offset = htole64(offset);
669         }
670
671         f->field_hash_table[h].tail_hash_offset = htole64(offset);
672
673         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
674                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
675
676         return 0;
677 }
678
679 static int journal_file_link_data(
680                 JournalFile *f,
681                 Object *o,
682                 uint64_t offset,
683                 uint64_t hash) {
684
685         uint64_t p, h;
686         int r;
687
688         assert(f);
689         assert(o);
690         assert(offset > 0);
691
692         if (o->object.type != OBJECT_DATA)
693                 return -EINVAL;
694
695         /* This might alter the window we are looking at */
696
697         o->data.next_hash_offset = o->data.next_field_offset = 0;
698         o->data.entry_offset = o->data.entry_array_offset = 0;
699         o->data.n_entries = 0;
700
701         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
702         p = le64toh(f->data_hash_table[h].tail_hash_offset);
703         if (p == 0)
704                 /* Only entry in the hash table is easy */
705                 f->data_hash_table[h].head_hash_offset = htole64(offset);
706         else {
707                 /* Move back to the previous data object, to patch in
708                  * pointer */
709
710                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
711                 if (r < 0)
712                         return r;
713
714                 o->data.next_hash_offset = htole64(offset);
715         }
716
717         f->data_hash_table[h].tail_hash_offset = htole64(offset);
718
719         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
720                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
721
722         return 0;
723 }
724
725 int journal_file_find_field_object_with_hash(
726                 JournalFile *f,
727                 const void *field, uint64_t size, uint64_t hash,
728                 Object **ret, uint64_t *offset) {
729
730         uint64_t p, osize, h;
731         int r;
732
733         assert(f);
734         assert(field && size > 0);
735
736         osize = offsetof(Object, field.payload) + size;
737
738         if (f->header->field_hash_table_size == 0)
739                 return -EBADMSG;
740
741         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
742         p = le64toh(f->field_hash_table[h].head_hash_offset);
743
744         while (p > 0) {
745                 Object *o;
746
747                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
748                 if (r < 0)
749                         return r;
750
751                 if (le64toh(o->field.hash) == hash &&
752                     le64toh(o->object.size) == osize &&
753                     memcmp(o->field.payload, field, size) == 0) {
754
755                         if (ret)
756                                 *ret = o;
757                         if (offset)
758                                 *offset = p;
759
760                         return 1;
761                 }
762
763                 p = le64toh(o->field.next_hash_offset);
764         }
765
766         return 0;
767 }
768
769 int journal_file_find_field_object(
770                 JournalFile *f,
771                 const void *field, uint64_t size,
772                 Object **ret, uint64_t *offset) {
773
774         uint64_t hash;
775
776         assert(f);
777         assert(field && size > 0);
778
779         hash = hash64(field, size);
780
781         return journal_file_find_field_object_with_hash(f,
782                                                         field, size, hash,
783                                                         ret, offset);
784 }
785
786 int journal_file_find_data_object_with_hash(
787                 JournalFile *f,
788                 const void *data, uint64_t size, uint64_t hash,
789                 Object **ret, uint64_t *offset) {
790
791         uint64_t p, osize, h;
792         int r;
793
794         assert(f);
795         assert(data || size == 0);
796
797         osize = offsetof(Object, data.payload) + size;
798
799         if (f->header->data_hash_table_size == 0)
800                 return -EBADMSG;
801
802         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
803         p = le64toh(f->data_hash_table[h].head_hash_offset);
804
805         while (p > 0) {
806                 Object *o;
807
808                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
809                 if (r < 0)
810                         return r;
811
812                 if (le64toh(o->data.hash) != hash)
813                         goto next;
814
815                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
816 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
817                         uint64_t l;
818                         size_t rsize;
819
820                         l = le64toh(o->object.size);
821                         if (l <= offsetof(Object, data.payload))
822                                 return -EBADMSG;
823
824                         l -= offsetof(Object, data.payload);
825
826                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
827                                             o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
828                         if (r < 0)
829                                 return r;
830
831                         if (rsize == size &&
832                             memcmp(f->compress_buffer, data, size) == 0) {
833
834                                 if (ret)
835                                         *ret = o;
836
837                                 if (offset)
838                                         *offset = p;
839
840                                 return 1;
841                         }
842 #else
843                         return -EPROTONOSUPPORT;
844 #endif
845                 } else if (le64toh(o->object.size) == osize &&
846                            memcmp(o->data.payload, data, size) == 0) {
847
848                         if (ret)
849                                 *ret = o;
850
851                         if (offset)
852                                 *offset = p;
853
854                         return 1;
855                 }
856
857         next:
858                 p = le64toh(o->data.next_hash_offset);
859         }
860
861         return 0;
862 }
863
864 int journal_file_find_data_object(
865                 JournalFile *f,
866                 const void *data, uint64_t size,
867                 Object **ret, uint64_t *offset) {
868
869         uint64_t hash;
870
871         assert(f);
872         assert(data || size == 0);
873
874         hash = hash64(data, size);
875
876         return journal_file_find_data_object_with_hash(f,
877                                                        data, size, hash,
878                                                        ret, offset);
879 }
880
881 static int journal_file_append_field(
882                 JournalFile *f,
883                 const void *field, uint64_t size,
884                 Object **ret, uint64_t *offset) {
885
886         uint64_t hash, p;
887         uint64_t osize;
888         Object *o;
889         int r;
890
891         assert(f);
892         assert(field && size > 0);
893
894         hash = hash64(field, size);
895
896         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
897         if (r < 0)
898                 return r;
899         else if (r > 0) {
900
901                 if (ret)
902                         *ret = o;
903
904                 if (offset)
905                         *offset = p;
906
907                 return 0;
908         }
909
910         osize = offsetof(Object, field.payload) + size;
911         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
912         if (r < 0)
913                 return r;
914
915         o->field.hash = htole64(hash);
916         memcpy(o->field.payload, field, size);
917
918         r = journal_file_link_field(f, o, p, hash);
919         if (r < 0)
920                 return r;
921
922         /* The linking might have altered the window, so let's
923          * refresh our pointer */
924         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
925         if (r < 0)
926                 return r;
927
928 #ifdef HAVE_GCRYPT
929         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
930         if (r < 0)
931                 return r;
932 #endif
933
934         if (ret)
935                 *ret = o;
936
937         if (offset)
938                 *offset = p;
939
940         return 0;
941 }
942
943 static int journal_file_append_data(
944                 JournalFile *f,
945                 const void *data, uint64_t size,
946                 Object **ret, uint64_t *offset) {
947
948         uint64_t hash, p;
949         uint64_t osize;
950         Object *o;
951         int r, compression = 0;
952         const void *eq;
953
954         assert(f);
955         assert(data || size == 0);
956
957         hash = hash64(data, size);
958
959         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
960         if (r < 0)
961                 return r;
962         else if (r > 0) {
963
964                 if (ret)
965                         *ret = o;
966
967                 if (offset)
968                         *offset = p;
969
970                 return 0;
971         }
972
973         osize = offsetof(Object, data.payload) + size;
974         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
975         if (r < 0)
976                 return r;
977
978         o->data.hash = htole64(hash);
979
980 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
981         if (f->compress_xz &&
982             size >= COMPRESSION_SIZE_THRESHOLD) {
983                 size_t rsize;
984
985                 compression = compress_blob(data, size, o->data.payload, &rsize);
986
987                 if (compression) {
988                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
989                         o->object.flags |= compression;
990
991                         log_debug("Compressed data object %"PRIu64" -> %zu using %s",
992                                   size, rsize, object_compressed_to_string(compression));
993                 }
994         }
995 #endif
996
997         if (!compression && size > 0)
998                 memcpy(o->data.payload, data, size);
999
1000         r = journal_file_link_data(f, o, p, hash);
1001         if (r < 0)
1002                 return r;
1003
1004         /* The linking might have altered the window, so let's
1005          * refresh our pointer */
1006         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1007         if (r < 0)
1008                 return r;
1009
1010         if (!data)
1011                 eq = NULL;
1012         else
1013                 eq = memchr(data, '=', size);
1014         if (eq && eq > data) {
1015                 Object *fo = NULL;
1016                 uint64_t fp;
1017
1018                 /* Create field object ... */
1019                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1020                 if (r < 0)
1021                         return r;
1022
1023                 /* ... and link it in. */
1024                 o->data.next_field_offset = fo->field.head_data_offset;
1025                 fo->field.head_data_offset = le64toh(p);
1026         }
1027
1028 #ifdef HAVE_GCRYPT
1029         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1030         if (r < 0)
1031                 return r;
1032 #endif
1033
1034         if (ret)
1035                 *ret = o;
1036
1037         if (offset)
1038                 *offset = p;
1039
1040         return 0;
1041 }
1042
1043 uint64_t journal_file_entry_n_items(Object *o) {
1044         assert(o);
1045
1046         if (o->object.type != OBJECT_ENTRY)
1047                 return 0;
1048
1049         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1050 }
1051
1052 uint64_t journal_file_entry_array_n_items(Object *o) {
1053         assert(o);
1054
1055         if (o->object.type != OBJECT_ENTRY_ARRAY)
1056                 return 0;
1057
1058         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1059 }
1060
1061 uint64_t journal_file_hash_table_n_items(Object *o) {
1062         assert(o);
1063
1064         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1065             o->object.type != OBJECT_FIELD_HASH_TABLE)
1066                 return 0;
1067
1068         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1069 }
1070
1071 static int link_entry_into_array(JournalFile *f,
1072                                  le64_t *first,
1073                                  le64_t *idx,
1074                                  uint64_t p) {
1075         int r;
1076         uint64_t n = 0, ap = 0, q, i, a, hidx;
1077         Object *o;
1078
1079         assert(f);
1080         assert(first);
1081         assert(idx);
1082         assert(p > 0);
1083
1084         a = le64toh(*first);
1085         i = hidx = le64toh(*idx);
1086         while (a > 0) {
1087
1088                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1089                 if (r < 0)
1090                         return r;
1091
1092                 n = journal_file_entry_array_n_items(o);
1093                 if (i < n) {
1094                         o->entry_array.items[i] = htole64(p);
1095                         *idx = htole64(hidx + 1);
1096                         return 0;
1097                 }
1098
1099                 i -= n;
1100                 ap = a;
1101                 a = le64toh(o->entry_array.next_entry_array_offset);
1102         }
1103
1104         if (hidx > n)
1105                 n = (hidx+1) * 2;
1106         else
1107                 n = n * 2;
1108
1109         if (n < 4)
1110                 n = 4;
1111
1112         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1113                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1114                                        &o, &q);
1115         if (r < 0)
1116                 return r;
1117
1118 #ifdef HAVE_GCRYPT
1119         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1120         if (r < 0)
1121                 return r;
1122 #endif
1123
1124         o->entry_array.items[i] = htole64(p);
1125
1126         if (ap == 0)
1127                 *first = htole64(q);
1128         else {
1129                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1130                 if (r < 0)
1131                         return r;
1132
1133                 o->entry_array.next_entry_array_offset = htole64(q);
1134         }
1135
1136         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1137                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1138
1139         *idx = htole64(hidx + 1);
1140
1141         return 0;
1142 }
1143
1144 static int link_entry_into_array_plus_one(JournalFile *f,
1145                                           le64_t *extra,
1146                                           le64_t *first,
1147                                           le64_t *idx,
1148                                           uint64_t p) {
1149
1150         int r;
1151
1152         assert(f);
1153         assert(extra);
1154         assert(first);
1155         assert(idx);
1156         assert(p > 0);
1157
1158         if (*idx == 0)
1159                 *extra = htole64(p);
1160         else {
1161                 le64_t i;
1162
1163                 i = htole64(le64toh(*idx) - 1);
1164                 r = link_entry_into_array(f, first, &i, p);
1165                 if (r < 0)
1166                         return r;
1167         }
1168
1169         *idx = htole64(le64toh(*idx) + 1);
1170         return 0;
1171 }
1172
1173 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1174         uint64_t p;
1175         int r;
1176         assert(f);
1177         assert(o);
1178         assert(offset > 0);
1179
1180         p = le64toh(o->entry.items[i].object_offset);
1181         if (p == 0)
1182                 return -EINVAL;
1183
1184         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1185         if (r < 0)
1186                 return r;
1187
1188         return link_entry_into_array_plus_one(f,
1189                                               &o->data.entry_offset,
1190                                               &o->data.entry_array_offset,
1191                                               &o->data.n_entries,
1192                                               offset);
1193 }
1194
1195 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1196         uint64_t n, i;
1197         int r;
1198
1199         assert(f);
1200         assert(o);
1201         assert(offset > 0);
1202
1203         if (o->object.type != OBJECT_ENTRY)
1204                 return -EINVAL;
1205
1206         __sync_synchronize();
1207
1208         /* Link up the entry itself */
1209         r = link_entry_into_array(f,
1210                                   &f->header->entry_array_offset,
1211                                   &f->header->n_entries,
1212                                   offset);
1213         if (r < 0)
1214                 return r;
1215
1216         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1217
1218         if (f->header->head_entry_realtime == 0)
1219                 f->header->head_entry_realtime = o->entry.realtime;
1220
1221         f->header->tail_entry_realtime = o->entry.realtime;
1222         f->header->tail_entry_monotonic = o->entry.monotonic;
1223
1224         f->tail_entry_monotonic_valid = true;
1225
1226         /* Link up the items */
1227         n = journal_file_entry_n_items(o);
1228         for (i = 0; i < n; i++) {
1229                 r = journal_file_link_entry_item(f, o, offset, i);
1230                 if (r < 0)
1231                         return r;
1232         }
1233
1234         return 0;
1235 }
1236
1237 static int journal_file_append_entry_internal(
1238                 JournalFile *f,
1239                 const dual_timestamp *ts,
1240                 uint64_t xor_hash,
1241                 const EntryItem items[], unsigned n_items,
1242                 uint64_t *seqnum,
1243                 Object **ret, uint64_t *offset) {
1244         uint64_t np;
1245         uint64_t osize;
1246         Object *o;
1247         int r;
1248
1249         assert(f);
1250         assert(items || n_items == 0);
1251         assert(ts);
1252
1253         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1254
1255         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1256         if (r < 0)
1257                 return r;
1258
1259         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1260         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1261         o->entry.realtime = htole64(ts->realtime);
1262         o->entry.monotonic = htole64(ts->monotonic);
1263         o->entry.xor_hash = htole64(xor_hash);
1264         o->entry.boot_id = f->header->boot_id;
1265
1266 #ifdef HAVE_GCRYPT
1267         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1268         if (r < 0)
1269                 return r;
1270 #endif
1271
1272         r = journal_file_link_entry(f, o, np);
1273         if (r < 0)
1274                 return r;
1275
1276         if (ret)
1277                 *ret = o;
1278
1279         if (offset)
1280                 *offset = np;
1281
1282         return 0;
1283 }
1284
1285 void journal_file_post_change(JournalFile *f) {
1286         assert(f);
1287
1288         /* inotify() does not receive IN_MODIFY events from file
1289          * accesses done via mmap(). After each access we hence
1290          * trigger IN_MODIFY by truncating the journal file to its
1291          * current size which triggers IN_MODIFY. */
1292
1293         __sync_synchronize();
1294
1295         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1296                 log_error("Failed to truncate file to its own size: %m");
1297 }
1298
1299 static int entry_item_cmp(const void *_a, const void *_b) {
1300         const EntryItem *a = _a, *b = _b;
1301
1302         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1303                 return -1;
1304         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1305                 return 1;
1306         return 0;
1307 }
1308
1309 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1310         unsigned i;
1311         EntryItem *items;
1312         int r;
1313         uint64_t xor_hash = 0;
1314         struct dual_timestamp _ts;
1315
1316         assert(f);
1317         assert(iovec || n_iovec == 0);
1318
1319         if (!ts) {
1320                 dual_timestamp_get(&_ts);
1321                 ts = &_ts;
1322         }
1323
1324         if (f->tail_entry_monotonic_valid &&
1325             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1326                 return -EINVAL;
1327
1328 #ifdef HAVE_GCRYPT
1329         r = journal_file_maybe_append_tag(f, ts->realtime);
1330         if (r < 0)
1331                 return r;
1332 #endif
1333
1334         /* alloca() can't take 0, hence let's allocate at least one */
1335         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1336
1337         for (i = 0; i < n_iovec; i++) {
1338                 uint64_t p;
1339                 Object *o;
1340
1341                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1342                 if (r < 0)
1343                         return r;
1344
1345                 xor_hash ^= le64toh(o->data.hash);
1346                 items[i].object_offset = htole64(p);
1347                 items[i].hash = o->data.hash;
1348         }
1349
1350         /* Order by the position on disk, in order to improve seek
1351          * times for rotating media. */
1352         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1353
1354         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1355
1356         journal_file_post_change(f);
1357
1358         return r;
1359 }
1360
1361 typedef struct ChainCacheItem {
1362         uint64_t first; /* the array at the beginning of the chain */
1363         uint64_t array; /* the cached array */
1364         uint64_t begin; /* the first item in the cached array */
1365         uint64_t total; /* the total number of items in all arrays before this one in the chain */
1366         uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
1367 } ChainCacheItem;
1368
1369 static void chain_cache_put(
1370                 Hashmap *h,
1371                 ChainCacheItem *ci,
1372                 uint64_t first,
1373                 uint64_t array,
1374                 uint64_t begin,
1375                 uint64_t total,
1376                 uint64_t last_index) {
1377
1378         if (!ci) {
1379                 /* If the chain item to cache for this chain is the
1380                  * first one it's not worth caching anything */
1381                 if (array == first)
1382                         return;
1383
1384                 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1385                         ci = hashmap_steal_first(h);
1386                 else {
1387                         ci = new(ChainCacheItem, 1);
1388                         if (!ci)
1389                                 return;
1390                 }
1391
1392                 ci->first = first;
1393
1394                 if (hashmap_put(h, &ci->first, ci) < 0) {
1395                         free(ci);
1396                         return;
1397                 }
1398         } else
1399                 assert(ci->first == first);
1400
1401         ci->array = array;
1402         ci->begin = begin;
1403         ci->total = total;
1404         ci->last_index = last_index;
1405 }
1406
1407 static int generic_array_get(
1408                 JournalFile *f,
1409                 uint64_t first,
1410                 uint64_t i,
1411                 Object **ret, uint64_t *offset) {
1412
1413         Object *o;
1414         uint64_t p = 0, a, t = 0;
1415         int r;
1416         ChainCacheItem *ci;
1417
1418         assert(f);
1419
1420         a = first;
1421
1422         /* Try the chain cache first */
1423         ci = hashmap_get(f->chain_cache, &first);
1424         if (ci && i > ci->total) {
1425                 a = ci->array;
1426                 i -= ci->total;
1427                 t = ci->total;
1428         }
1429
1430         while (a > 0) {
1431                 uint64_t k;
1432
1433                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1434                 if (r < 0)
1435                         return r;
1436
1437                 k = journal_file_entry_array_n_items(o);
1438                 if (i < k) {
1439                         p = le64toh(o->entry_array.items[i]);
1440                         goto found;
1441                 }
1442
1443                 i -= k;
1444                 t += k;
1445                 a = le64toh(o->entry_array.next_entry_array_offset);
1446         }
1447
1448         return 0;
1449
1450 found:
1451         /* Let's cache this item for the next invocation */
1452         chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1453
1454         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1455         if (r < 0)
1456                 return r;
1457
1458         if (ret)
1459                 *ret = o;
1460
1461         if (offset)
1462                 *offset = p;
1463
1464         return 1;
1465 }
1466
1467 static int generic_array_get_plus_one(
1468                 JournalFile *f,
1469                 uint64_t extra,
1470                 uint64_t first,
1471                 uint64_t i,
1472                 Object **ret, uint64_t *offset) {
1473
1474         Object *o;
1475
1476         assert(f);
1477
1478         if (i == 0) {
1479                 int r;
1480
1481                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1482                 if (r < 0)
1483                         return r;
1484
1485                 if (ret)
1486                         *ret = o;
1487
1488                 if (offset)
1489                         *offset = extra;
1490
1491                 return 1;
1492         }
1493
1494         return generic_array_get(f, first, i-1, ret, offset);
1495 }
1496
1497 enum {
1498         TEST_FOUND,
1499         TEST_LEFT,
1500         TEST_RIGHT
1501 };
1502
1503 static int generic_array_bisect(
1504                 JournalFile *f,
1505                 uint64_t first,
1506                 uint64_t n,
1507                 uint64_t needle,
1508                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1509                 direction_t direction,
1510                 Object **ret,
1511                 uint64_t *offset,
1512                 uint64_t *idx) {
1513
1514         uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1515         bool subtract_one = false;
1516         Object *o, *array = NULL;
1517         int r;
1518         ChainCacheItem *ci;
1519
1520         assert(f);
1521         assert(test_object);
1522
1523         /* Start with the first array in the chain */
1524         a = first;
1525
1526         ci = hashmap_get(f->chain_cache, &first);
1527         if (ci && n > ci->total) {
1528                 /* Ah, we have iterated this bisection array chain
1529                  * previously! Let's see if we can skip ahead in the
1530                  * chain, as far as the last time. But we can't jump
1531                  * backwards in the chain, so let's check that
1532                  * first. */
1533
1534                 r = test_object(f, ci->begin, needle);
1535                 if (r < 0)
1536                         return r;
1537
1538                 if (r == TEST_LEFT) {
1539                         /* OK, what we are looking for is right of the
1540                          * begin of this EntryArray, so let's jump
1541                          * straight to previously cached array in the
1542                          * chain */
1543
1544                         a = ci->array;
1545                         n -= ci->total;
1546                         t = ci->total;
1547                         last_index = ci->last_index;
1548                 }
1549         }
1550
1551         while (a > 0) {
1552                 uint64_t left, right, k, lp;
1553
1554                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1555                 if (r < 0)
1556                         return r;
1557
1558                 k = journal_file_entry_array_n_items(array);
1559                 right = MIN(k, n);
1560                 if (right <= 0)
1561                         return 0;
1562
1563                 i = right - 1;
1564                 lp = p = le64toh(array->entry_array.items[i]);
1565                 if (p <= 0)
1566                         return -EBADMSG;
1567
1568                 r = test_object(f, p, needle);
1569                 if (r < 0)
1570                         return r;
1571
1572                 if (r == TEST_FOUND)
1573                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1574
1575                 if (r == TEST_RIGHT) {
1576                         left = 0;
1577                         right -= 1;
1578
1579                         if (last_index != (uint64_t) -1) {
1580                                 assert(last_index <= right);
1581
1582                                 /* If we cached the last index we
1583                                  * looked at, let's try to not to jump
1584                                  * too wildly around and see if we can
1585                                  * limit the range to look at early to
1586                                  * the immediate neighbors of the last
1587                                  * index we looked at. */
1588
1589                                 if (last_index > 0) {
1590                                         uint64_t x = last_index - 1;
1591
1592                                         p = le64toh(array->entry_array.items[x]);
1593                                         if (p <= 0)
1594                                                 return -EBADMSG;
1595
1596                                         r = test_object(f, p, needle);
1597                                         if (r < 0)
1598                                                 return r;
1599
1600                                         if (r == TEST_FOUND)
1601                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1602
1603                                         if (r == TEST_RIGHT)
1604                                                 right = x;
1605                                         else
1606                                                 left = x + 1;
1607                                 }
1608
1609                                 if (last_index < right) {
1610                                         uint64_t y = last_index + 1;
1611
1612                                         p = le64toh(array->entry_array.items[y]);
1613                                         if (p <= 0)
1614                                                 return -EBADMSG;
1615
1616                                         r = test_object(f, p, needle);
1617                                         if (r < 0)
1618                                                 return r;
1619
1620                                         if (r == TEST_FOUND)
1621                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1622
1623                                         if (r == TEST_RIGHT)
1624                                                 right = y;
1625                                         else
1626                                                 left = y + 1;
1627                                 }
1628                         }
1629
1630                         for (;;) {
1631                                 if (left == right) {
1632                                         if (direction == DIRECTION_UP)
1633                                                 subtract_one = true;
1634
1635                                         i = left;
1636                                         goto found;
1637                                 }
1638
1639                                 assert(left < right);
1640                                 i = (left + right) / 2;
1641
1642                                 p = le64toh(array->entry_array.items[i]);
1643                                 if (p <= 0)
1644                                         return -EBADMSG;
1645
1646                                 r = test_object(f, p, needle);
1647                                 if (r < 0)
1648                                         return r;
1649
1650                                 if (r == TEST_FOUND)
1651                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1652
1653                                 if (r == TEST_RIGHT)
1654                                         right = i;
1655                                 else
1656                                         left = i + 1;
1657                         }
1658                 }
1659
1660                 if (k > n) {
1661                         if (direction == DIRECTION_UP) {
1662                                 i = n;
1663                                 subtract_one = true;
1664                                 goto found;
1665                         }
1666
1667                         return 0;
1668                 }
1669
1670                 last_p = lp;
1671
1672                 n -= k;
1673                 t += k;
1674                 last_index = (uint64_t) -1;
1675                 a = le64toh(array->entry_array.next_entry_array_offset);
1676         }
1677
1678         return 0;
1679
1680 found:
1681         if (subtract_one && t == 0 && i == 0)
1682                 return 0;
1683
1684         /* Let's cache this item for the next invocation */
1685         chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1686
1687         if (subtract_one && i == 0)
1688                 p = last_p;
1689         else if (subtract_one)
1690                 p = le64toh(array->entry_array.items[i-1]);
1691         else
1692                 p = le64toh(array->entry_array.items[i]);
1693
1694         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1695         if (r < 0)
1696                 return r;
1697
1698         if (ret)
1699                 *ret = o;
1700
1701         if (offset)
1702                 *offset = p;
1703
1704         if (idx)
1705                 *idx = t + i + (subtract_one ? -1 : 0);
1706
1707         return 1;
1708 }
1709
1710
1711 static int generic_array_bisect_plus_one(
1712                 JournalFile *f,
1713                 uint64_t extra,
1714                 uint64_t first,
1715                 uint64_t n,
1716                 uint64_t needle,
1717                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1718                 direction_t direction,
1719                 Object **ret,
1720                 uint64_t *offset,
1721                 uint64_t *idx) {
1722
1723         int r;
1724         bool step_back = false;
1725         Object *o;
1726
1727         assert(f);
1728         assert(test_object);
1729
1730         if (n <= 0)
1731                 return 0;
1732
1733         /* This bisects the array in object 'first', but first checks
1734          * an extra  */
1735         r = test_object(f, extra, needle);
1736         if (r < 0)
1737                 return r;
1738
1739         if (r == TEST_FOUND)
1740                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1741
1742         /* if we are looking with DIRECTION_UP then we need to first
1743            see if in the actual array there is a matching entry, and
1744            return the last one of that. But if there isn't any we need
1745            to return this one. Hence remember this, and return it
1746            below. */
1747         if (r == TEST_LEFT)
1748                 step_back = direction == DIRECTION_UP;
1749
1750         if (r == TEST_RIGHT) {
1751                 if (direction == DIRECTION_DOWN)
1752                         goto found;
1753                 else
1754                         return 0;
1755         }
1756
1757         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1758
1759         if (r == 0 && step_back)
1760                 goto found;
1761
1762         if (r > 0 && idx)
1763                 (*idx) ++;
1764
1765         return r;
1766
1767 found:
1768         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1769         if (r < 0)
1770                 return r;
1771
1772         if (ret)
1773                 *ret = o;
1774
1775         if (offset)
1776                 *offset = extra;
1777
1778         if (idx)
1779                 *idx = 0;
1780
1781         return 1;
1782 }
1783
1784 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1785         assert(f);
1786         assert(p > 0);
1787
1788         if (p == needle)
1789                 return TEST_FOUND;
1790         else if (p < needle)
1791                 return TEST_LEFT;
1792         else
1793                 return TEST_RIGHT;
1794 }
1795
1796 int journal_file_move_to_entry_by_offset(
1797                 JournalFile *f,
1798                 uint64_t p,
1799                 direction_t direction,
1800                 Object **ret,
1801                 uint64_t *offset) {
1802
1803         return generic_array_bisect(f,
1804                                     le64toh(f->header->entry_array_offset),
1805                                     le64toh(f->header->n_entries),
1806                                     p,
1807                                     test_object_offset,
1808                                     direction,
1809                                     ret, offset, NULL);
1810 }
1811
1812
1813 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1814         Object *o;
1815         int r;
1816
1817         assert(f);
1818         assert(p > 0);
1819
1820         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1821         if (r < 0)
1822                 return r;
1823
1824         if (le64toh(o->entry.seqnum) == needle)
1825                 return TEST_FOUND;
1826         else if (le64toh(o->entry.seqnum) < needle)
1827                 return TEST_LEFT;
1828         else
1829                 return TEST_RIGHT;
1830 }
1831
1832 int journal_file_move_to_entry_by_seqnum(
1833                 JournalFile *f,
1834                 uint64_t seqnum,
1835                 direction_t direction,
1836                 Object **ret,
1837                 uint64_t *offset) {
1838
1839         return generic_array_bisect(f,
1840                                     le64toh(f->header->entry_array_offset),
1841                                     le64toh(f->header->n_entries),
1842                                     seqnum,
1843                                     test_object_seqnum,
1844                                     direction,
1845                                     ret, offset, NULL);
1846 }
1847
1848 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1849         Object *o;
1850         int r;
1851
1852         assert(f);
1853         assert(p > 0);
1854
1855         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1856         if (r < 0)
1857                 return r;
1858
1859         if (le64toh(o->entry.realtime) == needle)
1860                 return TEST_FOUND;
1861         else if (le64toh(o->entry.realtime) < needle)
1862                 return TEST_LEFT;
1863         else
1864                 return TEST_RIGHT;
1865 }
1866
1867 int journal_file_move_to_entry_by_realtime(
1868                 JournalFile *f,
1869                 uint64_t realtime,
1870                 direction_t direction,
1871                 Object **ret,
1872                 uint64_t *offset) {
1873
1874         return generic_array_bisect(f,
1875                                     le64toh(f->header->entry_array_offset),
1876                                     le64toh(f->header->n_entries),
1877                                     realtime,
1878                                     test_object_realtime,
1879                                     direction,
1880                                     ret, offset, NULL);
1881 }
1882
1883 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1884         Object *o;
1885         int r;
1886
1887         assert(f);
1888         assert(p > 0);
1889
1890         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1891         if (r < 0)
1892                 return r;
1893
1894         if (le64toh(o->entry.monotonic) == needle)
1895                 return TEST_FOUND;
1896         else if (le64toh(o->entry.monotonic) < needle)
1897                 return TEST_LEFT;
1898         else
1899                 return TEST_RIGHT;
1900 }
1901
1902 static inline int find_data_object_by_boot_id(
1903                 JournalFile *f,
1904                 sd_id128_t boot_id,
1905                 Object **o,
1906                 uint64_t *b) {
1907         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1908
1909         sd_id128_to_string(boot_id, t + 9);
1910         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1911 }
1912
1913 int journal_file_move_to_entry_by_monotonic(
1914                 JournalFile *f,
1915                 sd_id128_t boot_id,
1916                 uint64_t monotonic,
1917                 direction_t direction,
1918                 Object **ret,
1919                 uint64_t *offset) {
1920
1921         Object *o;
1922         int r;
1923
1924         assert(f);
1925
1926         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1927         if (r < 0)
1928                 return r;
1929         if (r == 0)
1930                 return -ENOENT;
1931
1932         return generic_array_bisect_plus_one(f,
1933                                              le64toh(o->data.entry_offset),
1934                                              le64toh(o->data.entry_array_offset),
1935                                              le64toh(o->data.n_entries),
1936                                              monotonic,
1937                                              test_object_monotonic,
1938                                              direction,
1939                                              ret, offset, NULL);
1940 }
1941
1942 int journal_file_next_entry(
1943                 JournalFile *f,
1944                 Object *o, uint64_t p,
1945                 direction_t direction,
1946                 Object **ret, uint64_t *offset) {
1947
1948         uint64_t i, n, ofs;
1949         int r;
1950
1951         assert(f);
1952         assert(p > 0 || !o);
1953
1954         n = le64toh(f->header->n_entries);
1955         if (n <= 0)
1956                 return 0;
1957
1958         if (!o)
1959                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1960         else {
1961                 if (o->object.type != OBJECT_ENTRY)
1962                         return -EINVAL;
1963
1964                 r = generic_array_bisect(f,
1965                                          le64toh(f->header->entry_array_offset),
1966                                          le64toh(f->header->n_entries),
1967                                          p,
1968                                          test_object_offset,
1969                                          DIRECTION_DOWN,
1970                                          NULL, NULL,
1971                                          &i);
1972                 if (r <= 0)
1973                         return r;
1974
1975                 if (direction == DIRECTION_DOWN) {
1976                         if (i >= n - 1)
1977                                 return 0;
1978
1979                         i++;
1980                 } else {
1981                         if (i <= 0)
1982                                 return 0;
1983
1984                         i--;
1985                 }
1986         }
1987
1988         /* And jump to it */
1989         r = generic_array_get(f,
1990                               le64toh(f->header->entry_array_offset),
1991                               i,
1992                               ret, &ofs);
1993         if (r <= 0)
1994                 return r;
1995
1996         if (p > 0 &&
1997             (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
1998                 log_debug("%s: entry array corrupted at entry %"PRIu64,
1999                           f->path, i);
2000                 return -EBADMSG;
2001         }
2002
2003         if (offset)
2004                 *offset = ofs;
2005
2006         return 1;
2007 }
2008
2009 int journal_file_skip_entry(
2010                 JournalFile *f,
2011                 Object *o, uint64_t p,
2012                 int64_t skip,
2013                 Object **ret, uint64_t *offset) {
2014
2015         uint64_t i, n;
2016         int r;
2017
2018         assert(f);
2019         assert(o);
2020         assert(p > 0);
2021
2022         if (o->object.type != OBJECT_ENTRY)
2023                 return -EINVAL;
2024
2025         r = generic_array_bisect(f,
2026                                  le64toh(f->header->entry_array_offset),
2027                                  le64toh(f->header->n_entries),
2028                                  p,
2029                                  test_object_offset,
2030                                  DIRECTION_DOWN,
2031                                  NULL, NULL,
2032                                  &i);
2033         if (r <= 0)
2034                 return r;
2035
2036         /* Calculate new index */
2037         if (skip < 0) {
2038                 if ((uint64_t) -skip >= i)
2039                         i = 0;
2040                 else
2041                         i = i - (uint64_t) -skip;
2042         } else
2043                 i  += (uint64_t) skip;
2044
2045         n = le64toh(f->header->n_entries);
2046         if (n <= 0)
2047                 return -EBADMSG;
2048
2049         if (i >= n)
2050                 i = n-1;
2051
2052         return generic_array_get(f,
2053                                  le64toh(f->header->entry_array_offset),
2054                                  i,
2055                                  ret, offset);
2056 }
2057
2058 int journal_file_next_entry_for_data(
2059                 JournalFile *f,
2060                 Object *o, uint64_t p,
2061                 uint64_t data_offset,
2062                 direction_t direction,
2063                 Object **ret, uint64_t *offset) {
2064
2065         uint64_t n, i;
2066         int r;
2067         Object *d;
2068
2069         assert(f);
2070         assert(p > 0 || !o);
2071
2072         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2073         if (r < 0)
2074                 return r;
2075
2076         n = le64toh(d->data.n_entries);
2077         if (n <= 0)
2078                 return n;
2079
2080         if (!o)
2081                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2082         else {
2083                 if (o->object.type != OBJECT_ENTRY)
2084                         return -EINVAL;
2085
2086                 r = generic_array_bisect_plus_one(f,
2087                                                   le64toh(d->data.entry_offset),
2088                                                   le64toh(d->data.entry_array_offset),
2089                                                   le64toh(d->data.n_entries),
2090                                                   p,
2091                                                   test_object_offset,
2092                                                   DIRECTION_DOWN,
2093                                                   NULL, NULL,
2094                                                   &i);
2095
2096                 if (r <= 0)
2097                         return r;
2098
2099                 if (direction == DIRECTION_DOWN) {
2100                         if (i >= n - 1)
2101                                 return 0;
2102
2103                         i++;
2104                 } else {
2105                         if (i <= 0)
2106                                 return 0;
2107
2108                         i--;
2109                 }
2110
2111         }
2112
2113         return generic_array_get_plus_one(f,
2114                                           le64toh(d->data.entry_offset),
2115                                           le64toh(d->data.entry_array_offset),
2116                                           i,
2117                                           ret, offset);
2118 }
2119
2120 int journal_file_move_to_entry_by_offset_for_data(
2121                 JournalFile *f,
2122                 uint64_t data_offset,
2123                 uint64_t p,
2124                 direction_t direction,
2125                 Object **ret, uint64_t *offset) {
2126
2127         int r;
2128         Object *d;
2129
2130         assert(f);
2131
2132         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2133         if (r < 0)
2134                 return r;
2135
2136         return generic_array_bisect_plus_one(f,
2137                                              le64toh(d->data.entry_offset),
2138                                              le64toh(d->data.entry_array_offset),
2139                                              le64toh(d->data.n_entries),
2140                                              p,
2141                                              test_object_offset,
2142                                              direction,
2143                                              ret, offset, NULL);
2144 }
2145
2146 int journal_file_move_to_entry_by_monotonic_for_data(
2147                 JournalFile *f,
2148                 uint64_t data_offset,
2149                 sd_id128_t boot_id,
2150                 uint64_t monotonic,
2151                 direction_t direction,
2152                 Object **ret, uint64_t *offset) {
2153
2154         Object *o, *d;
2155         int r;
2156         uint64_t b, z;
2157
2158         assert(f);
2159
2160         /* First, seek by time */
2161         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2162         if (r < 0)
2163                 return r;
2164         if (r == 0)
2165                 return -ENOENT;
2166
2167         r = generic_array_bisect_plus_one(f,
2168                                           le64toh(o->data.entry_offset),
2169                                           le64toh(o->data.entry_array_offset),
2170                                           le64toh(o->data.n_entries),
2171                                           monotonic,
2172                                           test_object_monotonic,
2173                                           direction,
2174                                           NULL, &z, NULL);
2175         if (r <= 0)
2176                 return r;
2177
2178         /* And now, continue seeking until we find an entry that
2179          * exists in both bisection arrays */
2180
2181         for (;;) {
2182                 Object *qo;
2183                 uint64_t p, q;
2184
2185                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2186                 if (r < 0)
2187                         return r;
2188
2189                 r = generic_array_bisect_plus_one(f,
2190                                                   le64toh(d->data.entry_offset),
2191                                                   le64toh(d->data.entry_array_offset),
2192                                                   le64toh(d->data.n_entries),
2193                                                   z,
2194                                                   test_object_offset,
2195                                                   direction,
2196                                                   NULL, &p, NULL);
2197                 if (r <= 0)
2198                         return r;
2199
2200                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2201                 if (r < 0)
2202                         return r;
2203
2204                 r = generic_array_bisect_plus_one(f,
2205                                                   le64toh(o->data.entry_offset),
2206                                                   le64toh(o->data.entry_array_offset),
2207                                                   le64toh(o->data.n_entries),
2208                                                   p,
2209                                                   test_object_offset,
2210                                                   direction,
2211                                                   &qo, &q, NULL);
2212
2213                 if (r <= 0)
2214                         return r;
2215
2216                 if (p == q) {
2217                         if (ret)
2218                                 *ret = qo;
2219                         if (offset)
2220                                 *offset = q;
2221
2222                         return 1;
2223                 }
2224
2225                 z = q;
2226         }
2227 }
2228
2229 int journal_file_move_to_entry_by_seqnum_for_data(
2230                 JournalFile *f,
2231                 uint64_t data_offset,
2232                 uint64_t seqnum,
2233                 direction_t direction,
2234                 Object **ret, uint64_t *offset) {
2235
2236         Object *d;
2237         int r;
2238
2239         assert(f);
2240
2241         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2242         if (r < 0)
2243                 return r;
2244
2245         return generic_array_bisect_plus_one(f,
2246                                              le64toh(d->data.entry_offset),
2247                                              le64toh(d->data.entry_array_offset),
2248                                              le64toh(d->data.n_entries),
2249                                              seqnum,
2250                                              test_object_seqnum,
2251                                              direction,
2252                                              ret, offset, NULL);
2253 }
2254
2255 int journal_file_move_to_entry_by_realtime_for_data(
2256                 JournalFile *f,
2257                 uint64_t data_offset,
2258                 uint64_t realtime,
2259                 direction_t direction,
2260                 Object **ret, uint64_t *offset) {
2261
2262         Object *d;
2263         int r;
2264
2265         assert(f);
2266
2267         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2268         if (r < 0)
2269                 return r;
2270
2271         return generic_array_bisect_plus_one(f,
2272                                              le64toh(d->data.entry_offset),
2273                                              le64toh(d->data.entry_array_offset),
2274                                              le64toh(d->data.n_entries),
2275                                              realtime,
2276                                              test_object_realtime,
2277                                              direction,
2278                                              ret, offset, NULL);
2279 }
2280
2281 void journal_file_dump(JournalFile *f) {
2282         Object *o;
2283         int r;
2284         uint64_t p;
2285
2286         assert(f);
2287
2288         journal_file_print_header(f);
2289
2290         p = le64toh(f->header->header_size);
2291         while (p != 0) {
2292                 r = journal_file_move_to_object(f, -1, p, &o);
2293                 if (r < 0)
2294                         goto fail;
2295
2296                 switch (o->object.type) {
2297
2298                 case OBJECT_UNUSED:
2299                         printf("Type: OBJECT_UNUSED\n");
2300                         break;
2301
2302                 case OBJECT_DATA:
2303                         printf("Type: OBJECT_DATA\n");
2304                         break;
2305
2306                 case OBJECT_FIELD:
2307                         printf("Type: OBJECT_FIELD\n");
2308                         break;
2309
2310                 case OBJECT_ENTRY:
2311                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2312                                le64toh(o->entry.seqnum),
2313                                le64toh(o->entry.monotonic),
2314                                le64toh(o->entry.realtime));
2315                         break;
2316
2317                 case OBJECT_FIELD_HASH_TABLE:
2318                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2319                         break;
2320
2321                 case OBJECT_DATA_HASH_TABLE:
2322                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2323                         break;
2324
2325                 case OBJECT_ENTRY_ARRAY:
2326                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2327                         break;
2328
2329                 case OBJECT_TAG:
2330                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2331                                le64toh(o->tag.seqnum),
2332                                le64toh(o->tag.epoch));
2333                         break;
2334
2335                 default:
2336                         printf("Type: unknown (%u)\n", o->object.type);
2337                         break;
2338                 }
2339
2340                 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2341                         printf("Flags: %s\n",
2342                                object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2343
2344                 if (p == le64toh(f->header->tail_object_offset))
2345                         p = 0;
2346                 else
2347                         p = p + ALIGN64(le64toh(o->object.size));
2348         }
2349
2350         return;
2351 fail:
2352         log_error("File corrupt");
2353 }
2354
2355 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2356         const char *x;
2357
2358         x = format_timestamp(buf, l, t);
2359         if (x)
2360                 return x;
2361         return " --- ";
2362 }
2363
2364 void journal_file_print_header(JournalFile *f) {
2365         char a[33], b[33], c[33], d[33];
2366         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2367         struct stat st;
2368         char bytes[FORMAT_BYTES_MAX];
2369
2370         assert(f);
2371
2372         printf("File Path: %s\n"
2373                "File ID: %s\n"
2374                "Machine ID: %s\n"
2375                "Boot ID: %s\n"
2376                "Sequential Number ID: %s\n"
2377                "State: %s\n"
2378                "Compatible Flags:%s%s\n"
2379                "Incompatible Flags:%s%s%s\n"
2380                "Header size: %"PRIu64"\n"
2381                "Arena size: %"PRIu64"\n"
2382                "Data Hash Table Size: %"PRIu64"\n"
2383                "Field Hash Table Size: %"PRIu64"\n"
2384                "Rotate Suggested: %s\n"
2385                "Head Sequential Number: %"PRIu64"\n"
2386                "Tail Sequential Number: %"PRIu64"\n"
2387                "Head Realtime Timestamp: %s\n"
2388                "Tail Realtime Timestamp: %s\n"
2389                "Tail Monotonic Timestamp: %s\n"
2390                "Objects: %"PRIu64"\n"
2391                "Entry Objects: %"PRIu64"\n",
2392                f->path,
2393                sd_id128_to_string(f->header->file_id, a),
2394                sd_id128_to_string(f->header->machine_id, b),
2395                sd_id128_to_string(f->header->boot_id, c),
2396                sd_id128_to_string(f->header->seqnum_id, d),
2397                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2398                f->header->state == STATE_ONLINE ? "ONLINE" :
2399                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2400                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2401                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2402                JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2403                JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2404                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2405                le64toh(f->header->header_size),
2406                le64toh(f->header->arena_size),
2407                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2408                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2409                yes_no(journal_file_rotate_suggested(f, 0)),
2410                le64toh(f->header->head_entry_seqnum),
2411                le64toh(f->header->tail_entry_seqnum),
2412                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2413                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2414                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2415                le64toh(f->header->n_objects),
2416                le64toh(f->header->n_entries));
2417
2418         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2419                 printf("Data Objects: %"PRIu64"\n"
2420                        "Data Hash Table Fill: %.1f%%\n",
2421                        le64toh(f->header->n_data),
2422                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2423
2424         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2425                 printf("Field Objects: %"PRIu64"\n"
2426                        "Field Hash Table Fill: %.1f%%\n",
2427                        le64toh(f->header->n_fields),
2428                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2429
2430         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2431                 printf("Tag Objects: %"PRIu64"\n",
2432                        le64toh(f->header->n_tags));
2433         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2434                 printf("Entry Array Objects: %"PRIu64"\n",
2435                        le64toh(f->header->n_entry_arrays));
2436
2437         if (fstat(f->fd, &st) >= 0)
2438                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2439 }
2440
2441 int journal_file_open(
2442                 const char *fname,
2443                 int flags,
2444                 mode_t mode,
2445                 bool compress,
2446                 bool seal,
2447                 JournalMetrics *metrics,
2448                 MMapCache *mmap_cache,
2449                 JournalFile *template,
2450                 JournalFile **ret) {
2451
2452         JournalFile *f;
2453         int r;
2454         bool newly_created = false;
2455
2456         assert(fname);
2457         assert(ret);
2458
2459         if ((flags & O_ACCMODE) != O_RDONLY &&
2460             (flags & O_ACCMODE) != O_RDWR)
2461                 return -EINVAL;
2462
2463         if (!endswith(fname, ".journal") &&
2464             !endswith(fname, ".journal~"))
2465                 return -EINVAL;
2466
2467         f = new0(JournalFile, 1);
2468         if (!f)
2469                 return -ENOMEM;
2470
2471         f->fd = -1;
2472         f->mode = mode;
2473
2474         f->flags = flags;
2475         f->prot = prot_from_flags(flags);
2476         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2477 #if defined(HAVE_LZ4)
2478         f->compress_lz4 = compress;
2479 #elif defined(HAVE_XZ)
2480         f->compress_xz = compress;
2481 #endif
2482 #ifdef HAVE_GCRYPT
2483         f->seal = seal;
2484 #endif
2485
2486         if (mmap_cache)
2487                 f->mmap = mmap_cache_ref(mmap_cache);
2488         else {
2489                 f->mmap = mmap_cache_new();
2490                 if (!f->mmap) {
2491                         r = -ENOMEM;
2492                         goto fail;
2493                 }
2494         }
2495
2496         f->path = strdup(fname);
2497         if (!f->path) {
2498                 r = -ENOMEM;
2499                 goto fail;
2500         }
2501
2502         f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2503         if (!f->chain_cache) {
2504                 r = -ENOMEM;
2505                 goto fail;
2506         }
2507
2508         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2509         if (f->fd < 0) {
2510                 r = -errno;
2511                 goto fail;
2512         }
2513
2514         if (fstat(f->fd, &f->last_stat) < 0) {
2515                 r = -errno;
2516                 goto fail;
2517         }
2518
2519         if (f->last_stat.st_size == 0 && f->writable) {
2520                 uint64_t crtime;
2521
2522                 /* Let's attach the creation time to the journal file,
2523                  * so that the vacuuming code knows the age of this
2524                  * file even if the file might end up corrupted one
2525                  * day... Ideally we'd just use the creation time many
2526                  * file systems maintain for each file, but there is
2527                  * currently no usable API to query this, hence let's
2528                  * emulate this via extended attributes. If extended
2529                  * attributes are not supported we'll just skip this,
2530                  * and rely solely on mtime/atime/ctime of the file.*/
2531
2532                 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2533                 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2534
2535 #ifdef HAVE_GCRYPT
2536                 /* Try to load the FSPRG state, and if we can't, then
2537                  * just don't do sealing */
2538                 if (f->seal) {
2539                         r = journal_file_fss_load(f);
2540                         if (r < 0)
2541                                 f->seal = false;
2542                 }
2543 #endif
2544
2545                 r = journal_file_init_header(f, template);
2546                 if (r < 0)
2547                         goto fail;
2548
2549                 if (fstat(f->fd, &f->last_stat) < 0) {
2550                         r = -errno;
2551                         goto fail;
2552                 }
2553
2554                 newly_created = true;
2555         }
2556
2557         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2558                 r = -EIO;
2559                 goto fail;
2560         }
2561
2562         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2563         if (f->header == MAP_FAILED) {
2564                 f->header = NULL;
2565                 r = -errno;
2566                 goto fail;
2567         }
2568
2569         if (!newly_created) {
2570                 r = journal_file_verify_header(f);
2571                 if (r < 0)
2572                         goto fail;
2573         }
2574
2575 #ifdef HAVE_GCRYPT
2576         if (!newly_created && f->writable) {
2577                 r = journal_file_fss_load(f);
2578                 if (r < 0)
2579                         goto fail;
2580         }
2581 #endif
2582
2583         if (f->writable) {
2584                 if (metrics) {
2585                         journal_default_metrics(metrics, f->fd);
2586                         f->metrics = *metrics;
2587                 } else if (template)
2588                         f->metrics = template->metrics;
2589
2590                 r = journal_file_refresh_header(f);
2591                 if (r < 0)
2592                         goto fail;
2593         }
2594
2595 #ifdef HAVE_GCRYPT
2596         r = journal_file_hmac_setup(f);
2597         if (r < 0)
2598                 goto fail;
2599 #endif
2600
2601         if (newly_created) {
2602                 r = journal_file_setup_field_hash_table(f);
2603                 if (r < 0)
2604                         goto fail;
2605
2606                 r = journal_file_setup_data_hash_table(f);
2607                 if (r < 0)
2608                         goto fail;
2609
2610 #ifdef HAVE_GCRYPT
2611                 r = journal_file_append_first_tag(f);
2612                 if (r < 0)
2613                         goto fail;
2614 #endif
2615         }
2616
2617         r = journal_file_map_field_hash_table(f);
2618         if (r < 0)
2619                 goto fail;
2620
2621         r = journal_file_map_data_hash_table(f);
2622         if (r < 0)
2623                 goto fail;
2624
2625         *ret = f;
2626         return 0;
2627
2628 fail:
2629         journal_file_close(f);
2630
2631         return r;
2632 }
2633
2634 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2635         _cleanup_free_ char *p = NULL;
2636         size_t l;
2637         JournalFile *old_file, *new_file = NULL;
2638         int r;
2639
2640         assert(f);
2641         assert(*f);
2642
2643         old_file = *f;
2644
2645         if (!old_file->writable)
2646                 return -EINVAL;
2647
2648         if (!endswith(old_file->path, ".journal"))
2649                 return -EINVAL;
2650
2651         l = strlen(old_file->path);
2652         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2653                      (int) l - 8, old_file->path,
2654                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2655                      le64toh((*f)->header->head_entry_seqnum),
2656                      le64toh((*f)->header->head_entry_realtime));
2657         if (r < 0)
2658                 return -ENOMEM;
2659
2660         r = rename(old_file->path, p);
2661         if (r < 0)
2662                 return -errno;
2663
2664         old_file->header->state = STATE_ARCHIVED;
2665
2666         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2667         journal_file_close(old_file);
2668
2669         *f = new_file;
2670         return r;
2671 }
2672
2673 int journal_file_open_reliably(
2674                 const char *fname,
2675                 int flags,
2676                 mode_t mode,
2677                 bool compress,
2678                 bool seal,
2679                 JournalMetrics *metrics,
2680                 MMapCache *mmap_cache,
2681                 JournalFile *template,
2682                 JournalFile **ret) {
2683
2684         int r;
2685         size_t l;
2686         _cleanup_free_ char *p = NULL;
2687
2688         r = journal_file_open(fname, flags, mode, compress, seal,
2689                               metrics, mmap_cache, template, ret);
2690         if (r != -EBADMSG && /* corrupted */
2691             r != -ENODATA && /* truncated */
2692             r != -EHOSTDOWN && /* other machine */
2693             r != -EPROTONOSUPPORT && /* incompatible feature */
2694             r != -EBUSY && /* unclean shutdown */
2695             r != -ESHUTDOWN /* already archived */)
2696                 return r;
2697
2698         if ((flags & O_ACCMODE) == O_RDONLY)
2699                 return r;
2700
2701         if (!(flags & O_CREAT))
2702                 return r;
2703
2704         if (!endswith(fname, ".journal"))
2705                 return r;
2706
2707         /* The file is corrupted. Rotate it away and try it again (but only once) */
2708
2709         l = strlen(fname);
2710         if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2711                      (int) l - 8, fname,
2712                      (unsigned long long) now(CLOCK_REALTIME),
2713                      random_u64()) < 0)
2714                 return -ENOMEM;
2715
2716         r = rename(fname, p);
2717         if (r < 0)
2718                 return -errno;
2719
2720         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2721
2722         return journal_file_open(fname, flags, mode, compress, seal,
2723                                  metrics, mmap_cache, template, ret);
2724 }
2725
2726 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2727         uint64_t i, n;
2728         uint64_t q, xor_hash = 0;
2729         int r;
2730         EntryItem *items;
2731         dual_timestamp ts;
2732
2733         assert(from);
2734         assert(to);
2735         assert(o);
2736         assert(p);
2737
2738         if (!to->writable)
2739                 return -EPERM;
2740
2741         ts.monotonic = le64toh(o->entry.monotonic);
2742         ts.realtime = le64toh(o->entry.realtime);
2743
2744         n = journal_file_entry_n_items(o);
2745         /* alloca() can't take 0, hence let's allocate at least one */
2746         items = alloca(sizeof(EntryItem) * MAX(1u, n));
2747
2748         for (i = 0; i < n; i++) {
2749                 uint64_t l, h;
2750                 le64_t le_hash;
2751                 size_t t;
2752                 void *data;
2753                 Object *u;
2754
2755                 q = le64toh(o->entry.items[i].object_offset);
2756                 le_hash = o->entry.items[i].hash;
2757
2758                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2759                 if (r < 0)
2760                         return r;
2761
2762                 if (le_hash != o->data.hash)
2763                         return -EBADMSG;
2764
2765                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2766                 t = (size_t) l;
2767
2768                 /* We hit the limit on 32bit machines */
2769                 if ((uint64_t) t != l)
2770                         return -E2BIG;
2771
2772                 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2773 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2774                         size_t rsize;
2775
2776                         r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2777                                             o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2778                         if (r < 0)
2779                                 return r;
2780
2781                         data = from->compress_buffer;
2782                         l = rsize;
2783 #else
2784                         return -EPROTONOSUPPORT;
2785 #endif
2786                 } else
2787                         data = o->data.payload;
2788
2789                 r = journal_file_append_data(to, data, l, &u, &h);
2790                 if (r < 0)
2791                         return r;
2792
2793                 xor_hash ^= le64toh(u->data.hash);
2794                 items[i].object_offset = htole64(h);
2795                 items[i].hash = u->data.hash;
2796
2797                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2798                 if (r < 0)
2799                         return r;
2800         }
2801
2802         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2803 }
2804
2805 void journal_default_metrics(JournalMetrics *m, int fd) {
2806         uint64_t fs_size = 0;
2807         struct statvfs ss;
2808         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2809
2810         assert(m);
2811         assert(fd >= 0);
2812
2813         if (fstatvfs(fd, &ss) >= 0)
2814                 fs_size = ss.f_frsize * ss.f_blocks;
2815
2816         if (m->max_use == (uint64_t) -1) {
2817
2818                 if (fs_size > 0) {
2819                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2820
2821                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2822                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2823
2824                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2825                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2826                 } else
2827                         m->max_use = DEFAULT_MAX_USE_LOWER;
2828         } else {
2829                 m->max_use = PAGE_ALIGN(m->max_use);
2830
2831                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2832                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2833         }
2834
2835         if (m->max_size == (uint64_t) -1) {
2836                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2837
2838                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2839                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2840         } else
2841                 m->max_size = PAGE_ALIGN(m->max_size);
2842
2843         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2844                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2845
2846         if (m->max_size*2 > m->max_use)
2847                 m->max_use = m->max_size*2;
2848
2849         if (m->min_size == (uint64_t) -1)
2850                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2851         else {
2852                 m->min_size = PAGE_ALIGN(m->min_size);
2853
2854                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2855                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2856
2857                 if (m->min_size > m->max_size)
2858                         m->max_size = m->min_size;
2859         }
2860
2861         if (m->keep_free == (uint64_t) -1) {
2862
2863                 if (fs_size > 0) {
2864                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2865
2866                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2867                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2868
2869                 } else
2870                         m->keep_free = DEFAULT_KEEP_FREE;
2871         }
2872
2873         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2874                   format_bytes(a, sizeof(a), m->max_use),
2875                   format_bytes(b, sizeof(b), m->max_size),
2876                   format_bytes(c, sizeof(c), m->min_size),
2877                   format_bytes(d, sizeof(d), m->keep_free));
2878 }
2879
2880 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2881         assert(f);
2882         assert(from || to);
2883
2884         if (from) {
2885                 if (f->header->head_entry_realtime == 0)
2886                         return -ENOENT;
2887
2888                 *from = le64toh(f->header->head_entry_realtime);
2889         }
2890
2891         if (to) {
2892                 if (f->header->tail_entry_realtime == 0)
2893                         return -ENOENT;
2894
2895                 *to = le64toh(f->header->tail_entry_realtime);
2896         }
2897
2898         return 1;
2899 }
2900
2901 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2902         Object *o;
2903         uint64_t p;
2904         int r;
2905
2906         assert(f);
2907         assert(from || to);
2908
2909         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2910         if (r <= 0)
2911                 return r;
2912
2913         if (le64toh(o->data.n_entries) <= 0)
2914                 return 0;
2915
2916         if (from) {
2917                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2918                 if (r < 0)
2919                         return r;
2920
2921                 *from = le64toh(o->entry.monotonic);
2922         }
2923
2924         if (to) {
2925                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2926                 if (r < 0)
2927                         return r;
2928
2929                 r = generic_array_get_plus_one(f,
2930                                                le64toh(o->data.entry_offset),
2931                                                le64toh(o->data.entry_array_offset),
2932                                                le64toh(o->data.n_entries)-1,
2933                                                &o, NULL);
2934                 if (r <= 0)
2935                         return r;
2936
2937                 *to = le64toh(o->entry.monotonic);
2938         }
2939
2940         return 1;
2941 }
2942
2943 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2944         assert(f);
2945
2946         /* If we gained new header fields we gained new features,
2947          * hence suggest a rotation */
2948         if (le64toh(f->header->header_size) < sizeof(Header)) {
2949                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2950                 return true;
2951         }
2952
2953         /* Let's check if the hash tables grew over a certain fill
2954          * level (75%, borrowing this value from Java's hash table
2955          * implementation), and if so suggest a rotation. To calculate
2956          * the fill level we need the n_data field, which only exists
2957          * in newer versions. */
2958
2959         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2960                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2961                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2962                                   f->path,
2963                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2964                                   le64toh(f->header->n_data),
2965                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2966                                   (unsigned long long) f->last_stat.st_size,
2967                                   f->last_stat.st_size / le64toh(f->header->n_data));
2968                         return true;
2969                 }
2970
2971         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2972                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2973                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2974                                   f->path,
2975                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2976                                   le64toh(f->header->n_fields),
2977                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2978                         return true;
2979                 }
2980
2981         /* Are the data objects properly indexed by field objects? */
2982         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2983             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2984             le64toh(f->header->n_data) > 0 &&
2985             le64toh(f->header->n_fields) == 0)
2986                 return true;
2987
2988         if (max_file_usec > 0) {
2989                 usec_t t, h;
2990
2991                 h = le64toh(f->header->head_entry_realtime);
2992                 t = now(CLOCK_REALTIME);
2993
2994                 if (h > 0 && t > h + max_file_usec)
2995                         return true;
2996         }
2997
2998         return false;
2999 }