chiark / gitweb /
tests: add a program for repetitive opening and closing of the journal
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #ifdef HAVE_XATTR
31 #include <attr/xattr.h>
32 #endif
33
34 #include "journal-def.h"
35 #include "journal-file.h"
36 #include "journal-authenticate.h"
37 #include "lookup3.h"
38 #include "compress.h"
39 #include "fsprg.h"
40
/* Size in bytes (2047 HashItem slots) used as the lower bound for the
 * data hash table when a new file's hash table is set up */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
/* Fixed size in bytes (333 HashItem slots) of the field hash table */
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Data payloads of at least this many bytes are handed to the
 * compressor when compression is enabled for the file */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL)           /* 4 MiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the file
 * system size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */

/* n_data was the first entry we added after the initial file format
 * design, hence every valid header is at least this large */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20
70
71 int journal_file_set_online(JournalFile *f) {
72         assert(f);
73
74         if (!f->writable)
75                 return -EPERM;
76
77         if (!(f->fd >= 0 && f->header))
78                 return -EINVAL;
79
80         switch(f->header->state) {
81                 case STATE_ONLINE:
82                         return 0;
83
84                 case STATE_OFFLINE:
85                         f->header->state = STATE_ONLINE;
86                         fsync(f->fd);
87                         return 0;
88
89                 default:
90                         return -EINVAL;
91         }
92 }
93
94 int journal_file_set_offline(JournalFile *f) {
95         assert(f);
96
97         if (!f->writable)
98                 return -EPERM;
99
100         if (!(f->fd >= 0 && f->header))
101                 return -EINVAL;
102
103         if (f->header->state != STATE_ONLINE)
104                 return 0;
105
106         fsync(f->fd);
107
108         f->header->state = STATE_OFFLINE;
109
110         fsync(f->fd);
111
112         return 0;
113 }
114
115 void journal_file_close(JournalFile *f) {
116         assert(f);
117
118 #ifdef HAVE_GCRYPT
119         /* Write the final tag */
120         if (f->seal && f->writable)
121                 journal_file_append_tag(f);
122 #endif
123
124         /* Sync everything to disk, before we mark the file offline */
125         if (f->mmap && f->fd >= 0)
126                 mmap_cache_close_fd(f->mmap, f->fd);
127
128         journal_file_set_offline(f);
129
130         if (f->header)
131                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
132
133         if (f->fd >= 0)
134                 close_nointr_nofail(f->fd);
135
136         free(f->path);
137
138         if (f->mmap)
139                 mmap_cache_unref(f->mmap);
140
141         hashmap_free_free(f->chain_cache);
142
143 #ifdef HAVE_XZ
144         free(f->compress_buffer);
145 #endif
146
147 #ifdef HAVE_GCRYPT
148         if (f->fss_file)
149                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
150         else if (f->fsprg_state)
151                 free(f->fsprg_state);
152
153         free(f->fsprg_seed);
154
155         if (f->hmac)
156                 gcry_md_close(f->hmac);
157 #endif
158
159         free(f);
160 }
161
162 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
163         Header h;
164         ssize_t k;
165         int r;
166
167         assert(f);
168
169         zero(h);
170         memcpy(h.signature, HEADER_SIGNATURE, 8);
171         h.header_size = htole64(ALIGN64(sizeof(h)));
172
173         h.incompatible_flags =
174                 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
175
176         h.compatible_flags =
177                 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
178
179         r = sd_id128_randomize(&h.file_id);
180         if (r < 0)
181                 return r;
182
183         if (template) {
184                 h.seqnum_id = template->header->seqnum_id;
185                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
186         } else
187                 h.seqnum_id = h.file_id;
188
189         k = pwrite(f->fd, &h, sizeof(h), 0);
190         if (k < 0)
191                 return -errno;
192
193         if (k != sizeof(h))
194                 return -EIO;
195
196         return 0;
197 }
198
199 static int journal_file_refresh_header(JournalFile *f) {
200         int r;
201         sd_id128_t boot_id;
202
203         assert(f);
204
205         r = sd_id128_get_machine(&f->header->machine_id);
206         if (r < 0)
207                 return r;
208
209         r = sd_id128_get_boot(&boot_id);
210         if (r < 0)
211                 return r;
212
213         if (sd_id128_equal(boot_id, f->header->boot_id))
214                 f->tail_entry_monotonic_valid = true;
215
216         f->header->boot_id = boot_id;
217
218         journal_file_set_online(f);
219
220         /* Sync the online state to disk */
221         msync(f->header, PAGE_ALIGN(sizeof(Header)), MS_SYNC);
222         fdatasync(f->fd);
223
224         return 0;
225 }
226
227 static int journal_file_verify_header(JournalFile *f) {
228         assert(f);
229
230         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
231                 return -EBADMSG;
232
233         /* In both read and write mode we refuse to open files with
234          * incompatible flags we don't know */
235 #ifdef HAVE_XZ
236         if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
237                 return -EPROTONOSUPPORT;
238 #else
239         if (f->header->incompatible_flags != 0)
240                 return -EPROTONOSUPPORT;
241 #endif
242
243         /* When open for writing we refuse to open files with
244          * compatible flags, too */
245         if (f->writable) {
246 #ifdef HAVE_GCRYPT
247                 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
248                         return -EPROTONOSUPPORT;
249 #else
250                 if (f->header->compatible_flags != 0)
251                         return -EPROTONOSUPPORT;
252 #endif
253         }
254
255         if (f->header->state >= _STATE_MAX)
256                 return -EBADMSG;
257
258         /* The first addition was n_data, so check that we are at least this large */
259         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
260                 return -EBADMSG;
261
262         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
263                 return -EBADMSG;
264
265         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
266                 return -ENODATA;
267
268         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
269                 return -ENODATA;
270
271         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
272             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
273             !VALID64(le64toh(f->header->tail_object_offset)) ||
274             !VALID64(le64toh(f->header->entry_array_offset)))
275                 return -ENODATA;
276
277         if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
278             le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
279             le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
280             le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
281                 return -ENODATA;
282
283         if (f->writable) {
284                 uint8_t state;
285                 sd_id128_t machine_id;
286                 int r;
287
288                 r = sd_id128_get_machine(&machine_id);
289                 if (r < 0)
290                         return r;
291
292                 if (!sd_id128_equal(machine_id, f->header->machine_id))
293                         return -EHOSTDOWN;
294
295                 state = f->header->state;
296
297                 if (state == STATE_ONLINE) {
298                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
299                         return -EBUSY;
300                 } else if (state == STATE_ARCHIVED)
301                         return -ESHUTDOWN;
302                 else if (state != STATE_OFFLINE) {
303                         log_debug("Journal file %s has unknown state %u.", f->path, state);
304                         return -EBUSY;
305                 }
306         }
307
308         f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
309
310         f->seal = JOURNAL_HEADER_SEALED(f->header);
311
312         return 0;
313 }
314
315 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
316         uint64_t old_size, new_size;
317         int r;
318
319         assert(f);
320
321         /* We assume that this file is not sparse, and we know that
322          * for sure, since we always call posix_fallocate()
323          * ourselves */
324
325         old_size =
326                 le64toh(f->header->header_size) +
327                 le64toh(f->header->arena_size);
328
329         new_size = PAGE_ALIGN(offset + size);
330         if (new_size < le64toh(f->header->header_size))
331                 new_size = le64toh(f->header->header_size);
332
333         if (new_size <= old_size)
334                 return 0;
335
336         if (f->metrics.max_size > 0 &&
337             new_size > f->metrics.max_size)
338                 return -E2BIG;
339
340         if (new_size > f->metrics.min_size &&
341             f->metrics.keep_free > 0) {
342                 struct statvfs svfs;
343
344                 if (fstatvfs(f->fd, &svfs) >= 0) {
345                         uint64_t available;
346
347                         available = svfs.f_bfree * svfs.f_bsize;
348
349                         if (available >= f->metrics.keep_free)
350                                 available -= f->metrics.keep_free;
351                         else
352                                 available = 0;
353
354                         if (new_size - old_size > available)
355                                 return -E2BIG;
356                 }
357         }
358
359         /* Note that the glibc fallocate() fallback is very
360            inefficient, hence we try to minimize the allocation area
361            as we can. */
362         r = posix_fallocate(f->fd, old_size, new_size - old_size);
363         if (r != 0)
364                 return -r;
365
366         if (fstat(f->fd, &f->last_stat) < 0)
367                 return -errno;
368
369         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
370
371         return 0;
372 }
373
374 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
375         assert(f);
376         assert(ret);
377
378         if (size <= 0)
379                 return -EINVAL;
380
381         /* Avoid SIGBUS on invalid accesses */
382         if (offset + size > (uint64_t) f->last_stat.st_size) {
383                 /* Hmm, out of range? Let's refresh the fstat() data
384                  * first, before we trust that check. */
385
386                 if (fstat(f->fd, &f->last_stat) < 0 ||
387                     offset + size > (uint64_t) f->last_stat.st_size)
388                         return -EADDRNOTAVAIL;
389         }
390
391         return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
392 }
393
394 static uint64_t minimum_header_size(Object *o) {
395
396         static uint64_t table[] = {
397                 [OBJECT_DATA] = sizeof(DataObject),
398                 [OBJECT_FIELD] = sizeof(FieldObject),
399                 [OBJECT_ENTRY] = sizeof(EntryObject),
400                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
401                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
402                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
403                 [OBJECT_TAG] = sizeof(TagObject),
404         };
405
406         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
407                 return sizeof(ObjectHeader);
408
409         return table[o->object.type];
410 }
411
412 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
413         int r;
414         void *t;
415         Object *o;
416         uint64_t s;
417         unsigned context;
418
419         assert(f);
420         assert(ret);
421
422         /* Objects may only be located at multiple of 64 bit */
423         if (!VALID64(offset))
424                 return -EFAULT;
425
426         /* One context for each type, plus one catch-all for the rest */
427         context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
428
429         r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
430         if (r < 0)
431                 return r;
432
433         o = (Object*) t;
434         s = le64toh(o->object.size);
435
436         if (s < sizeof(ObjectHeader))
437                 return -EBADMSG;
438
439         if (o->object.type <= OBJECT_UNUSED)
440                 return -EBADMSG;
441
442         if (s < minimum_header_size(o))
443                 return -EBADMSG;
444
445         if (type > 0 && o->object.type != type)
446                 return -EBADMSG;
447
448         if (s > sizeof(ObjectHeader)) {
449                 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
450                 if (r < 0)
451                         return r;
452
453                 o = (Object*) t;
454         }
455
456         *ret = o;
457         return 0;
458 }
459
460 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
461         uint64_t r;
462
463         assert(f);
464
465         r = le64toh(f->header->tail_entry_seqnum) + 1;
466
467         if (seqnum) {
468                 /* If an external seqnum counter was passed, we update
469                  * both the local and the external one, and set it to
470                  * the maximum of both */
471
472                 if (*seqnum + 1 > r)
473                         r = *seqnum + 1;
474
475                 *seqnum = r;
476         }
477
478         f->header->tail_entry_seqnum = htole64(r);
479
480         if (f->header->head_entry_seqnum == 0)
481                 f->header->head_entry_seqnum = htole64(r);
482
483         return r;
484 }
485
486 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
487         int r;
488         uint64_t p;
489         Object *tail, *o;
490         void *t;
491
492         assert(f);
493         assert(type > 0 && type < _OBJECT_TYPE_MAX);
494         assert(size >= sizeof(ObjectHeader));
495         assert(offset);
496         assert(ret);
497
498         r = journal_file_set_online(f);
499         if (r < 0)
500                 return r;
501
502         p = le64toh(f->header->tail_object_offset);
503         if (p == 0)
504                 p = le64toh(f->header->header_size);
505         else {
506                 r = journal_file_move_to_object(f, -1, p, &tail);
507                 if (r < 0)
508                         return r;
509
510                 p += ALIGN64(le64toh(tail->object.size));
511         }
512
513         r = journal_file_allocate(f, p, size);
514         if (r < 0)
515                 return r;
516
517         r = journal_file_move_to(f, type, false, p, size, &t);
518         if (r < 0)
519                 return r;
520
521         o = (Object*) t;
522
523         zero(o->object);
524         o->object.type = type;
525         o->object.size = htole64(size);
526
527         f->header->tail_object_offset = htole64(p);
528         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
529
530         *ret = o;
531         *offset = p;
532
533         return 0;
534 }
535
536 static int journal_file_setup_data_hash_table(JournalFile *f) {
537         uint64_t s, p;
538         Object *o;
539         int r;
540
541         assert(f);
542
543         /* We estimate that we need 1 hash table entry per 768 of
544            journal file and we want to make sure we never get beyond
545            75% fill level. Calculate the hash table size for the
546            maximum file size based on these metrics. */
547
548         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
549         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
550                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
551
552         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
553
554         r = journal_file_append_object(f,
555                                        OBJECT_DATA_HASH_TABLE,
556                                        offsetof(Object, hash_table.items) + s,
557                                        &o, &p);
558         if (r < 0)
559                 return r;
560
561         memset(o->hash_table.items, 0, s);
562
563         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
564         f->header->data_hash_table_size = htole64(s);
565
566         return 0;
567 }
568
569 static int journal_file_setup_field_hash_table(JournalFile *f) {
570         uint64_t s, p;
571         Object *o;
572         int r;
573
574         assert(f);
575
576         /* We use a fixed size hash table for the fields as this
577          * number should grow very slowly only */
578
579         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
580         r = journal_file_append_object(f,
581                                        OBJECT_FIELD_HASH_TABLE,
582                                        offsetof(Object, hash_table.items) + s,
583                                        &o, &p);
584         if (r < 0)
585                 return r;
586
587         memset(o->hash_table.items, 0, s);
588
589         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
590         f->header->field_hash_table_size = htole64(s);
591
592         return 0;
593 }
594
595 static int journal_file_map_data_hash_table(JournalFile *f) {
596         uint64_t s, p;
597         void *t;
598         int r;
599
600         assert(f);
601
602         p = le64toh(f->header->data_hash_table_offset);
603         s = le64toh(f->header->data_hash_table_size);
604
605         r = journal_file_move_to(f,
606                                  OBJECT_DATA_HASH_TABLE,
607                                  true,
608                                  p, s,
609                                  &t);
610         if (r < 0)
611                 return r;
612
613         f->data_hash_table = t;
614         return 0;
615 }
616
617 static int journal_file_map_field_hash_table(JournalFile *f) {
618         uint64_t s, p;
619         void *t;
620         int r;
621
622         assert(f);
623
624         p = le64toh(f->header->field_hash_table_offset);
625         s = le64toh(f->header->field_hash_table_size);
626
627         r = journal_file_move_to(f,
628                                  OBJECT_FIELD_HASH_TABLE,
629                                  true,
630                                  p, s,
631                                  &t);
632         if (r < 0)
633                 return r;
634
635         f->field_hash_table = t;
636         return 0;
637 }
638
639 static int journal_file_link_field(
640                 JournalFile *f,
641                 Object *o,
642                 uint64_t offset,
643                 uint64_t hash) {
644
645         uint64_t p, h;
646         int r;
647
648         assert(f);
649         assert(o);
650         assert(offset > 0);
651
652         if (o->object.type != OBJECT_FIELD)
653                 return -EINVAL;
654
655         /* This might alter the window we are looking at */
656
657         o->field.next_hash_offset = o->field.head_data_offset = 0;
658
659         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
660         p = le64toh(f->field_hash_table[h].tail_hash_offset);
661         if (p == 0)
662                 f->field_hash_table[h].head_hash_offset = htole64(offset);
663         else {
664                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
665                 if (r < 0)
666                         return r;
667
668                 o->field.next_hash_offset = htole64(offset);
669         }
670
671         f->field_hash_table[h].tail_hash_offset = htole64(offset);
672
673         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
674                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
675
676         return 0;
677 }
678
679 static int journal_file_link_data(
680                 JournalFile *f,
681                 Object *o,
682                 uint64_t offset,
683                 uint64_t hash) {
684
685         uint64_t p, h;
686         int r;
687
688         assert(f);
689         assert(o);
690         assert(offset > 0);
691
692         if (o->object.type != OBJECT_DATA)
693                 return -EINVAL;
694
695         /* This might alter the window we are looking at */
696
697         o->data.next_hash_offset = o->data.next_field_offset = 0;
698         o->data.entry_offset = o->data.entry_array_offset = 0;
699         o->data.n_entries = 0;
700
701         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
702         p = le64toh(f->data_hash_table[h].tail_hash_offset);
703         if (p == 0)
704                 /* Only entry in the hash table is easy */
705                 f->data_hash_table[h].head_hash_offset = htole64(offset);
706         else {
707                 /* Move back to the previous data object, to patch in
708                  * pointer */
709
710                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
711                 if (r < 0)
712                         return r;
713
714                 o->data.next_hash_offset = htole64(offset);
715         }
716
717         f->data_hash_table[h].tail_hash_offset = htole64(offset);
718
719         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
720                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
721
722         return 0;
723 }
724
725 int journal_file_find_field_object_with_hash(
726                 JournalFile *f,
727                 const void *field, uint64_t size, uint64_t hash,
728                 Object **ret, uint64_t *offset) {
729
730         uint64_t p, osize, h;
731         int r;
732
733         assert(f);
734         assert(field && size > 0);
735
736         osize = offsetof(Object, field.payload) + size;
737
738         if (f->header->field_hash_table_size == 0)
739                 return -EBADMSG;
740
741         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
742         p = le64toh(f->field_hash_table[h].head_hash_offset);
743
744         while (p > 0) {
745                 Object *o;
746
747                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
748                 if (r < 0)
749                         return r;
750
751                 if (le64toh(o->field.hash) == hash &&
752                     le64toh(o->object.size) == osize &&
753                     memcmp(o->field.payload, field, size) == 0) {
754
755                         if (ret)
756                                 *ret = o;
757                         if (offset)
758                                 *offset = p;
759
760                         return 1;
761                 }
762
763                 p = le64toh(o->field.next_hash_offset);
764         }
765
766         return 0;
767 }
768
769 int journal_file_find_field_object(
770                 JournalFile *f,
771                 const void *field, uint64_t size,
772                 Object **ret, uint64_t *offset) {
773
774         uint64_t hash;
775
776         assert(f);
777         assert(field && size > 0);
778
779         hash = hash64(field, size);
780
781         return journal_file_find_field_object_with_hash(f,
782                                                         field, size, hash,
783                                                         ret, offset);
784 }
785
786 int journal_file_find_data_object_with_hash(
787                 JournalFile *f,
788                 const void *data, uint64_t size, uint64_t hash,
789                 Object **ret, uint64_t *offset) {
790
791         uint64_t p, osize, h;
792         int r;
793
794         assert(f);
795         assert(data || size == 0);
796
797         osize = offsetof(Object, data.payload) + size;
798
799         if (f->header->data_hash_table_size == 0)
800                 return -EBADMSG;
801
802         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
803         p = le64toh(f->data_hash_table[h].head_hash_offset);
804
805         while (p > 0) {
806                 Object *o;
807
808                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
809                 if (r < 0)
810                         return r;
811
812                 if (le64toh(o->data.hash) != hash)
813                         goto next;
814
815                 if (o->object.flags & OBJECT_COMPRESSED) {
816 #ifdef HAVE_XZ
817                         uint64_t l, rsize;
818
819                         l = le64toh(o->object.size);
820                         if (l <= offsetof(Object, data.payload))
821                                 return -EBADMSG;
822
823                         l -= offsetof(Object, data.payload);
824
825                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0))
826                                 return -EBADMSG;
827
828                         if (rsize == size &&
829                             memcmp(f->compress_buffer, data, size) == 0) {
830
831                                 if (ret)
832                                         *ret = o;
833
834                                 if (offset)
835                                         *offset = p;
836
837                                 return 1;
838                         }
839 #else
840                         return -EPROTONOSUPPORT;
841 #endif
842
843                 } else if (le64toh(o->object.size) == osize &&
844                            memcmp(o->data.payload, data, size) == 0) {
845
846                         if (ret)
847                                 *ret = o;
848
849                         if (offset)
850                                 *offset = p;
851
852                         return 1;
853                 }
854
855         next:
856                 p = le64toh(o->data.next_hash_offset);
857         }
858
859         return 0;
860 }
861
862 int journal_file_find_data_object(
863                 JournalFile *f,
864                 const void *data, uint64_t size,
865                 Object **ret, uint64_t *offset) {
866
867         uint64_t hash;
868
869         assert(f);
870         assert(data || size == 0);
871
872         hash = hash64(data, size);
873
874         return journal_file_find_data_object_with_hash(f,
875                                                        data, size, hash,
876                                                        ret, offset);
877 }
878
879 static int journal_file_append_field(
880                 JournalFile *f,
881                 const void *field, uint64_t size,
882                 Object **ret, uint64_t *offset) {
883
884         uint64_t hash, p;
885         uint64_t osize;
886         Object *o;
887         int r;
888
889         assert(f);
890         assert(field && size > 0);
891
892         hash = hash64(field, size);
893
894         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
895         if (r < 0)
896                 return r;
897         else if (r > 0) {
898
899                 if (ret)
900                         *ret = o;
901
902                 if (offset)
903                         *offset = p;
904
905                 return 0;
906         }
907
908         osize = offsetof(Object, field.payload) + size;
909         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
910
911         o->field.hash = htole64(hash);
912         memcpy(o->field.payload, field, size);
913
914         r = journal_file_link_field(f, o, p, hash);
915         if (r < 0)
916                 return r;
917
918         /* The linking might have altered the window, so let's
919          * refresh our pointer */
920         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
921         if (r < 0)
922                 return r;
923
924 #ifdef HAVE_GCRYPT
925         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
926         if (r < 0)
927                 return r;
928 #endif
929
930         if (ret)
931                 *ret = o;
932
933         if (offset)
934                 *offset = p;
935
936         return 0;
937 }
938
939 static int journal_file_append_data(
940                 JournalFile *f,
941                 const void *data, uint64_t size,
942                 Object **ret, uint64_t *offset) {
943
944         uint64_t hash, p;
945         uint64_t osize;
946         Object *o;
947         int r;
948         bool compressed = false;
949         const void *eq;
950
951         assert(f);
952         assert(data || size == 0);
953
954         hash = hash64(data, size);
955
956         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
957         if (r < 0)
958                 return r;
959         else if (r > 0) {
960
961                 if (ret)
962                         *ret = o;
963
964                 if (offset)
965                         *offset = p;
966
967                 return 0;
968         }
969
970         osize = offsetof(Object, data.payload) + size;
971         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
972         if (r < 0)
973                 return r;
974
975         o->data.hash = htole64(hash);
976
977 #ifdef HAVE_XZ
978         if (f->compress &&
979             size >= COMPRESSION_SIZE_THRESHOLD) {
980                 uint64_t rsize;
981
982                 compressed = compress_blob(data, size, o->data.payload, &rsize);
983
984                 if (compressed) {
985                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
986                         o->object.flags |= OBJECT_COMPRESSED;
987
988                         log_debug("Compressed data object %"PRIu64" -> %"PRIu64, size, rsize);
989                 }
990         }
991 #endif
992
993         if (!compressed && size > 0)
994                 memcpy(o->data.payload, data, size);
995
996         r = journal_file_link_data(f, o, p, hash);
997         if (r < 0)
998                 return r;
999
1000         /* The linking might have altered the window, so let's
1001          * refresh our pointer */
1002         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1003         if (r < 0)
1004                 return r;
1005
1006         eq = memchr(data, '=', size);
1007         if (eq && eq > data) {
1008                 uint64_t fp;
1009                 Object *fo;
1010
1011                 /* Create field object ... */
1012                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1013                 if (r < 0)
1014                         return r;
1015
1016                 /* ... and link it in. */
1017                 o->data.next_field_offset = fo->field.head_data_offset;
1018                 fo->field.head_data_offset = le64toh(p);
1019         }
1020
1021 #ifdef HAVE_GCRYPT
1022         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1023         if (r < 0)
1024                 return r;
1025 #endif
1026
1027         if (ret)
1028                 *ret = o;
1029
1030         if (offset)
1031                 *offset = p;
1032
1033         return 0;
1034 }
1035
1036 uint64_t journal_file_entry_n_items(Object *o) {
1037         assert(o);
1038
1039         if (o->object.type != OBJECT_ENTRY)
1040                 return 0;
1041
1042         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1043 }
1044
1045 uint64_t journal_file_entry_array_n_items(Object *o) {
1046         assert(o);
1047
1048         if (o->object.type != OBJECT_ENTRY_ARRAY)
1049                 return 0;
1050
1051         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1052 }
1053
1054 uint64_t journal_file_hash_table_n_items(Object *o) {
1055         assert(o);
1056
1057         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1058             o->object.type != OBJECT_FIELD_HASH_TABLE)
1059                 return 0;
1060
1061         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1062 }
1063
1064 static int link_entry_into_array(JournalFile *f,
1065                                  le64_t *first,
1066                                  le64_t *idx,
1067                                  uint64_t p) {
1068         int r;
1069         uint64_t n = 0, ap = 0, q, i, a, hidx;
1070         Object *o;
1071
1072         assert(f);
1073         assert(first);
1074         assert(idx);
1075         assert(p > 0);
1076
1077         a = le64toh(*first);
1078         i = hidx = le64toh(*idx);
1079         while (a > 0) {
1080
1081                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1082                 if (r < 0)
1083                         return r;
1084
1085                 n = journal_file_entry_array_n_items(o);
1086                 if (i < n) {
1087                         o->entry_array.items[i] = htole64(p);
1088                         *idx = htole64(hidx + 1);
1089                         return 0;
1090                 }
1091
1092                 i -= n;
1093                 ap = a;
1094                 a = le64toh(o->entry_array.next_entry_array_offset);
1095         }
1096
1097         if (hidx > n)
1098                 n = (hidx+1) * 2;
1099         else
1100                 n = n * 2;
1101
1102         if (n < 4)
1103                 n = 4;
1104
1105         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1106                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1107                                        &o, &q);
1108         if (r < 0)
1109                 return r;
1110
1111 #ifdef HAVE_GCRYPT
1112         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1113         if (r < 0)
1114                 return r;
1115 #endif
1116
1117         o->entry_array.items[i] = htole64(p);
1118
1119         if (ap == 0)
1120                 *first = htole64(q);
1121         else {
1122                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1123                 if (r < 0)
1124                         return r;
1125
1126                 o->entry_array.next_entry_array_offset = htole64(q);
1127         }
1128
1129         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1130                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1131
1132         *idx = htole64(hidx + 1);
1133
1134         return 0;
1135 }
1136
1137 static int link_entry_into_array_plus_one(JournalFile *f,
1138                                           le64_t *extra,
1139                                           le64_t *first,
1140                                           le64_t *idx,
1141                                           uint64_t p) {
1142
1143         int r;
1144
1145         assert(f);
1146         assert(extra);
1147         assert(first);
1148         assert(idx);
1149         assert(p > 0);
1150
1151         if (*idx == 0)
1152                 *extra = htole64(p);
1153         else {
1154                 le64_t i;
1155
1156                 i = htole64(le64toh(*idx) - 1);
1157                 r = link_entry_into_array(f, first, &i, p);
1158                 if (r < 0)
1159                         return r;
1160         }
1161
1162         *idx = htole64(le64toh(*idx) + 1);
1163         return 0;
1164 }
1165
1166 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1167         uint64_t p;
1168         int r;
1169         assert(f);
1170         assert(o);
1171         assert(offset > 0);
1172
1173         p = le64toh(o->entry.items[i].object_offset);
1174         if (p == 0)
1175                 return -EINVAL;
1176
1177         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1178         if (r < 0)
1179                 return r;
1180
1181         return link_entry_into_array_plus_one(f,
1182                                               &o->data.entry_offset,
1183                                               &o->data.entry_array_offset,
1184                                               &o->data.n_entries,
1185                                               offset);
1186 }
1187
1188 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1189         uint64_t n, i;
1190         int r;
1191
1192         assert(f);
1193         assert(o);
1194         assert(offset > 0);
1195
1196         if (o->object.type != OBJECT_ENTRY)
1197                 return -EINVAL;
1198
1199         __sync_synchronize();
1200
1201         /* Link up the entry itself */
1202         r = link_entry_into_array(f,
1203                                   &f->header->entry_array_offset,
1204                                   &f->header->n_entries,
1205                                   offset);
1206         if (r < 0)
1207                 return r;
1208
1209         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1210
1211         if (f->header->head_entry_realtime == 0)
1212                 f->header->head_entry_realtime = o->entry.realtime;
1213
1214         f->header->tail_entry_realtime = o->entry.realtime;
1215         f->header->tail_entry_monotonic = o->entry.monotonic;
1216
1217         f->tail_entry_monotonic_valid = true;
1218
1219         /* Link up the items */
1220         n = journal_file_entry_n_items(o);
1221         for (i = 0; i < n; i++) {
1222                 r = journal_file_link_entry_item(f, o, offset, i);
1223                 if (r < 0)
1224                         return r;
1225         }
1226
1227         return 0;
1228 }
1229
1230 static int journal_file_append_entry_internal(
1231                 JournalFile *f,
1232                 const dual_timestamp *ts,
1233                 uint64_t xor_hash,
1234                 const EntryItem items[], unsigned n_items,
1235                 uint64_t *seqnum,
1236                 Object **ret, uint64_t *offset) {
1237         uint64_t np;
1238         uint64_t osize;
1239         Object *o;
1240         int r;
1241
1242         assert(f);
1243         assert(items || n_items == 0);
1244         assert(ts);
1245
1246         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1247
1248         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1249         if (r < 0)
1250                 return r;
1251
1252         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1253         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1254         o->entry.realtime = htole64(ts->realtime);
1255         o->entry.monotonic = htole64(ts->monotonic);
1256         o->entry.xor_hash = htole64(xor_hash);
1257         o->entry.boot_id = f->header->boot_id;
1258
1259 #ifdef HAVE_GCRYPT
1260         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1261         if (r < 0)
1262                 return r;
1263 #endif
1264
1265         r = journal_file_link_entry(f, o, np);
1266         if (r < 0)
1267                 return r;
1268
1269         if (ret)
1270                 *ret = o;
1271
1272         if (offset)
1273                 *offset = np;
1274
1275         return 0;
1276 }
1277
1278 void journal_file_post_change(JournalFile *f) {
1279         assert(f);
1280
1281         /* inotify() does not receive IN_MODIFY events from file
1282          * accesses done via mmap(). After each access we hence
1283          * trigger IN_MODIFY by truncating the journal file to its
1284          * current size which triggers IN_MODIFY. */
1285
1286         __sync_synchronize();
1287
1288         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1289                 log_error("Failed to truncate file to its own size: %m");
1290 }
1291
1292 static int entry_item_cmp(const void *_a, const void *_b) {
1293         const EntryItem *a = _a, *b = _b;
1294
1295         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1296                 return -1;
1297         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1298                 return 1;
1299         return 0;
1300 }
1301
1302 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1303         unsigned i;
1304         EntryItem *items;
1305         int r;
1306         uint64_t xor_hash = 0;
1307         struct dual_timestamp _ts;
1308
1309         assert(f);
1310         assert(iovec || n_iovec == 0);
1311
1312         if (!ts) {
1313                 dual_timestamp_get(&_ts);
1314                 ts = &_ts;
1315         }
1316
1317         if (f->tail_entry_monotonic_valid &&
1318             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1319                 return -EINVAL;
1320
1321 #ifdef HAVE_GCRYPT
1322         r = journal_file_maybe_append_tag(f, ts->realtime);
1323         if (r < 0)
1324                 return r;
1325 #endif
1326
1327         /* alloca() can't take 0, hence let's allocate at least one */
1328         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1329
1330         for (i = 0; i < n_iovec; i++) {
1331                 uint64_t p;
1332                 Object *o;
1333
1334                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1335                 if (r < 0)
1336                         return r;
1337
1338                 xor_hash ^= le64toh(o->data.hash);
1339                 items[i].object_offset = htole64(p);
1340                 items[i].hash = o->data.hash;
1341         }
1342
1343         /* Order by the position on disk, in order to improve seek
1344          * times for rotating media. */
1345         qsort(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1346
1347         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1348
1349         journal_file_post_change(f);
1350
1351         return r;
1352 }
1353
/* Remembers how far a chain of entry arrays has been walked, so that
 * a later lookup on the same chain may continue from there. */
typedef struct ChainCacheItem {
        /* the array at the begin of the chain */
        uint64_t first;

        /* the cached array */
        uint64_t array;

        /* the first item in the cached array */
        uint64_t begin;

        /* the total number of items in all arrays before this one in the chain */
        uint64_t total;
} ChainCacheItem;
1360
1361 static void chain_cache_put(
1362                 Hashmap *h,
1363                 ChainCacheItem *ci,
1364                 uint64_t first,
1365                 uint64_t array,
1366                 uint64_t begin,
1367                 uint64_t total) {
1368
1369         if (!ci) {
1370                 /* If the chain item to cache for this chain is the
1371                  * first one it's not worth caching anything */
1372                 if (array == first)
1373                         return;
1374
1375                 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1376                         ci = hashmap_steal_first(h);
1377                 else {
1378                         ci = new(ChainCacheItem, 1);
1379                         if (!ci)
1380                                 return;
1381                 }
1382
1383                 ci->first = first;
1384
1385                 if (hashmap_put(h, &ci->first, ci) < 0) {
1386                         free(ci);
1387                         return;
1388                 }
1389         } else
1390                 assert(ci->first == first);
1391
1392         ci->array = array;
1393         ci->begin = begin;
1394         ci->total = total;
1395 }
1396
1397 static int generic_array_get(JournalFile *f,
1398                              uint64_t first,
1399                              uint64_t i,
1400                              Object **ret, uint64_t *offset) {
1401
1402         Object *o;
1403         uint64_t p = 0, a, t = 0;
1404         int r;
1405         ChainCacheItem *ci;
1406
1407         assert(f);
1408
1409         a = first;
1410
1411         /* Try the chain cache first */
1412         ci = hashmap_get(f->chain_cache, &first);
1413         if (ci && i > ci->total) {
1414                 a = ci->array;
1415                 i -= ci->total;
1416                 t = ci->total;
1417         }
1418
1419         while (a > 0) {
1420                 uint64_t k;
1421
1422                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1423                 if (r < 0)
1424                         return r;
1425
1426                 k = journal_file_entry_array_n_items(o);
1427                 if (i < k) {
1428                         p = le64toh(o->entry_array.items[i]);
1429                         goto found;
1430                 }
1431
1432                 i -= k;
1433                 t += k;
1434                 a = le64toh(o->entry_array.next_entry_array_offset);
1435         }
1436
1437         return 0;
1438
1439 found:
1440         /* Let's cache this item for the next invocation */
1441         chain_cache_put(f->chain_cache, ci, first, a, o->entry_array.items[0], t);
1442
1443         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1444         if (r < 0)
1445                 return r;
1446
1447         if (ret)
1448                 *ret = o;
1449
1450         if (offset)
1451                 *offset = p;
1452
1453         return 1;
1454 }
1455
1456 static int generic_array_get_plus_one(JournalFile *f,
1457                                       uint64_t extra,
1458                                       uint64_t first,
1459                                       uint64_t i,
1460                                       Object **ret, uint64_t *offset) {
1461
1462         Object *o;
1463
1464         assert(f);
1465
1466         if (i == 0) {
1467                 int r;
1468
1469                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1470                 if (r < 0)
1471                         return r;
1472
1473                 if (ret)
1474                         *ret = o;
1475
1476                 if (offset)
1477                         *offset = extra;
1478
1479                 return 1;
1480         }
1481
1482         return generic_array_get(f, first, i-1, ret, offset);
1483 }
1484
/* Results of the test_object() callbacks used for bisection */
enum {
        /* the tested entry matches the needle */
        TEST_FOUND = 0,

        /* the tested entry lies left of (i.e. before) the needle */
        TEST_LEFT = 1,

        /* the tested entry lies right of (i.e. after) the needle */
        TEST_RIGHT = 2
};
1490
1491 static int generic_array_bisect(JournalFile *f,
1492                                 uint64_t first,
1493                                 uint64_t n,
1494                                 uint64_t needle,
1495                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1496                                 direction_t direction,
1497                                 Object **ret,
1498                                 uint64_t *offset,
1499                                 uint64_t *idx) {
1500
1501         uint64_t a, p, t = 0, i = 0, last_p = 0;
1502         bool subtract_one = false;
1503         Object *o, *array = NULL;
1504         int r;
1505         ChainCacheItem *ci;
1506
1507         assert(f);
1508         assert(test_object);
1509
1510         /* Start with the first array in the chain */
1511         a = first;
1512
1513         ci = hashmap_get(f->chain_cache, &first);
1514         if (ci && n > ci->total) {
1515                 /* Ah, we have iterated this bisection array chain
1516                  * previously! Let's see if we can skip ahead in the
1517                  * chain, as far as the last time. But we can't jump
1518                  * backwards in the chain, so let's check that
1519                  * first. */
1520
1521                 r = test_object(f, ci->begin, needle);
1522                 if (r < 0)
1523                         return r;
1524
1525                 if (r == TEST_LEFT) {
1526                         /* OK, what we are looking for is right of th
1527                          * begin of this EntryArray, so let's jump
1528                          * straight to previously cached array in the
1529                          * chain */
1530
1531                         a = ci->array;
1532                         n -= ci->total;
1533                         t = ci->total;
1534                 }
1535         }
1536
1537         while (a > 0) {
1538                 uint64_t left, right, k, lp;
1539
1540                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1541                 if (r < 0)
1542                         return r;
1543
1544                 k = journal_file_entry_array_n_items(array);
1545                 right = MIN(k, n);
1546                 if (right <= 0)
1547                         return 0;
1548
1549                 i = right - 1;
1550                 lp = p = le64toh(array->entry_array.items[i]);
1551                 if (p <= 0)
1552                         return -EBADMSG;
1553
1554                 r = test_object(f, p, needle);
1555                 if (r < 0)
1556                         return r;
1557
1558                 if (r == TEST_FOUND)
1559                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1560
1561                 if (r == TEST_RIGHT) {
1562                         left = 0;
1563                         right -= 1;
1564                         for (;;) {
1565                                 if (left == right) {
1566                                         if (direction == DIRECTION_UP)
1567                                                 subtract_one = true;
1568
1569                                         i = left;
1570                                         goto found;
1571                                 }
1572
1573                                 assert(left < right);
1574
1575                                 i = (left + right) / 2;
1576                                 p = le64toh(array->entry_array.items[i]);
1577                                 if (p <= 0)
1578                                         return -EBADMSG;
1579
1580                                 r = test_object(f, p, needle);
1581                                 if (r < 0)
1582                                         return r;
1583
1584                                 if (r == TEST_FOUND)
1585                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1586
1587                                 if (r == TEST_RIGHT)
1588                                         right = i;
1589                                 else
1590                                         left = i + 1;
1591                         }
1592                 }
1593
1594                 if (k > n) {
1595                         if (direction == DIRECTION_UP) {
1596                                 i = n;
1597                                 subtract_one = true;
1598                                 goto found;
1599                         }
1600
1601                         return 0;
1602                 }
1603
1604                 last_p = lp;
1605
1606                 n -= k;
1607                 t += k;
1608                 a = le64toh(array->entry_array.next_entry_array_offset);
1609         }
1610
1611         return 0;
1612
1613 found:
1614         if (subtract_one && t == 0 && i == 0)
1615                 return 0;
1616
1617         /* Let's cache this item for the next invocation */
1618         chain_cache_put(f->chain_cache, ci, first, a, array->entry_array.items[0], t);
1619
1620         if (subtract_one && i == 0)
1621                 p = last_p;
1622         else if (subtract_one)
1623                 p = le64toh(array->entry_array.items[i-1]);
1624         else
1625                 p = le64toh(array->entry_array.items[i]);
1626
1627         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1628         if (r < 0)
1629                 return r;
1630
1631         if (ret)
1632                 *ret = o;
1633
1634         if (offset)
1635                 *offset = p;
1636
1637         if (idx)
1638                 *idx = t + i + (subtract_one ? -1 : 0);
1639
1640         return 1;
1641 }
1642
1643 static int generic_array_bisect_plus_one(JournalFile *f,
1644                                          uint64_t extra,
1645                                          uint64_t first,
1646                                          uint64_t n,
1647                                          uint64_t needle,
1648                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1649                                          direction_t direction,
1650                                          Object **ret,
1651                                          uint64_t *offset,
1652                                          uint64_t *idx) {
1653
1654         int r;
1655         bool step_back = false;
1656         Object *o;
1657
1658         assert(f);
1659         assert(test_object);
1660
1661         if (n <= 0)
1662                 return 0;
1663
1664         /* This bisects the array in object 'first', but first checks
1665          * an extra  */
1666         r = test_object(f, extra, needle);
1667         if (r < 0)
1668                 return r;
1669
1670         if (r == TEST_FOUND)
1671                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1672
1673         /* if we are looking with DIRECTION_UP then we need to first
1674            see if in the actual array there is a matching entry, and
1675            return the last one of that. But if there isn't any we need
1676            to return this one. Hence remember this, and return it
1677            below. */
1678         if (r == TEST_LEFT)
1679                 step_back = direction == DIRECTION_UP;
1680
1681         if (r == TEST_RIGHT) {
1682                 if (direction == DIRECTION_DOWN)
1683                         goto found;
1684                 else
1685                         return 0;
1686         }
1687
1688         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1689
1690         if (r == 0 && step_back)
1691                 goto found;
1692
1693         if (r > 0 && idx)
1694                 (*idx) ++;
1695
1696         return r;
1697
1698 found:
1699         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1700         if (r < 0)
1701                 return r;
1702
1703         if (ret)
1704                 *ret = o;
1705
1706         if (offset)
1707                 *offset = extra;
1708
1709         if (idx)
1710                 *idx = 0;
1711
1712         return 1;
1713 }
1714
1715 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1716         assert(f);
1717         assert(p > 0);
1718
1719         if (p == needle)
1720                 return TEST_FOUND;
1721         else if (p < needle)
1722                 return TEST_LEFT;
1723         else
1724                 return TEST_RIGHT;
1725 }
1726
1727 int journal_file_move_to_entry_by_offset(
1728                 JournalFile *f,
1729                 uint64_t p,
1730                 direction_t direction,
1731                 Object **ret,
1732                 uint64_t *offset) {
1733
1734         return generic_array_bisect(f,
1735                                     le64toh(f->header->entry_array_offset),
1736                                     le64toh(f->header->n_entries),
1737                                     p,
1738                                     test_object_offset,
1739                                     direction,
1740                                     ret, offset, NULL);
1741 }
1742
1743
1744 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1745         Object *o;
1746         int r;
1747
1748         assert(f);
1749         assert(p > 0);
1750
1751         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1752         if (r < 0)
1753                 return r;
1754
1755         if (le64toh(o->entry.seqnum) == needle)
1756                 return TEST_FOUND;
1757         else if (le64toh(o->entry.seqnum) < needle)
1758                 return TEST_LEFT;
1759         else
1760                 return TEST_RIGHT;
1761 }
1762
1763 int journal_file_move_to_entry_by_seqnum(
1764                 JournalFile *f,
1765                 uint64_t seqnum,
1766                 direction_t direction,
1767                 Object **ret,
1768                 uint64_t *offset) {
1769
1770         return generic_array_bisect(f,
1771                                     le64toh(f->header->entry_array_offset),
1772                                     le64toh(f->header->n_entries),
1773                                     seqnum,
1774                                     test_object_seqnum,
1775                                     direction,
1776                                     ret, offset, NULL);
1777 }
1778
1779 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1780         Object *o;
1781         int r;
1782
1783         assert(f);
1784         assert(p > 0);
1785
1786         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1787         if (r < 0)
1788                 return r;
1789
1790         if (le64toh(o->entry.realtime) == needle)
1791                 return TEST_FOUND;
1792         else if (le64toh(o->entry.realtime) < needle)
1793                 return TEST_LEFT;
1794         else
1795                 return TEST_RIGHT;
1796 }
1797
1798 int journal_file_move_to_entry_by_realtime(
1799                 JournalFile *f,
1800                 uint64_t realtime,
1801                 direction_t direction,
1802                 Object **ret,
1803                 uint64_t *offset) {
1804
1805         return generic_array_bisect(f,
1806                                     le64toh(f->header->entry_array_offset),
1807                                     le64toh(f->header->n_entries),
1808                                     realtime,
1809                                     test_object_realtime,
1810                                     direction,
1811                                     ret, offset, NULL);
1812 }
1813
1814 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1815         Object *o;
1816         int r;
1817
1818         assert(f);
1819         assert(p > 0);
1820
1821         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1822         if (r < 0)
1823                 return r;
1824
1825         if (le64toh(o->entry.monotonic) == needle)
1826                 return TEST_FOUND;
1827         else if (le64toh(o->entry.monotonic) < needle)
1828                 return TEST_LEFT;
1829         else
1830                 return TEST_RIGHT;
1831 }
1832
1833 static inline int find_data_object_by_boot_id(
1834                 JournalFile *f,
1835                 sd_id128_t boot_id,
1836                 Object **o,
1837                 uint64_t *b) {
1838         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1839
1840         sd_id128_to_string(boot_id, t + 9);
1841         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1842 }
1843
1844 int journal_file_move_to_entry_by_monotonic(
1845                 JournalFile *f,
1846                 sd_id128_t boot_id,
1847                 uint64_t monotonic,
1848                 direction_t direction,
1849                 Object **ret,
1850                 uint64_t *offset) {
1851
1852         Object *o;
1853         int r;
1854
1855         assert(f);
1856
1857         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1858         if (r < 0)
1859                 return r;
1860         if (r == 0)
1861                 return -ENOENT;
1862
1863         return generic_array_bisect_plus_one(f,
1864                                              le64toh(o->data.entry_offset),
1865                                              le64toh(o->data.entry_array_offset),
1866                                              le64toh(o->data.n_entries),
1867                                              monotonic,
1868                                              test_object_monotonic,
1869                                              direction,
1870                                              ret, offset, NULL);
1871 }
1872
1873 int journal_file_next_entry(
1874                 JournalFile *f,
1875                 Object *o, uint64_t p,
1876                 direction_t direction,
1877                 Object **ret, uint64_t *offset) {
1878
1879         uint64_t i, n;
1880         int r;
1881
1882         assert(f);
1883         assert(p > 0 || !o);
1884
1885         n = le64toh(f->header->n_entries);
1886         if (n <= 0)
1887                 return 0;
1888
1889         if (!o)
1890                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1891         else {
1892                 if (o->object.type != OBJECT_ENTRY)
1893                         return -EINVAL;
1894
1895                 r = generic_array_bisect(f,
1896                                          le64toh(f->header->entry_array_offset),
1897                                          le64toh(f->header->n_entries),
1898                                          p,
1899                                          test_object_offset,
1900                                          DIRECTION_DOWN,
1901                                          NULL, NULL,
1902                                          &i);
1903                 if (r <= 0)
1904                         return r;
1905
1906                 if (direction == DIRECTION_DOWN) {
1907                         if (i >= n - 1)
1908                                 return 0;
1909
1910                         i++;
1911                 } else {
1912                         if (i <= 0)
1913                                 return 0;
1914
1915                         i--;
1916                 }
1917         }
1918
1919         /* And jump to it */
1920         return generic_array_get(f,
1921                                  le64toh(f->header->entry_array_offset),
1922                                  i,
1923                                  ret, offset);
1924 }
1925
1926 int journal_file_skip_entry(
1927                 JournalFile *f,
1928                 Object *o, uint64_t p,
1929                 int64_t skip,
1930                 Object **ret, uint64_t *offset) {
1931
1932         uint64_t i, n;
1933         int r;
1934
1935         assert(f);
1936         assert(o);
1937         assert(p > 0);
1938
1939         if (o->object.type != OBJECT_ENTRY)
1940                 return -EINVAL;
1941
1942         r = generic_array_bisect(f,
1943                                  le64toh(f->header->entry_array_offset),
1944                                  le64toh(f->header->n_entries),
1945                                  p,
1946                                  test_object_offset,
1947                                  DIRECTION_DOWN,
1948                                  NULL, NULL,
1949                                  &i);
1950         if (r <= 0)
1951                 return r;
1952
1953         /* Calculate new index */
1954         if (skip < 0) {
1955                 if ((uint64_t) -skip >= i)
1956                         i = 0;
1957                 else
1958                         i = i - (uint64_t) -skip;
1959         } else
1960                 i  += (uint64_t) skip;
1961
1962         n = le64toh(f->header->n_entries);
1963         if (n <= 0)
1964                 return -EBADMSG;
1965
1966         if (i >= n)
1967                 i = n-1;
1968
1969         return generic_array_get(f,
1970                                  le64toh(f->header->entry_array_offset),
1971                                  i,
1972                                  ret, offset);
1973 }
1974
1975 int journal_file_next_entry_for_data(
1976                 JournalFile *f,
1977                 Object *o, uint64_t p,
1978                 uint64_t data_offset,
1979                 direction_t direction,
1980                 Object **ret, uint64_t *offset) {
1981
1982         uint64_t n, i;
1983         int r;
1984         Object *d;
1985
1986         assert(f);
1987         assert(p > 0 || !o);
1988
1989         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1990         if (r < 0)
1991                 return r;
1992
1993         n = le64toh(d->data.n_entries);
1994         if (n <= 0)
1995                 return n;
1996
1997         if (!o)
1998                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1999         else {
2000                 if (o->object.type != OBJECT_ENTRY)
2001                         return -EINVAL;
2002
2003                 r = generic_array_bisect_plus_one(f,
2004                                                   le64toh(d->data.entry_offset),
2005                                                   le64toh(d->data.entry_array_offset),
2006                                                   le64toh(d->data.n_entries),
2007                                                   p,
2008                                                   test_object_offset,
2009                                                   DIRECTION_DOWN,
2010                                                   NULL, NULL,
2011                                                   &i);
2012
2013                 if (r <= 0)
2014                         return r;
2015
2016                 if (direction == DIRECTION_DOWN) {
2017                         if (i >= n - 1)
2018                                 return 0;
2019
2020                         i++;
2021                 } else {
2022                         if (i <= 0)
2023                                 return 0;
2024
2025                         i--;
2026                 }
2027
2028         }
2029
2030         return generic_array_get_plus_one(f,
2031                                           le64toh(d->data.entry_offset),
2032                                           le64toh(d->data.entry_array_offset),
2033                                           i,
2034                                           ret, offset);
2035 }
2036
2037 int journal_file_move_to_entry_by_offset_for_data(
2038                 JournalFile *f,
2039                 uint64_t data_offset,
2040                 uint64_t p,
2041                 direction_t direction,
2042                 Object **ret, uint64_t *offset) {
2043
2044         int r;
2045         Object *d;
2046
2047         assert(f);
2048
2049         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2050         if (r < 0)
2051                 return r;
2052
2053         return generic_array_bisect_plus_one(f,
2054                                              le64toh(d->data.entry_offset),
2055                                              le64toh(d->data.entry_array_offset),
2056                                              le64toh(d->data.n_entries),
2057                                              p,
2058                                              test_object_offset,
2059                                              direction,
2060                                              ret, offset, NULL);
2061 }
2062
2063 int journal_file_move_to_entry_by_monotonic_for_data(
2064                 JournalFile *f,
2065                 uint64_t data_offset,
2066                 sd_id128_t boot_id,
2067                 uint64_t monotonic,
2068                 direction_t direction,
2069                 Object **ret, uint64_t *offset) {
2070
2071         Object *o, *d;
2072         int r;
2073         uint64_t b, z;
2074
2075         assert(f);
2076
2077         /* First, seek by time */
2078         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2079         if (r < 0)
2080                 return r;
2081         if (r == 0)
2082                 return -ENOENT;
2083
2084         r = generic_array_bisect_plus_one(f,
2085                                           le64toh(o->data.entry_offset),
2086                                           le64toh(o->data.entry_array_offset),
2087                                           le64toh(o->data.n_entries),
2088                                           monotonic,
2089                                           test_object_monotonic,
2090                                           direction,
2091                                           NULL, &z, NULL);
2092         if (r <= 0)
2093                 return r;
2094
2095         /* And now, continue seeking until we find an entry that
2096          * exists in both bisection arrays */
2097
2098         for (;;) {
2099                 Object *qo;
2100                 uint64_t p, q;
2101
2102                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2103                 if (r < 0)
2104                         return r;
2105
2106                 r = generic_array_bisect_plus_one(f,
2107                                                   le64toh(d->data.entry_offset),
2108                                                   le64toh(d->data.entry_array_offset),
2109                                                   le64toh(d->data.n_entries),
2110                                                   z,
2111                                                   test_object_offset,
2112                                                   direction,
2113                                                   NULL, &p, NULL);
2114                 if (r <= 0)
2115                         return r;
2116
2117                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2118                 if (r < 0)
2119                         return r;
2120
2121                 r = generic_array_bisect_plus_one(f,
2122                                                   le64toh(o->data.entry_offset),
2123                                                   le64toh(o->data.entry_array_offset),
2124                                                   le64toh(o->data.n_entries),
2125                                                   p,
2126                                                   test_object_offset,
2127                                                   direction,
2128                                                   &qo, &q, NULL);
2129
2130                 if (r <= 0)
2131                         return r;
2132
2133                 if (p == q) {
2134                         if (ret)
2135                                 *ret = qo;
2136                         if (offset)
2137                                 *offset = q;
2138
2139                         return 1;
2140                 }
2141
2142                 z = q;
2143         }
2144
2145         return 0;
2146 }
2147
2148 int journal_file_move_to_entry_by_seqnum_for_data(
2149                 JournalFile *f,
2150                 uint64_t data_offset,
2151                 uint64_t seqnum,
2152                 direction_t direction,
2153                 Object **ret, uint64_t *offset) {
2154
2155         Object *d;
2156         int r;
2157
2158         assert(f);
2159
2160         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2161         if (r < 0)
2162                 return r;
2163
2164         return generic_array_bisect_plus_one(f,
2165                                              le64toh(d->data.entry_offset),
2166                                              le64toh(d->data.entry_array_offset),
2167                                              le64toh(d->data.n_entries),
2168                                              seqnum,
2169                                              test_object_seqnum,
2170                                              direction,
2171                                              ret, offset, NULL);
2172 }
2173
2174 int journal_file_move_to_entry_by_realtime_for_data(
2175                 JournalFile *f,
2176                 uint64_t data_offset,
2177                 uint64_t realtime,
2178                 direction_t direction,
2179                 Object **ret, uint64_t *offset) {
2180
2181         Object *d;
2182         int r;
2183
2184         assert(f);
2185
2186         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2187         if (r < 0)
2188                 return r;
2189
2190         return generic_array_bisect_plus_one(f,
2191                                              le64toh(d->data.entry_offset),
2192                                              le64toh(d->data.entry_array_offset),
2193                                              le64toh(d->data.n_entries),
2194                                              realtime,
2195                                              test_object_realtime,
2196                                              direction,
2197                                              ret, offset, NULL);
2198 }
2199
2200 void journal_file_dump(JournalFile *f) {
2201         Object *o;
2202         int r;
2203         uint64_t p;
2204
2205         assert(f);
2206
2207         journal_file_print_header(f);
2208
2209         p = le64toh(f->header->header_size);
2210         while (p != 0) {
2211                 r = journal_file_move_to_object(f, -1, p, &o);
2212                 if (r < 0)
2213                         goto fail;
2214
2215                 switch (o->object.type) {
2216
2217                 case OBJECT_UNUSED:
2218                         printf("Type: OBJECT_UNUSED\n");
2219                         break;
2220
2221                 case OBJECT_DATA:
2222                         printf("Type: OBJECT_DATA\n");
2223                         break;
2224
2225                 case OBJECT_FIELD:
2226                         printf("Type: OBJECT_FIELD\n");
2227                         break;
2228
2229                 case OBJECT_ENTRY:
2230                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2231                                le64toh(o->entry.seqnum),
2232                                le64toh(o->entry.monotonic),
2233                                le64toh(o->entry.realtime));
2234                         break;
2235
2236                 case OBJECT_FIELD_HASH_TABLE:
2237                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2238                         break;
2239
2240                 case OBJECT_DATA_HASH_TABLE:
2241                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2242                         break;
2243
2244                 case OBJECT_ENTRY_ARRAY:
2245                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2246                         break;
2247
2248                 case OBJECT_TAG:
2249                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2250                                le64toh(o->tag.seqnum),
2251                                le64toh(o->tag.epoch));
2252                         break;
2253
2254                 default:
2255                         printf("Type: unknown (%u)\n", o->object.type);
2256                         break;
2257                 }
2258
2259                 if (o->object.flags & OBJECT_COMPRESSED)
2260                         printf("Flags: COMPRESSED\n");
2261
2262                 if (p == le64toh(f->header->tail_object_offset))
2263                         p = 0;
2264                 else
2265                         p = p + ALIGN64(le64toh(o->object.size));
2266         }
2267
2268         return;
2269 fail:
2270         log_error("File corrupt");
2271 }
2272
2273 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2274         const char *x;
2275
2276         x = format_timestamp(buf, l, t);
2277         if (x)
2278                 return x;
2279         return " --- ";
2280 }
2281
2282 void journal_file_print_header(JournalFile *f) {
2283         char a[33], b[33], c[33], d[33];
2284         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2285         struct stat st;
2286         char bytes[FORMAT_BYTES_MAX];
2287
2288         assert(f);
2289
2290         printf("File Path: %s\n"
2291                "File ID: %s\n"
2292                "Machine ID: %s\n"
2293                "Boot ID: %s\n"
2294                "Sequential Number ID: %s\n"
2295                "State: %s\n"
2296                "Compatible Flags:%s%s\n"
2297                "Incompatible Flags:%s%s\n"
2298                "Header size: %"PRIu64"\n"
2299                "Arena size: %"PRIu64"\n"
2300                "Data Hash Table Size: %"PRIu64"\n"
2301                "Field Hash Table Size: %"PRIu64"\n"
2302                "Rotate Suggested: %s\n"
2303                "Head Sequential Number: %"PRIu64"\n"
2304                "Tail Sequential Number: %"PRIu64"\n"
2305                "Head Realtime Timestamp: %s\n"
2306                "Tail Realtime Timestamp: %s\n"
2307                "Tail Monotonic Timestamp: %s\n"
2308                "Objects: %"PRIu64"\n"
2309                "Entry Objects: %"PRIu64"\n",
2310                f->path,
2311                sd_id128_to_string(f->header->file_id, a),
2312                sd_id128_to_string(f->header->machine_id, b),
2313                sd_id128_to_string(f->header->boot_id, c),
2314                sd_id128_to_string(f->header->seqnum_id, d),
2315                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2316                f->header->state == STATE_ONLINE ? "ONLINE" :
2317                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2318                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2319                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2320                JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2321                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2322                le64toh(f->header->header_size),
2323                le64toh(f->header->arena_size),
2324                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2325                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2326                yes_no(journal_file_rotate_suggested(f, 0)),
2327                le64toh(f->header->head_entry_seqnum),
2328                le64toh(f->header->tail_entry_seqnum),
2329                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2330                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2331                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2332                le64toh(f->header->n_objects),
2333                le64toh(f->header->n_entries));
2334
2335         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2336                 printf("Data Objects: %"PRIu64"\n"
2337                        "Data Hash Table Fill: %.1f%%\n",
2338                        le64toh(f->header->n_data),
2339                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2340
2341         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2342                 printf("Field Objects: %"PRIu64"\n"
2343                        "Field Hash Table Fill: %.1f%%\n",
2344                        le64toh(f->header->n_fields),
2345                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2346
2347         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2348                 printf("Tag Objects: %"PRIu64"\n",
2349                        le64toh(f->header->n_tags));
2350         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2351                 printf("Entry Array Objects: %"PRIu64"\n",
2352                        le64toh(f->header->n_entry_arrays));
2353
2354         if (fstat(f->fd, &st) >= 0)
2355                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2356 }
2357
2358 int journal_file_open(
2359                 const char *fname,
2360                 int flags,
2361                 mode_t mode,
2362                 bool compress,
2363                 bool seal,
2364                 JournalMetrics *metrics,
2365                 MMapCache *mmap_cache,
2366                 JournalFile *template,
2367                 JournalFile **ret) {
2368
2369         JournalFile *f;
2370         int r;
2371         bool newly_created = false;
2372
2373         assert(fname);
2374         assert(ret);
2375
2376         if ((flags & O_ACCMODE) != O_RDONLY &&
2377             (flags & O_ACCMODE) != O_RDWR)
2378                 return -EINVAL;
2379
2380         if (!endswith(fname, ".journal") &&
2381             !endswith(fname, ".journal~"))
2382                 return -EINVAL;
2383
2384         f = new0(JournalFile, 1);
2385         if (!f)
2386                 return -ENOMEM;
2387
2388         f->fd = -1;
2389         f->mode = mode;
2390
2391         f->flags = flags;
2392         f->prot = prot_from_flags(flags);
2393         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2394 #ifdef HAVE_XZ
2395         f->compress = compress;
2396 #endif
2397 #ifdef HAVE_GCRYPT
2398         f->seal = seal;
2399 #endif
2400
2401         if (mmap_cache)
2402                 f->mmap = mmap_cache_ref(mmap_cache);
2403         else {
2404                 f->mmap = mmap_cache_new();
2405                 if (!f->mmap) {
2406                         r = -ENOMEM;
2407                         goto fail;
2408                 }
2409         }
2410
2411         f->path = strdup(fname);
2412         if (!f->path) {
2413                 r = -ENOMEM;
2414                 goto fail;
2415         }
2416
2417         f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2418         if (!f->chain_cache) {
2419                 r = -ENOMEM;
2420                 goto fail;
2421         }
2422
2423         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2424         if (f->fd < 0) {
2425                 r = -errno;
2426                 goto fail;
2427         }
2428
2429         if (fstat(f->fd, &f->last_stat) < 0) {
2430                 r = -errno;
2431                 goto fail;
2432         }
2433
2434         if (f->last_stat.st_size == 0 && f->writable) {
2435 #ifdef HAVE_XATTR
2436                 uint64_t crtime;
2437
2438                 /* Let's attach the creation time to the journal file,
2439                  * so that the vacuuming code knows the age of this
2440                  * file even if the file might end up corrupted one
2441                  * day... Ideally we'd just use the creation time many
2442                  * file systems maintain for each file, but there is
2443                  * currently no usable API to query this, hence let's
2444                  * emulate this via extended attributes. If extended
2445                  * attributes are not supported we'll just skip this,
2446                  * and rely solely on mtime/atime/ctime of the file.*/
2447
2448                 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2449                 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2450 #endif
2451
2452 #ifdef HAVE_GCRYPT
2453                 /* Try to load the FSPRG state, and if we can't, then
2454                  * just don't do sealing */
2455                 if (f->seal) {
2456                         r = journal_file_fss_load(f);
2457                         if (r < 0)
2458                                 f->seal = false;
2459                 }
2460 #endif
2461
2462                 r = journal_file_init_header(f, template);
2463                 if (r < 0)
2464                         goto fail;
2465
2466                 if (fstat(f->fd, &f->last_stat) < 0) {
2467                         r = -errno;
2468                         goto fail;
2469                 }
2470
2471                 newly_created = true;
2472         }
2473
2474         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2475                 r = -EIO;
2476                 goto fail;
2477         }
2478
2479         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2480         if (f->header == MAP_FAILED) {
2481                 f->header = NULL;
2482                 r = -errno;
2483                 goto fail;
2484         }
2485
2486         if (!newly_created) {
2487                 r = journal_file_verify_header(f);
2488                 if (r < 0)
2489                         goto fail;
2490         }
2491
2492 #ifdef HAVE_GCRYPT
2493         if (!newly_created && f->writable) {
2494                 r = journal_file_fss_load(f);
2495                 if (r < 0)
2496                         goto fail;
2497         }
2498 #endif
2499
2500         if (f->writable) {
2501                 if (metrics) {
2502                         journal_default_metrics(metrics, f->fd);
2503                         f->metrics = *metrics;
2504                 } else if (template)
2505                         f->metrics = template->metrics;
2506
2507                 r = journal_file_refresh_header(f);
2508                 if (r < 0)
2509                         goto fail;
2510         }
2511
2512 #ifdef HAVE_GCRYPT
2513         r = journal_file_hmac_setup(f);
2514         if (r < 0)
2515                 goto fail;
2516 #endif
2517
2518         if (newly_created) {
2519                 r = journal_file_setup_field_hash_table(f);
2520                 if (r < 0)
2521                         goto fail;
2522
2523                 r = journal_file_setup_data_hash_table(f);
2524                 if (r < 0)
2525                         goto fail;
2526
2527 #ifdef HAVE_GCRYPT
2528                 r = journal_file_append_first_tag(f);
2529                 if (r < 0)
2530                         goto fail;
2531 #endif
2532         }
2533
2534         r = journal_file_map_field_hash_table(f);
2535         if (r < 0)
2536                 goto fail;
2537
2538         r = journal_file_map_data_hash_table(f);
2539         if (r < 0)
2540                 goto fail;
2541
2542         *ret = f;
2543         return 0;
2544
2545 fail:
2546         journal_file_close(f);
2547
2548         return r;
2549 }
2550
2551 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2552         char *p;
2553         size_t l;
2554         JournalFile *old_file, *new_file = NULL;
2555         int r;
2556
2557         assert(f);
2558         assert(*f);
2559
2560         old_file = *f;
2561
2562         if (!old_file->writable)
2563                 return -EINVAL;
2564
2565         if (!endswith(old_file->path, ".journal"))
2566                 return -EINVAL;
2567
2568         l = strlen(old_file->path);
2569
2570         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
2571         if (!p)
2572                 return -ENOMEM;
2573
2574         memcpy(p, old_file->path, l - 8);
2575         p[l-8] = '@';
2576         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2577         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2578                  "-%016"PRIx64"-%016"PRIx64".journal",
2579                  le64toh((*f)->header->head_entry_seqnum),
2580                  le64toh((*f)->header->head_entry_realtime));
2581
2582         r = rename(old_file->path, p);
2583         free(p);
2584
2585         if (r < 0)
2586                 return -errno;
2587
2588         old_file->header->state = STATE_ARCHIVED;
2589
2590         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2591         journal_file_close(old_file);
2592
2593         *f = new_file;
2594         return r;
2595 }
2596
2597 int journal_file_open_reliably(
2598                 const char *fname,
2599                 int flags,
2600                 mode_t mode,
2601                 bool compress,
2602                 bool seal,
2603                 JournalMetrics *metrics,
2604                 MMapCache *mmap_cache,
2605                 JournalFile *template,
2606                 JournalFile **ret) {
2607
2608         int r;
2609         size_t l;
2610         _cleanup_free_ char *p = NULL;
2611
2612         r = journal_file_open(fname, flags, mode, compress, seal,
2613                               metrics, mmap_cache, template, ret);
2614         if (r != -EBADMSG && /* corrupted */
2615             r != -ENODATA && /* truncated */
2616             r != -EHOSTDOWN && /* other machine */
2617             r != -EPROTONOSUPPORT && /* incompatible feature */
2618             r != -EBUSY && /* unclean shutdown */
2619             r != -ESHUTDOWN /* already archived */)
2620                 return r;
2621
2622         if ((flags & O_ACCMODE) == O_RDONLY)
2623                 return r;
2624
2625         if (!(flags & O_CREAT))
2626                 return r;
2627
2628         if (!endswith(fname, ".journal"))
2629                 return r;
2630
2631         /* The file is corrupted. Rotate it away and try it again (but only once) */
2632
2633         l = strlen(fname);
2634         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2635                      (int) (l-8), fname,
2636                      (unsigned long long) now(CLOCK_REALTIME),
2637                      random_ull()) < 0)
2638                 return -ENOMEM;
2639
2640         r = rename(fname, p);
2641         if (r < 0)
2642                 return -errno;
2643
2644         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2645
2646         return journal_file_open(fname, flags, mode, compress, seal,
2647                                  metrics, mmap_cache, template, ret);
2648 }
2649
2650 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2651         uint64_t i, n;
2652         uint64_t q, xor_hash = 0;
2653         int r;
2654         EntryItem *items;
2655         dual_timestamp ts;
2656
2657         assert(from);
2658         assert(to);
2659         assert(o);
2660         assert(p);
2661
2662         if (!to->writable)
2663                 return -EPERM;
2664
2665         ts.monotonic = le64toh(o->entry.monotonic);
2666         ts.realtime = le64toh(o->entry.realtime);
2667
2668         if (to->tail_entry_monotonic_valid &&
2669             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2670                 return -EINVAL;
2671
2672         n = journal_file_entry_n_items(o);
2673         items = alloca(sizeof(EntryItem) * n);
2674
2675         for (i = 0; i < n; i++) {
2676                 uint64_t l, h;
2677                 le64_t le_hash;
2678                 size_t t;
2679                 void *data;
2680                 Object *u;
2681
2682                 q = le64toh(o->entry.items[i].object_offset);
2683                 le_hash = o->entry.items[i].hash;
2684
2685                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2686                 if (r < 0)
2687                         return r;
2688
2689                 if (le_hash != o->data.hash)
2690                         return -EBADMSG;
2691
2692                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2693                 t = (size_t) l;
2694
2695                 /* We hit the limit on 32bit machines */
2696                 if ((uint64_t) t != l)
2697                         return -E2BIG;
2698
2699                 if (o->object.flags & OBJECT_COMPRESSED) {
2700 #ifdef HAVE_XZ
2701                         uint64_t rsize;
2702
2703                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0))
2704                                 return -EBADMSG;
2705
2706                         data = from->compress_buffer;
2707                         l = rsize;
2708 #else
2709                         return -EPROTONOSUPPORT;
2710 #endif
2711                 } else
2712                         data = o->data.payload;
2713
2714                 r = journal_file_append_data(to, data, l, &u, &h);
2715                 if (r < 0)
2716                         return r;
2717
2718                 xor_hash ^= le64toh(u->data.hash);
2719                 items[i].object_offset = htole64(h);
2720                 items[i].hash = u->data.hash;
2721
2722                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2723                 if (r < 0)
2724                         return r;
2725         }
2726
2727         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2728 }
2729
2730 void journal_default_metrics(JournalMetrics *m, int fd) {
2731         uint64_t fs_size = 0;
2732         struct statvfs ss;
2733         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2734
2735         assert(m);
2736         assert(fd >= 0);
2737
2738         if (fstatvfs(fd, &ss) >= 0)
2739                 fs_size = ss.f_frsize * ss.f_blocks;
2740
2741         if (m->max_use == (uint64_t) -1) {
2742
2743                 if (fs_size > 0) {
2744                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2745
2746                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2747                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2748
2749                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2750                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2751                 } else
2752                         m->max_use = DEFAULT_MAX_USE_LOWER;
2753         } else {
2754                 m->max_use = PAGE_ALIGN(m->max_use);
2755
2756                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2757                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2758         }
2759
2760         if (m->max_size == (uint64_t) -1) {
2761                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2762
2763                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2764                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2765         } else
2766                 m->max_size = PAGE_ALIGN(m->max_size);
2767
2768         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2769                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2770
2771         if (m->max_size*2 > m->max_use)
2772                 m->max_use = m->max_size*2;
2773
2774         if (m->min_size == (uint64_t) -1)
2775                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2776         else {
2777                 m->min_size = PAGE_ALIGN(m->min_size);
2778
2779                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2780                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2781
2782                 if (m->min_size > m->max_size)
2783                         m->max_size = m->min_size;
2784         }
2785
2786         if (m->keep_free == (uint64_t) -1) {
2787
2788                 if (fs_size > 0) {
2789                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2790
2791                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2792                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2793
2794                 } else
2795                         m->keep_free = DEFAULT_KEEP_FREE;
2796         }
2797
2798         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2799                   format_bytes(a, sizeof(a), m->max_use),
2800                   format_bytes(b, sizeof(b), m->max_size),
2801                   format_bytes(c, sizeof(c), m->min_size),
2802                   format_bytes(d, sizeof(d), m->keep_free));
2803 }
2804
2805 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2806         assert(f);
2807         assert(from || to);
2808
2809         if (from) {
2810                 if (f->header->head_entry_realtime == 0)
2811                         return -ENOENT;
2812
2813                 *from = le64toh(f->header->head_entry_realtime);
2814         }
2815
2816         if (to) {
2817                 if (f->header->tail_entry_realtime == 0)
2818                         return -ENOENT;
2819
2820                 *to = le64toh(f->header->tail_entry_realtime);
2821         }
2822
2823         return 1;
2824 }
2825
2826 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2827         Object *o;
2828         uint64_t p;
2829         int r;
2830
2831         assert(f);
2832         assert(from || to);
2833
2834         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2835         if (r <= 0)
2836                 return r;
2837
2838         if (le64toh(o->data.n_entries) <= 0)
2839                 return 0;
2840
2841         if (from) {
2842                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2843                 if (r < 0)
2844                         return r;
2845
2846                 *from = le64toh(o->entry.monotonic);
2847         }
2848
2849         if (to) {
2850                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2851                 if (r < 0)
2852                         return r;
2853
2854                 r = generic_array_get_plus_one(f,
2855                                                le64toh(o->data.entry_offset),
2856                                                le64toh(o->data.entry_array_offset),
2857                                                le64toh(o->data.n_entries)-1,
2858                                                &o, NULL);
2859                 if (r <= 0)
2860                         return r;
2861
2862                 *to = le64toh(o->entry.monotonic);
2863         }
2864
2865         return 1;
2866 }
2867
2868 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2869         assert(f);
2870
2871         /* If we gained new header fields we gained new features,
2872          * hence suggest a rotation */
2873         if (le64toh(f->header->header_size) < sizeof(Header)) {
2874                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2875                 return true;
2876         }
2877
2878         /* Let's check if the hash tables grew over a certain fill
2879          * level (75%, borrowing this value from Java's hash table
2880          * implementation), and if so suggest a rotation. To calculate
2881          * the fill level we need the n_data field, which only exists
2882          * in newer versions. */
2883
2884         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2885                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2886                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2887                                   f->path,
2888                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2889                                   le64toh(f->header->n_data),
2890                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2891                                   (unsigned long long) f->last_stat.st_size,
2892                                   f->last_stat.st_size / le64toh(f->header->n_data));
2893                         return true;
2894                 }
2895
2896         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2897                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2898                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2899                                   f->path,
2900                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2901                                   le64toh(f->header->n_fields),
2902                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2903                         return true;
2904                 }
2905
2906         /* Are the data objects properly indexed by field objects? */
2907         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2908             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2909             le64toh(f->header->n_data) > 0 &&
2910             le64toh(f->header->n_fields) == 0)
2911                 return true;
2912
2913         if (max_file_usec > 0) {
2914                 usec_t t, h;
2915
2916                 h = le64toh(f->header->head_entry_realtime);
2917                 t = now(CLOCK_REALTIME);
2918
2919                 if (h > 0 && t > h + max_file_usec)
2920                         return true;
2921         }
2922
2923         return false;
2924 }