chiark / gitweb /
27cd16fc007f72fded4907bbd00fdcc0c36d340e
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #ifdef HAVE_XATTR
31 #include <attr/xattr.h>
32 #endif
33
34 #include "journal-def.h"
35 #include "journal-file.h"
36 #include "journal-authenticate.h"
37 #include "lookup3.h"
38 #include "compress.h"
39 #include "fsprg.h"
40
41 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
42 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
43
44 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
45
46 /* This is the minimum journal file size */
47 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL)           /* 4 MiB */
48
49 /* These are the lower and upper bounds if we deduce the max_use value
50  * from the file system size */
51 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
52 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
53
54 /* This is the upper bound if we deduce max_size from max_use */
55 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
56
57 /* This is the upper bound if we deduce the keep_free value from the
58  * file system size */
59 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
60
61 /* This is the keep_free value when we can't determine the system
62  * size */
63 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MB */
64
65 /* n_data was the first entry we added after the initial file format design */
66 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
67
68 /* How many entries to keep in the entry array chain cache at max */
69 #define CHAIN_CACHE_MAX 20
70
71 /* How much to increase the journal file size at once each time we allocate something new. */
72 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8MB */
73
74 static int journal_file_set_online(JournalFile *f) {
75         assert(f);
76
77         if (!f->writable)
78                 return -EPERM;
79
80         if (!(f->fd >= 0 && f->header))
81                 return -EINVAL;
82
83         switch(f->header->state) {
84                 case STATE_ONLINE:
85                         return 0;
86
87                 case STATE_OFFLINE:
88                         f->header->state = STATE_ONLINE;
89                         fsync(f->fd);
90                         return 0;
91
92                 default:
93                         return -EINVAL;
94         }
95 }
96
97 int journal_file_set_offline(JournalFile *f) {
98         assert(f);
99
100         if (!f->writable)
101                 return -EPERM;
102
103         if (!(f->fd >= 0 && f->header))
104                 return -EINVAL;
105
106         if (f->header->state != STATE_ONLINE)
107                 return 0;
108
109         fsync(f->fd);
110
111         f->header->state = STATE_OFFLINE;
112
113         fsync(f->fd);
114
115         return 0;
116 }
117
118 void journal_file_close(JournalFile *f) {
119         assert(f);
120
121 #ifdef HAVE_GCRYPT
122         /* Write the final tag */
123         if (f->seal && f->writable)
124                 journal_file_append_tag(f);
125 #endif
126
127         /* Sync everything to disk, before we mark the file offline */
128         if (f->mmap && f->fd >= 0)
129                 mmap_cache_close_fd(f->mmap, f->fd);
130
131         journal_file_set_offline(f);
132
133         if (f->header)
134                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
135
136         if (f->fd >= 0)
137                 close_nointr_nofail(f->fd);
138
139         free(f->path);
140
141         if (f->mmap)
142                 mmap_cache_unref(f->mmap);
143
144         hashmap_free_free(f->chain_cache);
145
146 #ifdef HAVE_XZ
147         free(f->compress_buffer);
148 #endif
149
150 #ifdef HAVE_GCRYPT
151         if (f->fss_file)
152                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
153         else if (f->fsprg_state)
154                 free(f->fsprg_state);
155
156         free(f->fsprg_seed);
157
158         if (f->hmac)
159                 gcry_md_close(f->hmac);
160 #endif
161
162         free(f);
163 }
164
165 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
166         Header h;
167         ssize_t k;
168         int r;
169
170         assert(f);
171
172         zero(h);
173         memcpy(h.signature, HEADER_SIGNATURE, 8);
174         h.header_size = htole64(ALIGN64(sizeof(h)));
175
176         h.incompatible_flags =
177                 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
178
179         h.compatible_flags =
180                 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
181
182         r = sd_id128_randomize(&h.file_id);
183         if (r < 0)
184                 return r;
185
186         if (template) {
187                 h.seqnum_id = template->header->seqnum_id;
188                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
189         } else
190                 h.seqnum_id = h.file_id;
191
192         k = pwrite(f->fd, &h, sizeof(h), 0);
193         if (k < 0)
194                 return -errno;
195
196         if (k != sizeof(h))
197                 return -EIO;
198
199         return 0;
200 }
201
202 static int journal_file_refresh_header(JournalFile *f) {
203         int r;
204         sd_id128_t boot_id;
205
206         assert(f);
207
208         r = sd_id128_get_machine(&f->header->machine_id);
209         if (r < 0)
210                 return r;
211
212         r = sd_id128_get_boot(&boot_id);
213         if (r < 0)
214                 return r;
215
216         if (sd_id128_equal(boot_id, f->header->boot_id))
217                 f->tail_entry_monotonic_valid = true;
218
219         f->header->boot_id = boot_id;
220
221         journal_file_set_online(f);
222
223         /* Sync the online state to disk */
224         fsync(f->fd);
225
226         return 0;
227 }
228
229 static int journal_file_verify_header(JournalFile *f) {
230         assert(f);
231
232         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
233                 return -EBADMSG;
234
235         /* In both read and write mode we refuse to open files with
236          * incompatible flags we don't know */
237 #ifdef HAVE_XZ
238         if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
239                 return -EPROTONOSUPPORT;
240 #else
241         if (f->header->incompatible_flags != 0)
242                 return -EPROTONOSUPPORT;
243 #endif
244
245         /* When open for writing we refuse to open files with
246          * compatible flags, too */
247         if (f->writable) {
248 #ifdef HAVE_GCRYPT
249                 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
250                         return -EPROTONOSUPPORT;
251 #else
252                 if (f->header->compatible_flags != 0)
253                         return -EPROTONOSUPPORT;
254 #endif
255         }
256
257         if (f->header->state >= _STATE_MAX)
258                 return -EBADMSG;
259
260         /* The first addition was n_data, so check that we are at least this large */
261         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
262                 return -EBADMSG;
263
264         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
265                 return -EBADMSG;
266
267         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
268                 return -ENODATA;
269
270         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
271                 return -ENODATA;
272
273         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
274             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
275             !VALID64(le64toh(f->header->tail_object_offset)) ||
276             !VALID64(le64toh(f->header->entry_array_offset)))
277                 return -ENODATA;
278
279         if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
280             le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
281             le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
282             le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
283                 return -ENODATA;
284
285         if (f->writable) {
286                 uint8_t state;
287                 sd_id128_t machine_id;
288                 int r;
289
290                 r = sd_id128_get_machine(&machine_id);
291                 if (r < 0)
292                         return r;
293
294                 if (!sd_id128_equal(machine_id, f->header->machine_id))
295                         return -EHOSTDOWN;
296
297                 state = f->header->state;
298
299                 if (state == STATE_ONLINE) {
300                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
301                         return -EBUSY;
302                 } else if (state == STATE_ARCHIVED)
303                         return -ESHUTDOWN;
304                 else if (state != STATE_OFFLINE) {
305                         log_debug("Journal file %s has unknown state %u.", f->path, state);
306                         return -EBUSY;
307                 }
308         }
309
310         f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
311
312         f->seal = JOURNAL_HEADER_SEALED(f->header);
313
314         return 0;
315 }
316
317 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
318         uint64_t old_size, new_size, file_size;
319         int r;
320
321         assert(f);
322
323         /* We assume that this file is not sparse, and we know that
324          * for sure, since we always call posix_fallocate()
325          * ourselves */
326
327         old_size =
328                 le64toh(f->header->header_size) +
329                 le64toh(f->header->arena_size);
330
331         new_size = PAGE_ALIGN(offset + size);
332         if (new_size < le64toh(f->header->header_size))
333                 new_size = le64toh(f->header->header_size);
334
335         if (new_size <= old_size)
336                 return 0;
337
338         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
339                 return -E2BIG;
340
341         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
342                 struct statvfs svfs;
343
344                 if (fstatvfs(f->fd, &svfs) >= 0) {
345                         uint64_t available;
346
347                         available = svfs.f_bfree * svfs.f_bsize;
348
349                         if (available >= f->metrics.keep_free)
350                                 available -= f->metrics.keep_free;
351                         else
352                                 available = 0;
353
354                         if (new_size - old_size > available)
355                                 return -E2BIG;
356                 }
357         }
358
359         /* Note that the glibc fallocate() fallback is very
360            inefficient, hence we try to minimize the allocation area
361            as we can. */
362         r = posix_fallocate(f->fd, old_size, new_size - old_size);
363         if (r != 0)
364                 return -r;
365
366         /* Increase the file size a bit further than this, so that we
367          * we can create larger memory maps to cache */
368         file_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
369         if (file_size > (uint64_t) f->last_stat.st_size) {
370                 if (file_size > new_size)
371                         ftruncate(f->fd, file_size);
372
373                 if (fstat(f->fd, &f->last_stat) < 0)
374                         return -errno;
375         }
376
377         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
378
379         return 0;
380 }
381
382 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
383         assert(f);
384         assert(ret);
385
386         if (size <= 0)
387                 return -EINVAL;
388
389         /* Avoid SIGBUS on invalid accesses */
390         if (offset + size > (uint64_t) f->last_stat.st_size) {
391                 /* Hmm, out of range? Let's refresh the fstat() data
392                  * first, before we trust that check. */
393
394                 if (fstat(f->fd, &f->last_stat) < 0 ||
395                     offset + size > (uint64_t) f->last_stat.st_size)
396                         return -EADDRNOTAVAIL;
397         }
398
399         return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
400 }
401
402 static uint64_t minimum_header_size(Object *o) {
403
404         static const uint64_t table[] = {
405                 [OBJECT_DATA] = sizeof(DataObject),
406                 [OBJECT_FIELD] = sizeof(FieldObject),
407                 [OBJECT_ENTRY] = sizeof(EntryObject),
408                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
409                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
410                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
411                 [OBJECT_TAG] = sizeof(TagObject),
412         };
413
414         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
415                 return sizeof(ObjectHeader);
416
417         return table[o->object.type];
418 }
419
420 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
421         int r;
422         void *t;
423         Object *o;
424         uint64_t s;
425         unsigned context;
426
427         assert(f);
428         assert(ret);
429
430         /* Objects may only be located at multiple of 64 bit */
431         if (!VALID64(offset))
432                 return -EFAULT;
433
434         /* One context for each type, plus one catch-all for the rest */
435         context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
436
437         r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
438         if (r < 0)
439                 return r;
440
441         o = (Object*) t;
442         s = le64toh(o->object.size);
443
444         if (s < sizeof(ObjectHeader))
445                 return -EBADMSG;
446
447         if (o->object.type <= OBJECT_UNUSED)
448                 return -EBADMSG;
449
450         if (s < minimum_header_size(o))
451                 return -EBADMSG;
452
453         if (type > 0 && o->object.type != type)
454                 return -EBADMSG;
455
456         if (s > sizeof(ObjectHeader)) {
457                 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
458                 if (r < 0)
459                         return r;
460
461                 o = (Object*) t;
462         }
463
464         *ret = o;
465         return 0;
466 }
467
468 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
469         uint64_t r;
470
471         assert(f);
472
473         r = le64toh(f->header->tail_entry_seqnum) + 1;
474
475         if (seqnum) {
476                 /* If an external seqnum counter was passed, we update
477                  * both the local and the external one, and set it to
478                  * the maximum of both */
479
480                 if (*seqnum + 1 > r)
481                         r = *seqnum + 1;
482
483                 *seqnum = r;
484         }
485
486         f->header->tail_entry_seqnum = htole64(r);
487
488         if (f->header->head_entry_seqnum == 0)
489                 f->header->head_entry_seqnum = htole64(r);
490
491         return r;
492 }
493
494 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
495         int r;
496         uint64_t p;
497         Object *tail, *o;
498         void *t;
499
500         assert(f);
501         assert(type > 0 && type < _OBJECT_TYPE_MAX);
502         assert(size >= sizeof(ObjectHeader));
503         assert(offset);
504         assert(ret);
505
506         r = journal_file_set_online(f);
507         if (r < 0)
508                 return r;
509
510         p = le64toh(f->header->tail_object_offset);
511         if (p == 0)
512                 p = le64toh(f->header->header_size);
513         else {
514                 r = journal_file_move_to_object(f, -1, p, &tail);
515                 if (r < 0)
516                         return r;
517
518                 p += ALIGN64(le64toh(tail->object.size));
519         }
520
521         r = journal_file_allocate(f, p, size);
522         if (r < 0)
523                 return r;
524
525         r = journal_file_move_to(f, type, false, p, size, &t);
526         if (r < 0)
527                 return r;
528
529         o = (Object*) t;
530
531         zero(o->object);
532         o->object.type = type;
533         o->object.size = htole64(size);
534
535         f->header->tail_object_offset = htole64(p);
536         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
537
538         *ret = o;
539         *offset = p;
540
541         return 0;
542 }
543
544 static int journal_file_setup_data_hash_table(JournalFile *f) {
545         uint64_t s, p;
546         Object *o;
547         int r;
548
549         assert(f);
550
551         /* We estimate that we need 1 hash table entry per 768 of
552            journal file and we want to make sure we never get beyond
553            75% fill level. Calculate the hash table size for the
554            maximum file size based on these metrics. */
555
556         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
557         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
558                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
559
560         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
561
562         r = journal_file_append_object(f,
563                                        OBJECT_DATA_HASH_TABLE,
564                                        offsetof(Object, hash_table.items) + s,
565                                        &o, &p);
566         if (r < 0)
567                 return r;
568
569         memset(o->hash_table.items, 0, s);
570
571         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
572         f->header->data_hash_table_size = htole64(s);
573
574         return 0;
575 }
576
577 static int journal_file_setup_field_hash_table(JournalFile *f) {
578         uint64_t s, p;
579         Object *o;
580         int r;
581
582         assert(f);
583
584         /* We use a fixed size hash table for the fields as this
585          * number should grow very slowly only */
586
587         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
588         r = journal_file_append_object(f,
589                                        OBJECT_FIELD_HASH_TABLE,
590                                        offsetof(Object, hash_table.items) + s,
591                                        &o, &p);
592         if (r < 0)
593                 return r;
594
595         memset(o->hash_table.items, 0, s);
596
597         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
598         f->header->field_hash_table_size = htole64(s);
599
600         return 0;
601 }
602
603 static int journal_file_map_data_hash_table(JournalFile *f) {
604         uint64_t s, p;
605         void *t;
606         int r;
607
608         assert(f);
609
610         p = le64toh(f->header->data_hash_table_offset);
611         s = le64toh(f->header->data_hash_table_size);
612
613         r = journal_file_move_to(f,
614                                  OBJECT_DATA_HASH_TABLE,
615                                  true,
616                                  p, s,
617                                  &t);
618         if (r < 0)
619                 return r;
620
621         f->data_hash_table = t;
622         return 0;
623 }
624
625 static int journal_file_map_field_hash_table(JournalFile *f) {
626         uint64_t s, p;
627         void *t;
628         int r;
629
630         assert(f);
631
632         p = le64toh(f->header->field_hash_table_offset);
633         s = le64toh(f->header->field_hash_table_size);
634
635         r = journal_file_move_to(f,
636                                  OBJECT_FIELD_HASH_TABLE,
637                                  true,
638                                  p, s,
639                                  &t);
640         if (r < 0)
641                 return r;
642
643         f->field_hash_table = t;
644         return 0;
645 }
646
647 static int journal_file_link_field(
648                 JournalFile *f,
649                 Object *o,
650                 uint64_t offset,
651                 uint64_t hash) {
652
653         uint64_t p, h;
654         int r;
655
656         assert(f);
657         assert(o);
658         assert(offset > 0);
659
660         if (o->object.type != OBJECT_FIELD)
661                 return -EINVAL;
662
663         /* This might alter the window we are looking at */
664
665         o->field.next_hash_offset = o->field.head_data_offset = 0;
666
667         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
668         p = le64toh(f->field_hash_table[h].tail_hash_offset);
669         if (p == 0)
670                 f->field_hash_table[h].head_hash_offset = htole64(offset);
671         else {
672                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
673                 if (r < 0)
674                         return r;
675
676                 o->field.next_hash_offset = htole64(offset);
677         }
678
679         f->field_hash_table[h].tail_hash_offset = htole64(offset);
680
681         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
682                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
683
684         return 0;
685 }
686
687 static int journal_file_link_data(
688                 JournalFile *f,
689                 Object *o,
690                 uint64_t offset,
691                 uint64_t hash) {
692
693         uint64_t p, h;
694         int r;
695
696         assert(f);
697         assert(o);
698         assert(offset > 0);
699
700         if (o->object.type != OBJECT_DATA)
701                 return -EINVAL;
702
703         /* This might alter the window we are looking at */
704
705         o->data.next_hash_offset = o->data.next_field_offset = 0;
706         o->data.entry_offset = o->data.entry_array_offset = 0;
707         o->data.n_entries = 0;
708
709         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
710         p = le64toh(f->data_hash_table[h].tail_hash_offset);
711         if (p == 0)
712                 /* Only entry in the hash table is easy */
713                 f->data_hash_table[h].head_hash_offset = htole64(offset);
714         else {
715                 /* Move back to the previous data object, to patch in
716                  * pointer */
717
718                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
719                 if (r < 0)
720                         return r;
721
722                 o->data.next_hash_offset = htole64(offset);
723         }
724
725         f->data_hash_table[h].tail_hash_offset = htole64(offset);
726
727         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
728                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
729
730         return 0;
731 }
732
733 int journal_file_find_field_object_with_hash(
734                 JournalFile *f,
735                 const void *field, uint64_t size, uint64_t hash,
736                 Object **ret, uint64_t *offset) {
737
738         uint64_t p, osize, h;
739         int r;
740
741         assert(f);
742         assert(field && size > 0);
743
744         osize = offsetof(Object, field.payload) + size;
745
746         if (f->header->field_hash_table_size == 0)
747                 return -EBADMSG;
748
749         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
750         p = le64toh(f->field_hash_table[h].head_hash_offset);
751
752         while (p > 0) {
753                 Object *o;
754
755                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
756                 if (r < 0)
757                         return r;
758
759                 if (le64toh(o->field.hash) == hash &&
760                     le64toh(o->object.size) == osize &&
761                     memcmp(o->field.payload, field, size) == 0) {
762
763                         if (ret)
764                                 *ret = o;
765                         if (offset)
766                                 *offset = p;
767
768                         return 1;
769                 }
770
771                 p = le64toh(o->field.next_hash_offset);
772         }
773
774         return 0;
775 }
776
777 int journal_file_find_field_object(
778                 JournalFile *f,
779                 const void *field, uint64_t size,
780                 Object **ret, uint64_t *offset) {
781
782         uint64_t hash;
783
784         assert(f);
785         assert(field && size > 0);
786
787         hash = hash64(field, size);
788
789         return journal_file_find_field_object_with_hash(f,
790                                                         field, size, hash,
791                                                         ret, offset);
792 }
793
794 int journal_file_find_data_object_with_hash(
795                 JournalFile *f,
796                 const void *data, uint64_t size, uint64_t hash,
797                 Object **ret, uint64_t *offset) {
798
799         uint64_t p, osize, h;
800         int r;
801
802         assert(f);
803         assert(data || size == 0);
804
805         osize = offsetof(Object, data.payload) + size;
806
807         if (f->header->data_hash_table_size == 0)
808                 return -EBADMSG;
809
810         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
811         p = le64toh(f->data_hash_table[h].head_hash_offset);
812
813         while (p > 0) {
814                 Object *o;
815
816                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
817                 if (r < 0)
818                         return r;
819
820                 if (le64toh(o->data.hash) != hash)
821                         goto next;
822
823                 if (o->object.flags & OBJECT_COMPRESSED) {
824 #ifdef HAVE_XZ
825                         uint64_t l, rsize;
826
827                         l = le64toh(o->object.size);
828                         if (l <= offsetof(Object, data.payload))
829                                 return -EBADMSG;
830
831                         l -= offsetof(Object, data.payload);
832
833                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0))
834                                 return -EBADMSG;
835
836                         if (rsize == size &&
837                             memcmp(f->compress_buffer, data, size) == 0) {
838
839                                 if (ret)
840                                         *ret = o;
841
842                                 if (offset)
843                                         *offset = p;
844
845                                 return 1;
846                         }
847 #else
848                         return -EPROTONOSUPPORT;
849 #endif
850
851                 } else if (le64toh(o->object.size) == osize &&
852                            memcmp(o->data.payload, data, size) == 0) {
853
854                         if (ret)
855                                 *ret = o;
856
857                         if (offset)
858                                 *offset = p;
859
860                         return 1;
861                 }
862
863         next:
864                 p = le64toh(o->data.next_hash_offset);
865         }
866
867         return 0;
868 }
869
870 int journal_file_find_data_object(
871                 JournalFile *f,
872                 const void *data, uint64_t size,
873                 Object **ret, uint64_t *offset) {
874
875         uint64_t hash;
876
877         assert(f);
878         assert(data || size == 0);
879
880         hash = hash64(data, size);
881
882         return journal_file_find_data_object_with_hash(f,
883                                                        data, size, hash,
884                                                        ret, offset);
885 }
886
887 static int journal_file_append_field(
888                 JournalFile *f,
889                 const void *field, uint64_t size,
890                 Object **ret, uint64_t *offset) {
891
892         uint64_t hash, p;
893         uint64_t osize;
894         Object *o;
895         int r;
896
897         assert(f);
898         assert(field && size > 0);
899
900         hash = hash64(field, size);
901
902         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
903         if (r < 0)
904                 return r;
905         else if (r > 0) {
906
907                 if (ret)
908                         *ret = o;
909
910                 if (offset)
911                         *offset = p;
912
913                 return 0;
914         }
915
916         osize = offsetof(Object, field.payload) + size;
917         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
918         if (r < 0)
919                 return r;
920
921         o->field.hash = htole64(hash);
922         memcpy(o->field.payload, field, size);
923
924         r = journal_file_link_field(f, o, p, hash);
925         if (r < 0)
926                 return r;
927
928         /* The linking might have altered the window, so let's
929          * refresh our pointer */
930         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
931         if (r < 0)
932                 return r;
933
934 #ifdef HAVE_GCRYPT
935         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
936         if (r < 0)
937                 return r;
938 #endif
939
940         if (ret)
941                 *ret = o;
942
943         if (offset)
944                 *offset = p;
945
946         return 0;
947 }
948
949 static int journal_file_append_data(
950                 JournalFile *f,
951                 const void *data, uint64_t size,
952                 Object **ret, uint64_t *offset) {
953
954         uint64_t hash, p;
955         uint64_t osize;
956         Object *o;
957         int r;
958         bool compressed = false;
959         const void *eq;
960
961         assert(f);
962         assert(data || size == 0);
963
964         hash = hash64(data, size);
965
966         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
967         if (r < 0)
968                 return r;
969         else if (r > 0) {
970
971                 if (ret)
972                         *ret = o;
973
974                 if (offset)
975                         *offset = p;
976
977                 return 0;
978         }
979
980         osize = offsetof(Object, data.payload) + size;
981         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
982         if (r < 0)
983                 return r;
984
985         o->data.hash = htole64(hash);
986
987 #ifdef HAVE_XZ
988         if (f->compress &&
989             size >= COMPRESSION_SIZE_THRESHOLD) {
990                 uint64_t rsize;
991
992                 compressed = compress_blob(data, size, o->data.payload, &rsize);
993
994                 if (compressed) {
995                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
996                         o->object.flags |= OBJECT_COMPRESSED;
997
998                         log_debug("Compressed data object %"PRIu64" -> %"PRIu64, size, rsize);
999                 }
1000         }
1001 #endif
1002
1003         if (!compressed && size > 0)
1004                 memcpy(o->data.payload, data, size);
1005
1006         r = journal_file_link_data(f, o, p, hash);
1007         if (r < 0)
1008                 return r;
1009
1010         /* The linking might have altered the window, so let's
1011          * refresh our pointer */
1012         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1013         if (r < 0)
1014                 return r;
1015
1016         eq = memchr(data, '=', size);
1017         if (eq && eq > data) {
1018                 uint64_t fp;
1019                 Object *fo;
1020
1021                 /* Create field object ... */
1022                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1023                 if (r < 0)
1024                         return r;
1025
1026                 /* ... and link it in. */
1027                 o->data.next_field_offset = fo->field.head_data_offset;
1028                 fo->field.head_data_offset = le64toh(p);
1029         }
1030
1031 #ifdef HAVE_GCRYPT
1032         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1033         if (r < 0)
1034                 return r;
1035 #endif
1036
1037         if (ret)
1038                 *ret = o;
1039
1040         if (offset)
1041                 *offset = p;
1042
1043         return 0;
1044 }
1045
1046 uint64_t journal_file_entry_n_items(Object *o) {
1047         assert(o);
1048
1049         if (o->object.type != OBJECT_ENTRY)
1050                 return 0;
1051
1052         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1053 }
1054
1055 uint64_t journal_file_entry_array_n_items(Object *o) {
1056         assert(o);
1057
1058         if (o->object.type != OBJECT_ENTRY_ARRAY)
1059                 return 0;
1060
1061         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1062 }
1063
1064 uint64_t journal_file_hash_table_n_items(Object *o) {
1065         assert(o);
1066
1067         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1068             o->object.type != OBJECT_FIELD_HASH_TABLE)
1069                 return 0;
1070
1071         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1072 }
1073
1074 static int link_entry_into_array(JournalFile *f,
1075                                  le64_t *first,
1076                                  le64_t *idx,
1077                                  uint64_t p) {
1078         int r;
1079         uint64_t n = 0, ap = 0, q, i, a, hidx;
1080         Object *o;
1081
1082         assert(f);
1083         assert(first);
1084         assert(idx);
1085         assert(p > 0);
1086
1087         a = le64toh(*first);
1088         i = hidx = le64toh(*idx);
1089         while (a > 0) {
1090
1091                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1092                 if (r < 0)
1093                         return r;
1094
1095                 n = journal_file_entry_array_n_items(o);
1096                 if (i < n) {
1097                         o->entry_array.items[i] = htole64(p);
1098                         *idx = htole64(hidx + 1);
1099                         return 0;
1100                 }
1101
1102                 i -= n;
1103                 ap = a;
1104                 a = le64toh(o->entry_array.next_entry_array_offset);
1105         }
1106
1107         if (hidx > n)
1108                 n = (hidx+1) * 2;
1109         else
1110                 n = n * 2;
1111
1112         if (n < 4)
1113                 n = 4;
1114
1115         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1116                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1117                                        &o, &q);
1118         if (r < 0)
1119                 return r;
1120
1121 #ifdef HAVE_GCRYPT
1122         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1123         if (r < 0)
1124                 return r;
1125 #endif
1126
1127         o->entry_array.items[i] = htole64(p);
1128
1129         if (ap == 0)
1130                 *first = htole64(q);
1131         else {
1132                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1133                 if (r < 0)
1134                         return r;
1135
1136                 o->entry_array.next_entry_array_offset = htole64(q);
1137         }
1138
1139         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1140                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1141
1142         *idx = htole64(hidx + 1);
1143
1144         return 0;
1145 }
1146
1147 static int link_entry_into_array_plus_one(JournalFile *f,
1148                                           le64_t *extra,
1149                                           le64_t *first,
1150                                           le64_t *idx,
1151                                           uint64_t p) {
1152
1153         int r;
1154
1155         assert(f);
1156         assert(extra);
1157         assert(first);
1158         assert(idx);
1159         assert(p > 0);
1160
1161         if (*idx == 0)
1162                 *extra = htole64(p);
1163         else {
1164                 le64_t i;
1165
1166                 i = htole64(le64toh(*idx) - 1);
1167                 r = link_entry_into_array(f, first, &i, p);
1168                 if (r < 0)
1169                         return r;
1170         }
1171
1172         *idx = htole64(le64toh(*idx) + 1);
1173         return 0;
1174 }
1175
1176 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1177         uint64_t p;
1178         int r;
1179         assert(f);
1180         assert(o);
1181         assert(offset > 0);
1182
1183         p = le64toh(o->entry.items[i].object_offset);
1184         if (p == 0)
1185                 return -EINVAL;
1186
1187         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1188         if (r < 0)
1189                 return r;
1190
1191         return link_entry_into_array_plus_one(f,
1192                                               &o->data.entry_offset,
1193                                               &o->data.entry_array_offset,
1194                                               &o->data.n_entries,
1195                                               offset);
1196 }
1197
1198 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1199         uint64_t n, i;
1200         int r;
1201
1202         assert(f);
1203         assert(o);
1204         assert(offset > 0);
1205
1206         if (o->object.type != OBJECT_ENTRY)
1207                 return -EINVAL;
1208
1209         __sync_synchronize();
1210
1211         /* Link up the entry itself */
1212         r = link_entry_into_array(f,
1213                                   &f->header->entry_array_offset,
1214                                   &f->header->n_entries,
1215                                   offset);
1216         if (r < 0)
1217                 return r;
1218
1219         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1220
1221         if (f->header->head_entry_realtime == 0)
1222                 f->header->head_entry_realtime = o->entry.realtime;
1223
1224         f->header->tail_entry_realtime = o->entry.realtime;
1225         f->header->tail_entry_monotonic = o->entry.monotonic;
1226
1227         f->tail_entry_monotonic_valid = true;
1228
1229         /* Link up the items */
1230         n = journal_file_entry_n_items(o);
1231         for (i = 0; i < n; i++) {
1232                 r = journal_file_link_entry_item(f, o, offset, i);
1233                 if (r < 0)
1234                         return r;
1235         }
1236
1237         return 0;
1238 }
1239
1240 static int journal_file_append_entry_internal(
1241                 JournalFile *f,
1242                 const dual_timestamp *ts,
1243                 uint64_t xor_hash,
1244                 const EntryItem items[], unsigned n_items,
1245                 uint64_t *seqnum,
1246                 Object **ret, uint64_t *offset) {
1247         uint64_t np;
1248         uint64_t osize;
1249         Object *o;
1250         int r;
1251
1252         assert(f);
1253         assert(items || n_items == 0);
1254         assert(ts);
1255
1256         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1257
1258         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1259         if (r < 0)
1260                 return r;
1261
1262         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1263         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1264         o->entry.realtime = htole64(ts->realtime);
1265         o->entry.monotonic = htole64(ts->monotonic);
1266         o->entry.xor_hash = htole64(xor_hash);
1267         o->entry.boot_id = f->header->boot_id;
1268
1269 #ifdef HAVE_GCRYPT
1270         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1271         if (r < 0)
1272                 return r;
1273 #endif
1274
1275         r = journal_file_link_entry(f, o, np);
1276         if (r < 0)
1277                 return r;
1278
1279         if (ret)
1280                 *ret = o;
1281
1282         if (offset)
1283                 *offset = np;
1284
1285         return 0;
1286 }
1287
1288 void journal_file_post_change(JournalFile *f) {
1289         assert(f);
1290
1291         /* inotify() does not receive IN_MODIFY events from file
1292          * accesses done via mmap(). After each access we hence
1293          * trigger IN_MODIFY by truncating the journal file to its
1294          * current size which triggers IN_MODIFY. */
1295
1296         __sync_synchronize();
1297
1298         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1299                 log_error("Failed to truncate file to its own size: %m");
1300 }
1301
1302 static int entry_item_cmp(const void *_a, const void *_b) {
1303         const EntryItem *a = _a, *b = _b;
1304
1305         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1306                 return -1;
1307         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1308                 return 1;
1309         return 0;
1310 }
1311
1312 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1313         unsigned i;
1314         EntryItem *items;
1315         int r;
1316         uint64_t xor_hash = 0;
1317         struct dual_timestamp _ts;
1318
1319         assert(f);
1320         assert(iovec || n_iovec == 0);
1321
1322         if (!ts) {
1323                 dual_timestamp_get(&_ts);
1324                 ts = &_ts;
1325         }
1326
1327         if (f->tail_entry_monotonic_valid &&
1328             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1329                 return -EINVAL;
1330
1331 #ifdef HAVE_GCRYPT
1332         r = journal_file_maybe_append_tag(f, ts->realtime);
1333         if (r < 0)
1334                 return r;
1335 #endif
1336
1337         /* alloca() can't take 0, hence let's allocate at least one */
1338         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1339
1340         for (i = 0; i < n_iovec; i++) {
1341                 uint64_t p;
1342                 Object *o;
1343
1344                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1345                 if (r < 0)
1346                         return r;
1347
1348                 xor_hash ^= le64toh(o->data.hash);
1349                 items[i].object_offset = htole64(p);
1350                 items[i].hash = o->data.hash;
1351         }
1352
1353         /* Order by the position on disk, in order to improve seek
1354          * times for rotating media. */
1355         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1356
1357         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1358
1359         journal_file_post_change(f);
1360
1361         return r;
1362 }
1363
1364 typedef struct ChainCacheItem {
1365         uint64_t first; /* the array at the begin of the chain */
1366         uint64_t array; /* the cached array */
1367         uint64_t begin; /* the first item in the cached array */
1368         uint64_t total; /* the total number of items in all arrays before this one in the chain */
1369         uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
1370 } ChainCacheItem;
1371
1372 static void chain_cache_put(
1373                 Hashmap *h,
1374                 ChainCacheItem *ci,
1375                 uint64_t first,
1376                 uint64_t array,
1377                 uint64_t begin,
1378                 uint64_t total,
1379                 uint64_t last_index) {
1380
1381         if (!ci) {
1382                 /* If the chain item to cache for this chain is the
1383                  * first one it's not worth caching anything */
1384                 if (array == first)
1385                         return;
1386
1387                 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1388                         ci = hashmap_steal_first(h);
1389                 else {
1390                         ci = new(ChainCacheItem, 1);
1391                         if (!ci)
1392                                 return;
1393                 }
1394
1395                 ci->first = first;
1396
1397                 if (hashmap_put(h, &ci->first, ci) < 0) {
1398                         free(ci);
1399                         return;
1400                 }
1401         } else
1402                 assert(ci->first == first);
1403
1404         ci->array = array;
1405         ci->begin = begin;
1406         ci->total = total;
1407         ci->last_index = last_index;
1408 }
1409
1410 static int generic_array_get(
1411                 JournalFile *f,
1412                 uint64_t first,
1413                 uint64_t i,
1414                 Object **ret, uint64_t *offset) {
1415
1416         Object *o;
1417         uint64_t p = 0, a, t = 0;
1418         int r;
1419         ChainCacheItem *ci;
1420
1421         assert(f);
1422
1423         a = first;
1424
1425         /* Try the chain cache first */
1426         ci = hashmap_get(f->chain_cache, &first);
1427         if (ci && i > ci->total) {
1428                 a = ci->array;
1429                 i -= ci->total;
1430                 t = ci->total;
1431         }
1432
1433         while (a > 0) {
1434                 uint64_t k;
1435
1436                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1437                 if (r < 0)
1438                         return r;
1439
1440                 k = journal_file_entry_array_n_items(o);
1441                 if (i < k) {
1442                         p = le64toh(o->entry_array.items[i]);
1443                         goto found;
1444                 }
1445
1446                 i -= k;
1447                 t += k;
1448                 a = le64toh(o->entry_array.next_entry_array_offset);
1449         }
1450
1451         return 0;
1452
1453 found:
1454         /* Let's cache this item for the next invocation */
1455         chain_cache_put(f->chain_cache, ci, first, a, o->entry_array.items[0], t, i);
1456
1457         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1458         if (r < 0)
1459                 return r;
1460
1461         if (ret)
1462                 *ret = o;
1463
1464         if (offset)
1465                 *offset = p;
1466
1467         return 1;
1468 }
1469
1470 static int generic_array_get_plus_one(
1471                 JournalFile *f,
1472                 uint64_t extra,
1473                 uint64_t first,
1474                 uint64_t i,
1475                 Object **ret, uint64_t *offset) {
1476
1477         Object *o;
1478
1479         assert(f);
1480
1481         if (i == 0) {
1482                 int r;
1483
1484                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1485                 if (r < 0)
1486                         return r;
1487
1488                 if (ret)
1489                         *ret = o;
1490
1491                 if (offset)
1492                         *offset = extra;
1493
1494                 return 1;
1495         }
1496
1497         return generic_array_get(f, first, i-1, ret, offset);
1498 }
1499
1500 enum {
1501         TEST_FOUND,
1502         TEST_LEFT,
1503         TEST_RIGHT
1504 };
1505
1506 static int generic_array_bisect(
1507                 JournalFile *f,
1508                 uint64_t first,
1509                 uint64_t n,
1510                 uint64_t needle,
1511                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1512                 direction_t direction,
1513                 Object **ret,
1514                 uint64_t *offset,
1515                 uint64_t *idx) {
1516
1517         uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1518         bool subtract_one = false;
1519         Object *o, *array = NULL;
1520         int r;
1521         ChainCacheItem *ci;
1522
1523         assert(f);
1524         assert(test_object);
1525
1526         /* Start with the first array in the chain */
1527         a = first;
1528
1529         ci = hashmap_get(f->chain_cache, &first);
1530         if (ci && n > ci->total) {
1531                 /* Ah, we have iterated this bisection array chain
1532                  * previously! Let's see if we can skip ahead in the
1533                  * chain, as far as the last time. But we can't jump
1534                  * backwards in the chain, so let's check that
1535                  * first. */
1536
1537                 r = test_object(f, ci->begin, needle);
1538                 if (r < 0)
1539                         return r;
1540
1541                 if (r == TEST_LEFT) {
1542                         /* OK, what we are looking for is right of the
1543                          * begin of this EntryArray, so let's jump
1544                          * straight to previously cached array in the
1545                          * chain */
1546
1547                         a = ci->array;
1548                         n -= ci->total;
1549                         t = ci->total;
1550                         last_index = ci->last_index;
1551                 }
1552         }
1553
1554         while (a > 0) {
1555                 uint64_t left, right, k, lp;
1556
1557                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1558                 if (r < 0)
1559                         return r;
1560
1561                 k = journal_file_entry_array_n_items(array);
1562                 right = MIN(k, n);
1563                 if (right <= 0)
1564                         return 0;
1565
1566                 i = right - 1;
1567                 lp = p = le64toh(array->entry_array.items[i]);
1568                 if (p <= 0)
1569                         return -EBADMSG;
1570
1571                 r = test_object(f, p, needle);
1572                 if (r < 0)
1573                         return r;
1574
1575                 if (r == TEST_FOUND)
1576                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1577
1578                 if (r == TEST_RIGHT) {
1579                         left = 0;
1580                         right -= 1;
1581
1582                         if (last_index != (uint64_t) -1) {
1583                                 assert(last_index <= right);
1584
1585                                 /* If we cached the last index we
1586                                  * looked at, let's try to not to jump
1587                                  * too wildly around and see if we can
1588                                  * limit the range to look at early to
1589                                  * the immediate neighbors of the last
1590                                  * index we looked at. */
1591
1592                                 if (last_index > 0) {
1593                                         uint64_t x = last_index - 1;
1594
1595                                         p = le64toh(array->entry_array.items[x]);
1596                                         if (p <= 0)
1597                                                 return -EBADMSG;
1598
1599                                         r = test_object(f, p, needle);
1600                                         if (r < 0)
1601                                                 return r;
1602
1603                                         if (r == TEST_FOUND)
1604                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1605
1606                                         if (r == TEST_RIGHT)
1607                                                 right = x;
1608                                         else
1609                                                 left = x + 1;
1610                                 }
1611
1612                                 if (last_index < right) {
1613                                         uint64_t y = last_index + 1;
1614
1615                                         p = le64toh(array->entry_array.items[y]);
1616                                         if (p <= 0)
1617                                                 return -EBADMSG;
1618
1619                                         r = test_object(f, p, needle);
1620                                         if (r < 0)
1621                                                 return r;
1622
1623                                         if (r == TEST_FOUND)
1624                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1625
1626                                         if (r == TEST_RIGHT)
1627                                                 right = y;
1628                                         else
1629                                                 left = y + 1;
1630                                 }
1631
1632                                 last_index = (uint64_t) -1;
1633                         }
1634
1635                         for (;;) {
1636                                 if (left == right) {
1637                                         if (direction == DIRECTION_UP)
1638                                                 subtract_one = true;
1639
1640                                         i = left;
1641                                         goto found;
1642                                 }
1643
1644                                 assert(left < right);
1645                                 i = (left + right) / 2;
1646
1647                                 p = le64toh(array->entry_array.items[i]);
1648                                 if (p <= 0)
1649                                         return -EBADMSG;
1650
1651                                 r = test_object(f, p, needle);
1652                                 if (r < 0)
1653                                         return r;
1654
1655                                 if (r == TEST_FOUND)
1656                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1657
1658                                 if (r == TEST_RIGHT)
1659                                         right = i;
1660                                 else
1661                                         left = i + 1;
1662                         }
1663                 }
1664
1665                 if (k > n) {
1666                         if (direction == DIRECTION_UP) {
1667                                 i = n;
1668                                 subtract_one = true;
1669                                 goto found;
1670                         }
1671
1672                         return 0;
1673                 }
1674
1675                 last_p = lp;
1676
1677                 n -= k;
1678                 t += k;
1679                 last_index = (uint64_t) -1;
1680                 a = le64toh(array->entry_array.next_entry_array_offset);
1681         }
1682
1683         return 0;
1684
1685 found:
1686         if (subtract_one && t == 0 && i == 0)
1687                 return 0;
1688
1689         /* Let's cache this item for the next invocation */
1690         chain_cache_put(f->chain_cache, ci, first, a, array->entry_array.items[0], t, i + (subtract_one ? -1 : 0));
1691
1692         if (subtract_one && i == 0)
1693                 p = last_p;
1694         else if (subtract_one)
1695                 p = le64toh(array->entry_array.items[i-1]);
1696         else
1697                 p = le64toh(array->entry_array.items[i]);
1698
1699         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1700         if (r < 0)
1701                 return r;
1702
1703         if (ret)
1704                 *ret = o;
1705
1706         if (offset)
1707                 *offset = p;
1708
1709         if (idx)
1710                 *idx = t + i + (subtract_one ? -1 : 0);
1711
1712         return 1;
1713 }
1714
1715
1716 static int generic_array_bisect_plus_one(
1717                 JournalFile *f,
1718                 uint64_t extra,
1719                 uint64_t first,
1720                 uint64_t n,
1721                 uint64_t needle,
1722                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1723                 direction_t direction,
1724                 Object **ret,
1725                 uint64_t *offset,
1726                 uint64_t *idx) {
1727
1728         int r;
1729         bool step_back = false;
1730         Object *o;
1731
1732         assert(f);
1733         assert(test_object);
1734
1735         if (n <= 0)
1736                 return 0;
1737
1738         /* This bisects the array in object 'first', but first checks
1739          * an extra  */
1740         r = test_object(f, extra, needle);
1741         if (r < 0)
1742                 return r;
1743
1744         if (r == TEST_FOUND)
1745                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1746
1747         /* if we are looking with DIRECTION_UP then we need to first
1748            see if in the actual array there is a matching entry, and
1749            return the last one of that. But if there isn't any we need
1750            to return this one. Hence remember this, and return it
1751            below. */
1752         if (r == TEST_LEFT)
1753                 step_back = direction == DIRECTION_UP;
1754
1755         if (r == TEST_RIGHT) {
1756                 if (direction == DIRECTION_DOWN)
1757                         goto found;
1758                 else
1759                         return 0;
1760         }
1761
1762         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1763
1764         if (r == 0 && step_back)
1765                 goto found;
1766
1767         if (r > 0 && idx)
1768                 (*idx) ++;
1769
1770         return r;
1771
1772 found:
1773         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1774         if (r < 0)
1775                 return r;
1776
1777         if (ret)
1778                 *ret = o;
1779
1780         if (offset)
1781                 *offset = extra;
1782
1783         if (idx)
1784                 *idx = 0;
1785
1786         return 1;
1787 }
1788
1789 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1790         assert(f);
1791         assert(p > 0);
1792
1793         if (p == needle)
1794                 return TEST_FOUND;
1795         else if (p < needle)
1796                 return TEST_LEFT;
1797         else
1798                 return TEST_RIGHT;
1799 }
1800
1801 int journal_file_move_to_entry_by_offset(
1802                 JournalFile *f,
1803                 uint64_t p,
1804                 direction_t direction,
1805                 Object **ret,
1806                 uint64_t *offset) {
1807
1808         return generic_array_bisect(f,
1809                                     le64toh(f->header->entry_array_offset),
1810                                     le64toh(f->header->n_entries),
1811                                     p,
1812                                     test_object_offset,
1813                                     direction,
1814                                     ret, offset, NULL);
1815 }
1816
1817
1818 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1819         Object *o;
1820         int r;
1821
1822         assert(f);
1823         assert(p > 0);
1824
1825         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1826         if (r < 0)
1827                 return r;
1828
1829         if (le64toh(o->entry.seqnum) == needle)
1830                 return TEST_FOUND;
1831         else if (le64toh(o->entry.seqnum) < needle)
1832                 return TEST_LEFT;
1833         else
1834                 return TEST_RIGHT;
1835 }
1836
1837 int journal_file_move_to_entry_by_seqnum(
1838                 JournalFile *f,
1839                 uint64_t seqnum,
1840                 direction_t direction,
1841                 Object **ret,
1842                 uint64_t *offset) {
1843
1844         return generic_array_bisect(f,
1845                                     le64toh(f->header->entry_array_offset),
1846                                     le64toh(f->header->n_entries),
1847                                     seqnum,
1848                                     test_object_seqnum,
1849                                     direction,
1850                                     ret, offset, NULL);
1851 }
1852
1853 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1854         Object *o;
1855         int r;
1856
1857         assert(f);
1858         assert(p > 0);
1859
1860         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1861         if (r < 0)
1862                 return r;
1863
1864         if (le64toh(o->entry.realtime) == needle)
1865                 return TEST_FOUND;
1866         else if (le64toh(o->entry.realtime) < needle)
1867                 return TEST_LEFT;
1868         else
1869                 return TEST_RIGHT;
1870 }
1871
1872 int journal_file_move_to_entry_by_realtime(
1873                 JournalFile *f,
1874                 uint64_t realtime,
1875                 direction_t direction,
1876                 Object **ret,
1877                 uint64_t *offset) {
1878
1879         return generic_array_bisect(f,
1880                                     le64toh(f->header->entry_array_offset),
1881                                     le64toh(f->header->n_entries),
1882                                     realtime,
1883                                     test_object_realtime,
1884                                     direction,
1885                                     ret, offset, NULL);
1886 }
1887
1888 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1889         Object *o;
1890         int r;
1891
1892         assert(f);
1893         assert(p > 0);
1894
1895         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1896         if (r < 0)
1897                 return r;
1898
1899         if (le64toh(o->entry.monotonic) == needle)
1900                 return TEST_FOUND;
1901         else if (le64toh(o->entry.monotonic) < needle)
1902                 return TEST_LEFT;
1903         else
1904                 return TEST_RIGHT;
1905 }
1906
1907 static inline int find_data_object_by_boot_id(
1908                 JournalFile *f,
1909                 sd_id128_t boot_id,
1910                 Object **o,
1911                 uint64_t *b) {
1912         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1913
1914         sd_id128_to_string(boot_id, t + 9);
1915         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1916 }
1917
1918 int journal_file_move_to_entry_by_monotonic(
1919                 JournalFile *f,
1920                 sd_id128_t boot_id,
1921                 uint64_t monotonic,
1922                 direction_t direction,
1923                 Object **ret,
1924                 uint64_t *offset) {
1925
1926         Object *o;
1927         int r;
1928
1929         assert(f);
1930
1931         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1932         if (r < 0)
1933                 return r;
1934         if (r == 0)
1935                 return -ENOENT;
1936
1937         return generic_array_bisect_plus_one(f,
1938                                              le64toh(o->data.entry_offset),
1939                                              le64toh(o->data.entry_array_offset),
1940                                              le64toh(o->data.n_entries),
1941                                              monotonic,
1942                                              test_object_monotonic,
1943                                              direction,
1944                                              ret, offset, NULL);
1945 }
1946
1947 int journal_file_next_entry(
1948                 JournalFile *f,
1949                 Object *o, uint64_t p,
1950                 direction_t direction,
1951                 Object **ret, uint64_t *offset) {
1952
1953         uint64_t i, n;
1954         int r;
1955
1956         assert(f);
1957         assert(p > 0 || !o);
1958
1959         n = le64toh(f->header->n_entries);
1960         if (n <= 0)
1961                 return 0;
1962
1963         if (!o)
1964                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1965         else {
1966                 if (o->object.type != OBJECT_ENTRY)
1967                         return -EINVAL;
1968
1969                 r = generic_array_bisect(f,
1970                                          le64toh(f->header->entry_array_offset),
1971                                          le64toh(f->header->n_entries),
1972                                          p,
1973                                          test_object_offset,
1974                                          DIRECTION_DOWN,
1975                                          NULL, NULL,
1976                                          &i);
1977                 if (r <= 0)
1978                         return r;
1979
1980                 if (direction == DIRECTION_DOWN) {
1981                         if (i >= n - 1)
1982                                 return 0;
1983
1984                         i++;
1985                 } else {
1986                         if (i <= 0)
1987                                 return 0;
1988
1989                         i--;
1990                 }
1991         }
1992
1993         /* And jump to it */
1994         return generic_array_get(f,
1995                                  le64toh(f->header->entry_array_offset),
1996                                  i,
1997                                  ret, offset);
1998 }
1999
2000 int journal_file_skip_entry(
2001                 JournalFile *f,
2002                 Object *o, uint64_t p,
2003                 int64_t skip,
2004                 Object **ret, uint64_t *offset) {
2005
2006         uint64_t i, n;
2007         int r;
2008
2009         assert(f);
2010         assert(o);
2011         assert(p > 0);
2012
2013         if (o->object.type != OBJECT_ENTRY)
2014                 return -EINVAL;
2015
2016         r = generic_array_bisect(f,
2017                                  le64toh(f->header->entry_array_offset),
2018                                  le64toh(f->header->n_entries),
2019                                  p,
2020                                  test_object_offset,
2021                                  DIRECTION_DOWN,
2022                                  NULL, NULL,
2023                                  &i);
2024         if (r <= 0)
2025                 return r;
2026
2027         /* Calculate new index */
2028         if (skip < 0) {
2029                 if ((uint64_t) -skip >= i)
2030                         i = 0;
2031                 else
2032                         i = i - (uint64_t) -skip;
2033         } else
2034                 i  += (uint64_t) skip;
2035
2036         n = le64toh(f->header->n_entries);
2037         if (n <= 0)
2038                 return -EBADMSG;
2039
2040         if (i >= n)
2041                 i = n-1;
2042
2043         return generic_array_get(f,
2044                                  le64toh(f->header->entry_array_offset),
2045                                  i,
2046                                  ret, offset);
2047 }
2048
2049 int journal_file_next_entry_for_data(
2050                 JournalFile *f,
2051                 Object *o, uint64_t p,
2052                 uint64_t data_offset,
2053                 direction_t direction,
2054                 Object **ret, uint64_t *offset) {
2055
2056         uint64_t n, i;
2057         int r;
2058         Object *d;
2059
2060         assert(f);
2061         assert(p > 0 || !o);
2062
2063         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2064         if (r < 0)
2065                 return r;
2066
2067         n = le64toh(d->data.n_entries);
2068         if (n <= 0)
2069                 return n;
2070
2071         if (!o)
2072                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2073         else {
2074                 if (o->object.type != OBJECT_ENTRY)
2075                         return -EINVAL;
2076
2077                 r = generic_array_bisect_plus_one(f,
2078                                                   le64toh(d->data.entry_offset),
2079                                                   le64toh(d->data.entry_array_offset),
2080                                                   le64toh(d->data.n_entries),
2081                                                   p,
2082                                                   test_object_offset,
2083                                                   DIRECTION_DOWN,
2084                                                   NULL, NULL,
2085                                                   &i);
2086
2087                 if (r <= 0)
2088                         return r;
2089
2090                 if (direction == DIRECTION_DOWN) {
2091                         if (i >= n - 1)
2092                                 return 0;
2093
2094                         i++;
2095                 } else {
2096                         if (i <= 0)
2097                                 return 0;
2098
2099                         i--;
2100                 }
2101
2102         }
2103
2104         return generic_array_get_plus_one(f,
2105                                           le64toh(d->data.entry_offset),
2106                                           le64toh(d->data.entry_array_offset),
2107                                           i,
2108                                           ret, offset);
2109 }
2110
2111 int journal_file_move_to_entry_by_offset_for_data(
2112                 JournalFile *f,
2113                 uint64_t data_offset,
2114                 uint64_t p,
2115                 direction_t direction,
2116                 Object **ret, uint64_t *offset) {
2117
2118         int r;
2119         Object *d;
2120
2121         assert(f);
2122
2123         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2124         if (r < 0)
2125                 return r;
2126
2127         return generic_array_bisect_plus_one(f,
2128                                              le64toh(d->data.entry_offset),
2129                                              le64toh(d->data.entry_array_offset),
2130                                              le64toh(d->data.n_entries),
2131                                              p,
2132                                              test_object_offset,
2133                                              direction,
2134                                              ret, offset, NULL);
2135 }
2136
2137 int journal_file_move_to_entry_by_monotonic_for_data(
2138                 JournalFile *f,
2139                 uint64_t data_offset,
2140                 sd_id128_t boot_id,
2141                 uint64_t monotonic,
2142                 direction_t direction,
2143                 Object **ret, uint64_t *offset) {
2144
2145         Object *o, *d;
2146         int r;
2147         uint64_t b, z;
2148
2149         assert(f);
2150
2151         /* First, seek by time */
2152         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2153         if (r < 0)
2154                 return r;
2155         if (r == 0)
2156                 return -ENOENT;
2157
2158         r = generic_array_bisect_plus_one(f,
2159                                           le64toh(o->data.entry_offset),
2160                                           le64toh(o->data.entry_array_offset),
2161                                           le64toh(o->data.n_entries),
2162                                           monotonic,
2163                                           test_object_monotonic,
2164                                           direction,
2165                                           NULL, &z, NULL);
2166         if (r <= 0)
2167                 return r;
2168
2169         /* And now, continue seeking until we find an entry that
2170          * exists in both bisection arrays */
2171
2172         for (;;) {
2173                 Object *qo;
2174                 uint64_t p, q;
2175
2176                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2177                 if (r < 0)
2178                         return r;
2179
2180                 r = generic_array_bisect_plus_one(f,
2181                                                   le64toh(d->data.entry_offset),
2182                                                   le64toh(d->data.entry_array_offset),
2183                                                   le64toh(d->data.n_entries),
2184                                                   z,
2185                                                   test_object_offset,
2186                                                   direction,
2187                                                   NULL, &p, NULL);
2188                 if (r <= 0)
2189                         return r;
2190
2191                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2192                 if (r < 0)
2193                         return r;
2194
2195                 r = generic_array_bisect_plus_one(f,
2196                                                   le64toh(o->data.entry_offset),
2197                                                   le64toh(o->data.entry_array_offset),
2198                                                   le64toh(o->data.n_entries),
2199                                                   p,
2200                                                   test_object_offset,
2201                                                   direction,
2202                                                   &qo, &q, NULL);
2203
2204                 if (r <= 0)
2205                         return r;
2206
2207                 if (p == q) {
2208                         if (ret)
2209                                 *ret = qo;
2210                         if (offset)
2211                                 *offset = q;
2212
2213                         return 1;
2214                 }
2215
2216                 z = q;
2217         }
2218
2219         return 0;
2220 }
2221
2222 int journal_file_move_to_entry_by_seqnum_for_data(
2223                 JournalFile *f,
2224                 uint64_t data_offset,
2225                 uint64_t seqnum,
2226                 direction_t direction,
2227                 Object **ret, uint64_t *offset) {
2228
2229         Object *d;
2230         int r;
2231
2232         assert(f);
2233
2234         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2235         if (r < 0)
2236                 return r;
2237
2238         return generic_array_bisect_plus_one(f,
2239                                              le64toh(d->data.entry_offset),
2240                                              le64toh(d->data.entry_array_offset),
2241                                              le64toh(d->data.n_entries),
2242                                              seqnum,
2243                                              test_object_seqnum,
2244                                              direction,
2245                                              ret, offset, NULL);
2246 }
2247
2248 int journal_file_move_to_entry_by_realtime_for_data(
2249                 JournalFile *f,
2250                 uint64_t data_offset,
2251                 uint64_t realtime,
2252                 direction_t direction,
2253                 Object **ret, uint64_t *offset) {
2254
2255         Object *d;
2256         int r;
2257
2258         assert(f);
2259
2260         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2261         if (r < 0)
2262                 return r;
2263
2264         return generic_array_bisect_plus_one(f,
2265                                              le64toh(d->data.entry_offset),
2266                                              le64toh(d->data.entry_array_offset),
2267                                              le64toh(d->data.n_entries),
2268                                              realtime,
2269                                              test_object_realtime,
2270                                              direction,
2271                                              ret, offset, NULL);
2272 }
2273
2274 void journal_file_dump(JournalFile *f) {
2275         Object *o;
2276         int r;
2277         uint64_t p;
2278
2279         assert(f);
2280
2281         journal_file_print_header(f);
2282
2283         p = le64toh(f->header->header_size);
2284         while (p != 0) {
2285                 r = journal_file_move_to_object(f, -1, p, &o);
2286                 if (r < 0)
2287                         goto fail;
2288
2289                 switch (o->object.type) {
2290
2291                 case OBJECT_UNUSED:
2292                         printf("Type: OBJECT_UNUSED\n");
2293                         break;
2294
2295                 case OBJECT_DATA:
2296                         printf("Type: OBJECT_DATA\n");
2297                         break;
2298
2299                 case OBJECT_FIELD:
2300                         printf("Type: OBJECT_FIELD\n");
2301                         break;
2302
2303                 case OBJECT_ENTRY:
2304                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2305                                le64toh(o->entry.seqnum),
2306                                le64toh(o->entry.monotonic),
2307                                le64toh(o->entry.realtime));
2308                         break;
2309
2310                 case OBJECT_FIELD_HASH_TABLE:
2311                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2312                         break;
2313
2314                 case OBJECT_DATA_HASH_TABLE:
2315                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2316                         break;
2317
2318                 case OBJECT_ENTRY_ARRAY:
2319                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2320                         break;
2321
2322                 case OBJECT_TAG:
2323                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2324                                le64toh(o->tag.seqnum),
2325                                le64toh(o->tag.epoch));
2326                         break;
2327
2328                 default:
2329                         printf("Type: unknown (%u)\n", o->object.type);
2330                         break;
2331                 }
2332
2333                 if (o->object.flags & OBJECT_COMPRESSED)
2334                         printf("Flags: COMPRESSED\n");
2335
2336                 if (p == le64toh(f->header->tail_object_offset))
2337                         p = 0;
2338                 else
2339                         p = p + ALIGN64(le64toh(o->object.size));
2340         }
2341
2342         return;
2343 fail:
2344         log_error("File corrupt");
2345 }
2346
2347 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2348         const char *x;
2349
2350         x = format_timestamp(buf, l, t);
2351         if (x)
2352                 return x;
2353         return " --- ";
2354 }
2355
2356 void journal_file_print_header(JournalFile *f) {
2357         char a[33], b[33], c[33], d[33];
2358         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2359         struct stat st;
2360         char bytes[FORMAT_BYTES_MAX];
2361
2362         assert(f);
2363
2364         printf("File Path: %s\n"
2365                "File ID: %s\n"
2366                "Machine ID: %s\n"
2367                "Boot ID: %s\n"
2368                "Sequential Number ID: %s\n"
2369                "State: %s\n"
2370                "Compatible Flags:%s%s\n"
2371                "Incompatible Flags:%s%s\n"
2372                "Header size: %"PRIu64"\n"
2373                "Arena size: %"PRIu64"\n"
2374                "Data Hash Table Size: %"PRIu64"\n"
2375                "Field Hash Table Size: %"PRIu64"\n"
2376                "Rotate Suggested: %s\n"
2377                "Head Sequential Number: %"PRIu64"\n"
2378                "Tail Sequential Number: %"PRIu64"\n"
2379                "Head Realtime Timestamp: %s\n"
2380                "Tail Realtime Timestamp: %s\n"
2381                "Tail Monotonic Timestamp: %s\n"
2382                "Objects: %"PRIu64"\n"
2383                "Entry Objects: %"PRIu64"\n",
2384                f->path,
2385                sd_id128_to_string(f->header->file_id, a),
2386                sd_id128_to_string(f->header->machine_id, b),
2387                sd_id128_to_string(f->header->boot_id, c),
2388                sd_id128_to_string(f->header->seqnum_id, d),
2389                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2390                f->header->state == STATE_ONLINE ? "ONLINE" :
2391                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2392                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2393                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2394                JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2395                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2396                le64toh(f->header->header_size),
2397                le64toh(f->header->arena_size),
2398                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2399                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2400                yes_no(journal_file_rotate_suggested(f, 0)),
2401                le64toh(f->header->head_entry_seqnum),
2402                le64toh(f->header->tail_entry_seqnum),
2403                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2404                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2405                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2406                le64toh(f->header->n_objects),
2407                le64toh(f->header->n_entries));
2408
2409         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2410                 printf("Data Objects: %"PRIu64"\n"
2411                        "Data Hash Table Fill: %.1f%%\n",
2412                        le64toh(f->header->n_data),
2413                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2414
2415         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2416                 printf("Field Objects: %"PRIu64"\n"
2417                        "Field Hash Table Fill: %.1f%%\n",
2418                        le64toh(f->header->n_fields),
2419                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2420
2421         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2422                 printf("Tag Objects: %"PRIu64"\n",
2423                        le64toh(f->header->n_tags));
2424         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2425                 printf("Entry Array Objects: %"PRIu64"\n",
2426                        le64toh(f->header->n_entry_arrays));
2427
2428         if (fstat(f->fd, &st) >= 0)
2429                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2430 }
2431
2432 int journal_file_open(
2433                 const char *fname,
2434                 int flags,
2435                 mode_t mode,
2436                 bool compress,
2437                 bool seal,
2438                 JournalMetrics *metrics,
2439                 MMapCache *mmap_cache,
2440                 JournalFile *template,
2441                 JournalFile **ret) {
2442
2443         JournalFile *f;
2444         int r;
2445         bool newly_created = false;
2446
2447         assert(fname);
2448         assert(ret);
2449
2450         if ((flags & O_ACCMODE) != O_RDONLY &&
2451             (flags & O_ACCMODE) != O_RDWR)
2452                 return -EINVAL;
2453
2454         if (!endswith(fname, ".journal") &&
2455             !endswith(fname, ".journal~"))
2456                 return -EINVAL;
2457
2458         f = new0(JournalFile, 1);
2459         if (!f)
2460                 return -ENOMEM;
2461
2462         f->fd = -1;
2463         f->mode = mode;
2464
2465         f->flags = flags;
2466         f->prot = prot_from_flags(flags);
2467         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2468 #ifdef HAVE_XZ
2469         f->compress = compress;
2470 #endif
2471 #ifdef HAVE_GCRYPT
2472         f->seal = seal;
2473 #endif
2474
2475         if (mmap_cache)
2476                 f->mmap = mmap_cache_ref(mmap_cache);
2477         else {
2478                 f->mmap = mmap_cache_new();
2479                 if (!f->mmap) {
2480                         r = -ENOMEM;
2481                         goto fail;
2482                 }
2483         }
2484
2485         f->path = strdup(fname);
2486         if (!f->path) {
2487                 r = -ENOMEM;
2488                 goto fail;
2489         }
2490
2491         f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2492         if (!f->chain_cache) {
2493                 r = -ENOMEM;
2494                 goto fail;
2495         }
2496
2497         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2498         if (f->fd < 0) {
2499                 r = -errno;
2500                 goto fail;
2501         }
2502
2503         if (fstat(f->fd, &f->last_stat) < 0) {
2504                 r = -errno;
2505                 goto fail;
2506         }
2507
2508         if (f->last_stat.st_size == 0 && f->writable) {
2509 #ifdef HAVE_XATTR
2510                 uint64_t crtime;
2511
2512                 /* Let's attach the creation time to the journal file,
2513                  * so that the vacuuming code knows the age of this
2514                  * file even if the file might end up corrupted one
2515                  * day... Ideally we'd just use the creation time many
2516                  * file systems maintain for each file, but there is
2517                  * currently no usable API to query this, hence let's
2518                  * emulate this via extended attributes. If extended
2519                  * attributes are not supported we'll just skip this,
2520                  * and rely solely on mtime/atime/ctime of the file.*/
2521
2522                 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2523                 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2524 #endif
2525
2526 #ifdef HAVE_GCRYPT
2527                 /* Try to load the FSPRG state, and if we can't, then
2528                  * just don't do sealing */
2529                 if (f->seal) {
2530                         r = journal_file_fss_load(f);
2531                         if (r < 0)
2532                                 f->seal = false;
2533                 }
2534 #endif
2535
2536                 r = journal_file_init_header(f, template);
2537                 if (r < 0)
2538                         goto fail;
2539
2540                 if (fstat(f->fd, &f->last_stat) < 0) {
2541                         r = -errno;
2542                         goto fail;
2543                 }
2544
2545                 newly_created = true;
2546         }
2547
2548         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2549                 r = -EIO;
2550                 goto fail;
2551         }
2552
2553         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2554         if (f->header == MAP_FAILED) {
2555                 f->header = NULL;
2556                 r = -errno;
2557                 goto fail;
2558         }
2559
2560         if (!newly_created) {
2561                 r = journal_file_verify_header(f);
2562                 if (r < 0)
2563                         goto fail;
2564         }
2565
2566 #ifdef HAVE_GCRYPT
2567         if (!newly_created && f->writable) {
2568                 r = journal_file_fss_load(f);
2569                 if (r < 0)
2570                         goto fail;
2571         }
2572 #endif
2573
2574         if (f->writable) {
2575                 if (metrics) {
2576                         journal_default_metrics(metrics, f->fd);
2577                         f->metrics = *metrics;
2578                 } else if (template)
2579                         f->metrics = template->metrics;
2580
2581                 r = journal_file_refresh_header(f);
2582                 if (r < 0)
2583                         goto fail;
2584         }
2585
2586 #ifdef HAVE_GCRYPT
2587         r = journal_file_hmac_setup(f);
2588         if (r < 0)
2589                 goto fail;
2590 #endif
2591
2592         if (newly_created) {
2593                 r = journal_file_setup_field_hash_table(f);
2594                 if (r < 0)
2595                         goto fail;
2596
2597                 r = journal_file_setup_data_hash_table(f);
2598                 if (r < 0)
2599                         goto fail;
2600
2601 #ifdef HAVE_GCRYPT
2602                 r = journal_file_append_first_tag(f);
2603                 if (r < 0)
2604                         goto fail;
2605 #endif
2606         }
2607
2608         r = journal_file_map_field_hash_table(f);
2609         if (r < 0)
2610                 goto fail;
2611
2612         r = journal_file_map_data_hash_table(f);
2613         if (r < 0)
2614                 goto fail;
2615
2616         *ret = f;
2617         return 0;
2618
2619 fail:
2620         journal_file_close(f);
2621
2622         return r;
2623 }
2624
2625 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2626         _cleanup_free_ char *p = NULL;
2627         size_t l;
2628         JournalFile *old_file, *new_file = NULL;
2629         int r;
2630
2631         assert(f);
2632         assert(*f);
2633
2634         old_file = *f;
2635
2636         if (!old_file->writable)
2637                 return -EINVAL;
2638
2639         if (!endswith(old_file->path, ".journal"))
2640                 return -EINVAL;
2641
2642         l = strlen(old_file->path);
2643         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2644                      (int) l - 8, old_file->path,
2645                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2646                      le64toh((*f)->header->head_entry_seqnum),
2647                      le64toh((*f)->header->head_entry_realtime));
2648         if (r < 0)
2649                 return -ENOMEM;
2650
2651         r = rename(old_file->path, p);
2652         if (r < 0)
2653                 return -errno;
2654
2655         old_file->header->state = STATE_ARCHIVED;
2656
2657         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2658         journal_file_close(old_file);
2659
2660         *f = new_file;
2661         return r;
2662 }
2663
2664 int journal_file_open_reliably(
2665                 const char *fname,
2666                 int flags,
2667                 mode_t mode,
2668                 bool compress,
2669                 bool seal,
2670                 JournalMetrics *metrics,
2671                 MMapCache *mmap_cache,
2672                 JournalFile *template,
2673                 JournalFile **ret) {
2674
2675         int r;
2676         size_t l;
2677         _cleanup_free_ char *p = NULL;
2678
2679         r = journal_file_open(fname, flags, mode, compress, seal,
2680                               metrics, mmap_cache, template, ret);
2681         if (r != -EBADMSG && /* corrupted */
2682             r != -ENODATA && /* truncated */
2683             r != -EHOSTDOWN && /* other machine */
2684             r != -EPROTONOSUPPORT && /* incompatible feature */
2685             r != -EBUSY && /* unclean shutdown */
2686             r != -ESHUTDOWN /* already archived */)
2687                 return r;
2688
2689         if ((flags & O_ACCMODE) == O_RDONLY)
2690                 return r;
2691
2692         if (!(flags & O_CREAT))
2693                 return r;
2694
2695         if (!endswith(fname, ".journal"))
2696                 return r;
2697
2698         /* The file is corrupted. Rotate it away and try it again (but only once) */
2699
2700         l = strlen(fname);
2701         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2702                      (int) l - 8, fname,
2703                      (unsigned long long) now(CLOCK_REALTIME),
2704                      random_ull()) < 0)
2705                 return -ENOMEM;
2706
2707         r = rename(fname, p);
2708         if (r < 0)
2709                 return -errno;
2710
2711         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2712
2713         return journal_file_open(fname, flags, mode, compress, seal,
2714                                  metrics, mmap_cache, template, ret);
2715 }
2716
2717 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2718         uint64_t i, n;
2719         uint64_t q, xor_hash = 0;
2720         int r;
2721         EntryItem *items;
2722         dual_timestamp ts;
2723
2724         assert(from);
2725         assert(to);
2726         assert(o);
2727         assert(p);
2728
2729         if (!to->writable)
2730                 return -EPERM;
2731
2732         ts.monotonic = le64toh(o->entry.monotonic);
2733         ts.realtime = le64toh(o->entry.realtime);
2734
2735         if (to->tail_entry_monotonic_valid &&
2736             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2737                 return -EINVAL;
2738
2739         n = journal_file_entry_n_items(o);
2740         items = alloca(sizeof(EntryItem) * n);
2741
2742         for (i = 0; i < n; i++) {
2743                 uint64_t l, h;
2744                 le64_t le_hash;
2745                 size_t t;
2746                 void *data;
2747                 Object *u;
2748
2749                 q = le64toh(o->entry.items[i].object_offset);
2750                 le_hash = o->entry.items[i].hash;
2751
2752                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2753                 if (r < 0)
2754                         return r;
2755
2756                 if (le_hash != o->data.hash)
2757                         return -EBADMSG;
2758
2759                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2760                 t = (size_t) l;
2761
2762                 /* We hit the limit on 32bit machines */
2763                 if ((uint64_t) t != l)
2764                         return -E2BIG;
2765
2766                 if (o->object.flags & OBJECT_COMPRESSED) {
2767 #ifdef HAVE_XZ
2768                         uint64_t rsize;
2769
2770                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0))
2771                                 return -EBADMSG;
2772
2773                         data = from->compress_buffer;
2774                         l = rsize;
2775 #else
2776                         return -EPROTONOSUPPORT;
2777 #endif
2778                 } else
2779                         data = o->data.payload;
2780
2781                 r = journal_file_append_data(to, data, l, &u, &h);
2782                 if (r < 0)
2783                         return r;
2784
2785                 xor_hash ^= le64toh(u->data.hash);
2786                 items[i].object_offset = htole64(h);
2787                 items[i].hash = u->data.hash;
2788
2789                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2790                 if (r < 0)
2791                         return r;
2792         }
2793
2794         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2795 }
2796
2797 void journal_default_metrics(JournalMetrics *m, int fd) {
2798         uint64_t fs_size = 0;
2799         struct statvfs ss;
2800         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2801
2802         assert(m);
2803         assert(fd >= 0);
2804
2805         if (fstatvfs(fd, &ss) >= 0)
2806                 fs_size = ss.f_frsize * ss.f_blocks;
2807
2808         if (m->max_use == (uint64_t) -1) {
2809
2810                 if (fs_size > 0) {
2811                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2812
2813                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2814                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2815
2816                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2817                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2818                 } else
2819                         m->max_use = DEFAULT_MAX_USE_LOWER;
2820         } else {
2821                 m->max_use = PAGE_ALIGN(m->max_use);
2822
2823                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2824                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2825         }
2826
2827         if (m->max_size == (uint64_t) -1) {
2828                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2829
2830                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2831                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2832         } else
2833                 m->max_size = PAGE_ALIGN(m->max_size);
2834
2835         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2836                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2837
2838         if (m->max_size*2 > m->max_use)
2839                 m->max_use = m->max_size*2;
2840
2841         if (m->min_size == (uint64_t) -1)
2842                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2843         else {
2844                 m->min_size = PAGE_ALIGN(m->min_size);
2845
2846                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2847                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2848
2849                 if (m->min_size > m->max_size)
2850                         m->max_size = m->min_size;
2851         }
2852
2853         if (m->keep_free == (uint64_t) -1) {
2854
2855                 if (fs_size > 0) {
2856                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2857
2858                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2859                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2860
2861                 } else
2862                         m->keep_free = DEFAULT_KEEP_FREE;
2863         }
2864
2865         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2866                   format_bytes(a, sizeof(a), m->max_use),
2867                   format_bytes(b, sizeof(b), m->max_size),
2868                   format_bytes(c, sizeof(c), m->min_size),
2869                   format_bytes(d, sizeof(d), m->keep_free));
2870 }
2871
2872 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2873         assert(f);
2874         assert(from || to);
2875
2876         if (from) {
2877                 if (f->header->head_entry_realtime == 0)
2878                         return -ENOENT;
2879
2880                 *from = le64toh(f->header->head_entry_realtime);
2881         }
2882
2883         if (to) {
2884                 if (f->header->tail_entry_realtime == 0)
2885                         return -ENOENT;
2886
2887                 *to = le64toh(f->header->tail_entry_realtime);
2888         }
2889
2890         return 1;
2891 }
2892
2893 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2894         Object *o;
2895         uint64_t p;
2896         int r;
2897
2898         assert(f);
2899         assert(from || to);
2900
2901         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2902         if (r <= 0)
2903                 return r;
2904
2905         if (le64toh(o->data.n_entries) <= 0)
2906                 return 0;
2907
2908         if (from) {
2909                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2910                 if (r < 0)
2911                         return r;
2912
2913                 *from = le64toh(o->entry.monotonic);
2914         }
2915
2916         if (to) {
2917                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2918                 if (r < 0)
2919                         return r;
2920
2921                 r = generic_array_get_plus_one(f,
2922                                                le64toh(o->data.entry_offset),
2923                                                le64toh(o->data.entry_array_offset),
2924                                                le64toh(o->data.n_entries)-1,
2925                                                &o, NULL);
2926                 if (r <= 0)
2927                         return r;
2928
2929                 *to = le64toh(o->entry.monotonic);
2930         }
2931
2932         return 1;
2933 }
2934
2935 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2936         assert(f);
2937
2938         /* If we gained new header fields we gained new features,
2939          * hence suggest a rotation */
2940         if (le64toh(f->header->header_size) < sizeof(Header)) {
2941                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2942                 return true;
2943         }
2944
2945         /* Let's check if the hash tables grew over a certain fill
2946          * level (75%, borrowing this value from Java's hash table
2947          * implementation), and if so suggest a rotation. To calculate
2948          * the fill level we need the n_data field, which only exists
2949          * in newer versions. */
2950
2951         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2952                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2953                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2954                                   f->path,
2955                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2956                                   le64toh(f->header->n_data),
2957                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2958                                   (unsigned long long) f->last_stat.st_size,
2959                                   f->last_stat.st_size / le64toh(f->header->n_data));
2960                         return true;
2961                 }
2962
2963         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2964                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2965                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2966                                   f->path,
2967                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2968                                   le64toh(f->header->n_fields),
2969                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2970                         return true;
2971                 }
2972
2973         /* Are the data objects properly indexed by field objects? */
2974         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2975             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2976             le64toh(f->header->n_data) > 0 &&
2977             le64toh(f->header->n_fields) == 0)
2978                 return true;
2979
2980         if (max_file_usec > 0) {
2981                 usec_t t, h;
2982
2983                 h = le64toh(f->header->head_entry_realtime);
2984                 t = now(CLOCK_REALTIME);
2985
2986                 if (h > 0 && t > h + max_file_usec)
2987                         return true;
2988         }
2989
2990         return false;
2991 }