chiark / gitweb /
journal: assume that next entry is after previous entry
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #ifdef HAVE_XATTR
31 #include <attr/xattr.h>
32 #endif
33
34 #include "journal-def.h"
35 #include "journal-file.h"
36 #include "journal-authenticate.h"
37 #include "lookup3.h"
38 #include "compress.h"
39 #include "fsprg.h"
40
41 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
42 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
43
44 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
45
46 /* This is the minimum journal file size */
47 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL)           /* 4 MiB */
48
49 /* These are the lower and upper bounds if we deduce the max_use value
50  * from the file system size */
51 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
52 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
53
54 /* This is the upper bound if we deduce max_size from max_use */
55 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
56
57 /* This is the upper bound if we deduce the keep_free value from the
58  * file system size */
59 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
60
61 /* This is the keep_free value when we can't determine the system
62  * size */
63 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MB */
64
65 /* n_data was the first entry we added after the initial file format design */
66 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
67
68 /* How many entries to keep in the entry array chain cache at max */
69 #define CHAIN_CACHE_MAX 20
70
71 /* How much to increase the journal file size at once each time we allocate something new. */
72 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8MB */
73
74 static int journal_file_set_online(JournalFile *f) {
75         assert(f);
76
77         if (!f->writable)
78                 return -EPERM;
79
80         if (!(f->fd >= 0 && f->header))
81                 return -EINVAL;
82
83         switch(f->header->state) {
84                 case STATE_ONLINE:
85                         return 0;
86
87                 case STATE_OFFLINE:
88                         f->header->state = STATE_ONLINE;
89                         fsync(f->fd);
90                         return 0;
91
92                 default:
93                         return -EINVAL;
94         }
95 }
96
97 int journal_file_set_offline(JournalFile *f) {
98         assert(f);
99
100         if (!f->writable)
101                 return -EPERM;
102
103         if (!(f->fd >= 0 && f->header))
104                 return -EINVAL;
105
106         if (f->header->state != STATE_ONLINE)
107                 return 0;
108
109         fsync(f->fd);
110
111         f->header->state = STATE_OFFLINE;
112
113         fsync(f->fd);
114
115         return 0;
116 }
117
118 void journal_file_close(JournalFile *f) {
119         assert(f);
120
121 #ifdef HAVE_GCRYPT
122         /* Write the final tag */
123         if (f->seal && f->writable)
124                 journal_file_append_tag(f);
125 #endif
126
127         /* Sync everything to disk, before we mark the file offline */
128         if (f->mmap && f->fd >= 0)
129                 mmap_cache_close_fd(f->mmap, f->fd);
130
131         journal_file_set_offline(f);
132
133         if (f->header)
134                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
135
136         if (f->fd >= 0)
137                 close_nointr_nofail(f->fd);
138
139         free(f->path);
140
141         if (f->mmap)
142                 mmap_cache_unref(f->mmap);
143
144         hashmap_free_free(f->chain_cache);
145
146 #ifdef HAVE_XZ
147         free(f->compress_buffer);
148 #endif
149
150 #ifdef HAVE_GCRYPT
151         if (f->fss_file)
152                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
153         else if (f->fsprg_state)
154                 free(f->fsprg_state);
155
156         free(f->fsprg_seed);
157
158         if (f->hmac)
159                 gcry_md_close(f->hmac);
160 #endif
161
162         free(f);
163 }
164
165 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
166         Header h;
167         ssize_t k;
168         int r;
169
170         assert(f);
171
172         zero(h);
173         memcpy(h.signature, HEADER_SIGNATURE, 8);
174         h.header_size = htole64(ALIGN64(sizeof(h)));
175
176         h.incompatible_flags =
177                 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
178
179         h.compatible_flags =
180                 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
181
182         r = sd_id128_randomize(&h.file_id);
183         if (r < 0)
184                 return r;
185
186         if (template) {
187                 h.seqnum_id = template->header->seqnum_id;
188                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
189         } else
190                 h.seqnum_id = h.file_id;
191
192         k = pwrite(f->fd, &h, sizeof(h), 0);
193         if (k < 0)
194                 return -errno;
195
196         if (k != sizeof(h))
197                 return -EIO;
198
199         return 0;
200 }
201
202 static int journal_file_refresh_header(JournalFile *f) {
203         int r;
204         sd_id128_t boot_id;
205
206         assert(f);
207
208         r = sd_id128_get_machine(&f->header->machine_id);
209         if (r < 0)
210                 return r;
211
212         r = sd_id128_get_boot(&boot_id);
213         if (r < 0)
214                 return r;
215
216         if (sd_id128_equal(boot_id, f->header->boot_id))
217                 f->tail_entry_monotonic_valid = true;
218
219         f->header->boot_id = boot_id;
220
221         journal_file_set_online(f);
222
223         /* Sync the online state to disk */
224         fsync(f->fd);
225
226         return 0;
227 }
228
229 static int journal_file_verify_header(JournalFile *f) {
230         assert(f);
231
232         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
233                 return -EBADMSG;
234
235         /* In both read and write mode we refuse to open files with
236          * incompatible flags we don't know */
237 #ifdef HAVE_XZ
238         if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
239                 return -EPROTONOSUPPORT;
240 #else
241         if (f->header->incompatible_flags != 0)
242                 return -EPROTONOSUPPORT;
243 #endif
244
245         /* When open for writing we refuse to open files with
246          * compatible flags, too */
247         if (f->writable) {
248 #ifdef HAVE_GCRYPT
249                 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
250                         return -EPROTONOSUPPORT;
251 #else
252                 if (f->header->compatible_flags != 0)
253                         return -EPROTONOSUPPORT;
254 #endif
255         }
256
257         if (f->header->state >= _STATE_MAX)
258                 return -EBADMSG;
259
260         /* The first addition was n_data, so check that we are at least this large */
261         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
262                 return -EBADMSG;
263
264         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
265                 return -EBADMSG;
266
267         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
268                 return -ENODATA;
269
270         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
271                 return -ENODATA;
272
273         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
274             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
275             !VALID64(le64toh(f->header->tail_object_offset)) ||
276             !VALID64(le64toh(f->header->entry_array_offset)))
277                 return -ENODATA;
278
279         if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
280             le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
281             le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
282             le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
283                 return -ENODATA;
284
285         if (f->writable) {
286                 uint8_t state;
287                 sd_id128_t machine_id;
288                 int r;
289
290                 r = sd_id128_get_machine(&machine_id);
291                 if (r < 0)
292                         return r;
293
294                 if (!sd_id128_equal(machine_id, f->header->machine_id))
295                         return -EHOSTDOWN;
296
297                 state = f->header->state;
298
299                 if (state == STATE_ONLINE) {
300                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
301                         return -EBUSY;
302                 } else if (state == STATE_ARCHIVED)
303                         return -ESHUTDOWN;
304                 else if (state != STATE_OFFLINE) {
305                         log_debug("Journal file %s has unknown state %u.", f->path, state);
306                         return -EBUSY;
307                 }
308         }
309
310         f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
311
312         f->seal = JOURNAL_HEADER_SEALED(f->header);
313
314         return 0;
315 }
316
317 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
318         uint64_t old_size, new_size;
319         int r;
320
321         assert(f);
322
323         /* We assume that this file is not sparse, and we know that
324          * for sure, since we always call posix_fallocate()
325          * ourselves */
326
327         old_size =
328                 le64toh(f->header->header_size) +
329                 le64toh(f->header->arena_size);
330
331         new_size = PAGE_ALIGN(offset + size);
332         if (new_size < le64toh(f->header->header_size))
333                 new_size = le64toh(f->header->header_size);
334
335         if (new_size <= old_size)
336                 return 0;
337
338         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
339                 return -E2BIG;
340
341         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
342                 struct statvfs svfs;
343
344                 if (fstatvfs(f->fd, &svfs) >= 0) {
345                         uint64_t available;
346
347                         available = svfs.f_bfree * svfs.f_bsize;
348
349                         if (available >= f->metrics.keep_free)
350                                 available -= f->metrics.keep_free;
351                         else
352                                 available = 0;
353
354                         if (new_size - old_size > available)
355                                 return -E2BIG;
356                 }
357         }
358
359         /* Increase by larger blocks at once */
360         new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
361         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
362                 new_size = f->metrics.max_size;
363
364         /* Note that the glibc fallocate() fallback is very
365            inefficient, hence we try to minimize the allocation area
366            as we can. */
367         r = posix_fallocate(f->fd, old_size, new_size - old_size);
368         if (r != 0)
369                 return -r;
370
371         if (fstat(f->fd, &f->last_stat) < 0)
372                 return -errno;
373
374         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
375
376         return 0;
377 }
378
379 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
380         assert(f);
381         assert(ret);
382
383         if (size <= 0)
384                 return -EINVAL;
385
386         /* Avoid SIGBUS on invalid accesses */
387         if (offset + size > (uint64_t) f->last_stat.st_size) {
388                 /* Hmm, out of range? Let's refresh the fstat() data
389                  * first, before we trust that check. */
390
391                 if (fstat(f->fd, &f->last_stat) < 0 ||
392                     offset + size > (uint64_t) f->last_stat.st_size)
393                         return -EADDRNOTAVAIL;
394         }
395
396         return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
397 }
398
399 static uint64_t minimum_header_size(Object *o) {
400
401         static const uint64_t table[] = {
402                 [OBJECT_DATA] = sizeof(DataObject),
403                 [OBJECT_FIELD] = sizeof(FieldObject),
404                 [OBJECT_ENTRY] = sizeof(EntryObject),
405                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
406                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
407                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
408                 [OBJECT_TAG] = sizeof(TagObject),
409         };
410
411         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
412                 return sizeof(ObjectHeader);
413
414         return table[o->object.type];
415 }
416
417 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
418         int r;
419         void *t;
420         Object *o;
421         uint64_t s;
422
423         assert(f);
424         assert(ret);
425
426         /* Objects may only be located at multiple of 64 bit */
427         if (!VALID64(offset))
428                 return -EFAULT;
429
430
431         r = journal_file_move_to(f, type_to_context(type), false, offset, sizeof(ObjectHeader), &t);
432         if (r < 0)
433                 return r;
434
435         o = (Object*) t;
436         s = le64toh(o->object.size);
437
438         if (s < sizeof(ObjectHeader))
439                 return -EBADMSG;
440
441         if (o->object.type <= OBJECT_UNUSED)
442                 return -EBADMSG;
443
444         if (s < minimum_header_size(o))
445                 return -EBADMSG;
446
447         if (type > 0 && o->object.type != type)
448                 return -EBADMSG;
449
450         if (s > sizeof(ObjectHeader)) {
451                 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
452                 if (r < 0)
453                         return r;
454
455                 o = (Object*) t;
456         }
457
458         *ret = o;
459         return 0;
460 }
461
462 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
463         uint64_t r;
464
465         assert(f);
466
467         r = le64toh(f->header->tail_entry_seqnum) + 1;
468
469         if (seqnum) {
470                 /* If an external seqnum counter was passed, we update
471                  * both the local and the external one, and set it to
472                  * the maximum of both */
473
474                 if (*seqnum + 1 > r)
475                         r = *seqnum + 1;
476
477                 *seqnum = r;
478         }
479
480         f->header->tail_entry_seqnum = htole64(r);
481
482         if (f->header->head_entry_seqnum == 0)
483                 f->header->head_entry_seqnum = htole64(r);
484
485         return r;
486 }
487
488 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
489         int r;
490         uint64_t p;
491         Object *tail, *o;
492         void *t;
493
494         assert(f);
495         assert(type > 0 && type < _OBJECT_TYPE_MAX);
496         assert(size >= sizeof(ObjectHeader));
497         assert(offset);
498         assert(ret);
499
500         r = journal_file_set_online(f);
501         if (r < 0)
502                 return r;
503
504         p = le64toh(f->header->tail_object_offset);
505         if (p == 0)
506                 p = le64toh(f->header->header_size);
507         else {
508                 r = journal_file_move_to_object(f, -1, p, &tail);
509                 if (r < 0)
510                         return r;
511
512                 p += ALIGN64(le64toh(tail->object.size));
513         }
514
515         r = journal_file_allocate(f, p, size);
516         if (r < 0)
517                 return r;
518
519         r = journal_file_move_to(f, type, false, p, size, &t);
520         if (r < 0)
521                 return r;
522
523         o = (Object*) t;
524
525         zero(o->object);
526         o->object.type = type;
527         o->object.size = htole64(size);
528
529         f->header->tail_object_offset = htole64(p);
530         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
531
532         *ret = o;
533         *offset = p;
534
535         return 0;
536 }
537
538 static int journal_file_setup_data_hash_table(JournalFile *f) {
539         uint64_t s, p;
540         Object *o;
541         int r;
542
543         assert(f);
544
545         /* We estimate that we need 1 hash table entry per 768 of
546            journal file and we want to make sure we never get beyond
547            75% fill level. Calculate the hash table size for the
548            maximum file size based on these metrics. */
549
550         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
551         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
552                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
553
554         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
555
556         r = journal_file_append_object(f,
557                                        OBJECT_DATA_HASH_TABLE,
558                                        offsetof(Object, hash_table.items) + s,
559                                        &o, &p);
560         if (r < 0)
561                 return r;
562
563         memzero(o->hash_table.items, s);
564
565         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
566         f->header->data_hash_table_size = htole64(s);
567
568         return 0;
569 }
570
571 static int journal_file_setup_field_hash_table(JournalFile *f) {
572         uint64_t s, p;
573         Object *o;
574         int r;
575
576         assert(f);
577
578         /* We use a fixed size hash table for the fields as this
579          * number should grow very slowly only */
580
581         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
582         r = journal_file_append_object(f,
583                                        OBJECT_FIELD_HASH_TABLE,
584                                        offsetof(Object, hash_table.items) + s,
585                                        &o, &p);
586         if (r < 0)
587                 return r;
588
589         memzero(o->hash_table.items, s);
590
591         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
592         f->header->field_hash_table_size = htole64(s);
593
594         return 0;
595 }
596
597 static int journal_file_map_data_hash_table(JournalFile *f) {
598         uint64_t s, p;
599         void *t;
600         int r;
601
602         assert(f);
603
604         p = le64toh(f->header->data_hash_table_offset);
605         s = le64toh(f->header->data_hash_table_size);
606
607         r = journal_file_move_to(f,
608                                  OBJECT_DATA_HASH_TABLE,
609                                  true,
610                                  p, s,
611                                  &t);
612         if (r < 0)
613                 return r;
614
615         f->data_hash_table = t;
616         return 0;
617 }
618
619 static int journal_file_map_field_hash_table(JournalFile *f) {
620         uint64_t s, p;
621         void *t;
622         int r;
623
624         assert(f);
625
626         p = le64toh(f->header->field_hash_table_offset);
627         s = le64toh(f->header->field_hash_table_size);
628
629         r = journal_file_move_to(f,
630                                  OBJECT_FIELD_HASH_TABLE,
631                                  true,
632                                  p, s,
633                                  &t);
634         if (r < 0)
635                 return r;
636
637         f->field_hash_table = t;
638         return 0;
639 }
640
641 static int journal_file_link_field(
642                 JournalFile *f,
643                 Object *o,
644                 uint64_t offset,
645                 uint64_t hash) {
646
647         uint64_t p, h;
648         int r;
649
650         assert(f);
651         assert(o);
652         assert(offset > 0);
653
654         if (o->object.type != OBJECT_FIELD)
655                 return -EINVAL;
656
657         /* This might alter the window we are looking at */
658
659         o->field.next_hash_offset = o->field.head_data_offset = 0;
660
661         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
662         p = le64toh(f->field_hash_table[h].tail_hash_offset);
663         if (p == 0)
664                 f->field_hash_table[h].head_hash_offset = htole64(offset);
665         else {
666                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
667                 if (r < 0)
668                         return r;
669
670                 o->field.next_hash_offset = htole64(offset);
671         }
672
673         f->field_hash_table[h].tail_hash_offset = htole64(offset);
674
675         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
676                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
677
678         return 0;
679 }
680
681 static int journal_file_link_data(
682                 JournalFile *f,
683                 Object *o,
684                 uint64_t offset,
685                 uint64_t hash) {
686
687         uint64_t p, h;
688         int r;
689
690         assert(f);
691         assert(o);
692         assert(offset > 0);
693
694         if (o->object.type != OBJECT_DATA)
695                 return -EINVAL;
696
697         /* This might alter the window we are looking at */
698
699         o->data.next_hash_offset = o->data.next_field_offset = 0;
700         o->data.entry_offset = o->data.entry_array_offset = 0;
701         o->data.n_entries = 0;
702
703         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
704         p = le64toh(f->data_hash_table[h].tail_hash_offset);
705         if (p == 0)
706                 /* Only entry in the hash table is easy */
707                 f->data_hash_table[h].head_hash_offset = htole64(offset);
708         else {
709                 /* Move back to the previous data object, to patch in
710                  * pointer */
711
712                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
713                 if (r < 0)
714                         return r;
715
716                 o->data.next_hash_offset = htole64(offset);
717         }
718
719         f->data_hash_table[h].tail_hash_offset = htole64(offset);
720
721         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
722                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
723
724         return 0;
725 }
726
727 int journal_file_find_field_object_with_hash(
728                 JournalFile *f,
729                 const void *field, uint64_t size, uint64_t hash,
730                 Object **ret, uint64_t *offset) {
731
732         uint64_t p, osize, h;
733         int r;
734
735         assert(f);
736         assert(field && size > 0);
737
738         osize = offsetof(Object, field.payload) + size;
739
740         if (f->header->field_hash_table_size == 0)
741                 return -EBADMSG;
742
743         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
744         p = le64toh(f->field_hash_table[h].head_hash_offset);
745
746         while (p > 0) {
747                 Object *o;
748
749                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
750                 if (r < 0)
751                         return r;
752
753                 if (le64toh(o->field.hash) == hash &&
754                     le64toh(o->object.size) == osize &&
755                     memcmp(o->field.payload, field, size) == 0) {
756
757                         if (ret)
758                                 *ret = o;
759                         if (offset)
760                                 *offset = p;
761
762                         return 1;
763                 }
764
765                 p = le64toh(o->field.next_hash_offset);
766         }
767
768         return 0;
769 }
770
771 int journal_file_find_field_object(
772                 JournalFile *f,
773                 const void *field, uint64_t size,
774                 Object **ret, uint64_t *offset) {
775
776         uint64_t hash;
777
778         assert(f);
779         assert(field && size > 0);
780
781         hash = hash64(field, size);
782
783         return journal_file_find_field_object_with_hash(f,
784                                                         field, size, hash,
785                                                         ret, offset);
786 }
787
788 int journal_file_find_data_object_with_hash(
789                 JournalFile *f,
790                 const void *data, uint64_t size, uint64_t hash,
791                 Object **ret, uint64_t *offset) {
792
793         uint64_t p, osize, h;
794         int r;
795
796         assert(f);
797         assert(data || size == 0);
798
799         osize = offsetof(Object, data.payload) + size;
800
801         if (f->header->data_hash_table_size == 0)
802                 return -EBADMSG;
803
804         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
805         p = le64toh(f->data_hash_table[h].head_hash_offset);
806
807         while (p > 0) {
808                 Object *o;
809
810                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
811                 if (r < 0)
812                         return r;
813
814                 if (le64toh(o->data.hash) != hash)
815                         goto next;
816
817                 if (o->object.flags & OBJECT_COMPRESSED) {
818 #ifdef HAVE_XZ
819                         uint64_t l, rsize;
820
821                         l = le64toh(o->object.size);
822                         if (l <= offsetof(Object, data.payload))
823                                 return -EBADMSG;
824
825                         l -= offsetof(Object, data.payload);
826
827                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0))
828                                 return -EBADMSG;
829
830                         if (rsize == size &&
831                             memcmp(f->compress_buffer, data, size) == 0) {
832
833                                 if (ret)
834                                         *ret = o;
835
836                                 if (offset)
837                                         *offset = p;
838
839                                 return 1;
840                         }
841 #else
842                         return -EPROTONOSUPPORT;
843 #endif
844
845                 } else if (le64toh(o->object.size) == osize &&
846                            memcmp(o->data.payload, data, size) == 0) {
847
848                         if (ret)
849                                 *ret = o;
850
851                         if (offset)
852                                 *offset = p;
853
854                         return 1;
855                 }
856
857         next:
858                 p = le64toh(o->data.next_hash_offset);
859         }
860
861         return 0;
862 }
863
864 int journal_file_find_data_object(
865                 JournalFile *f,
866                 const void *data, uint64_t size,
867                 Object **ret, uint64_t *offset) {
868
869         uint64_t hash;
870
871         assert(f);
872         assert(data || size == 0);
873
874         hash = hash64(data, size);
875
876         return journal_file_find_data_object_with_hash(f,
877                                                        data, size, hash,
878                                                        ret, offset);
879 }
880
881 static int journal_file_append_field(
882                 JournalFile *f,
883                 const void *field, uint64_t size,
884                 Object **ret, uint64_t *offset) {
885
886         uint64_t hash, p;
887         uint64_t osize;
888         Object *o;
889         int r;
890
891         assert(f);
892         assert(field && size > 0);
893
894         hash = hash64(field, size);
895
896         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
897         if (r < 0)
898                 return r;
899         else if (r > 0) {
900
901                 if (ret)
902                         *ret = o;
903
904                 if (offset)
905                         *offset = p;
906
907                 return 0;
908         }
909
910         osize = offsetof(Object, field.payload) + size;
911         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
912         if (r < 0)
913                 return r;
914
915         o->field.hash = htole64(hash);
916         memcpy(o->field.payload, field, size);
917
918         r = journal_file_link_field(f, o, p, hash);
919         if (r < 0)
920                 return r;
921
922         /* The linking might have altered the window, so let's
923          * refresh our pointer */
924         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
925         if (r < 0)
926                 return r;
927
928 #ifdef HAVE_GCRYPT
929         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
930         if (r < 0)
931                 return r;
932 #endif
933
934         if (ret)
935                 *ret = o;
936
937         if (offset)
938                 *offset = p;
939
940         return 0;
941 }
942
943 static int journal_file_append_data(
944                 JournalFile *f,
945                 const void *data, uint64_t size,
946                 Object **ret, uint64_t *offset) {
947
948         uint64_t hash, p;
949         uint64_t osize;
950         Object *o;
951         int r;
952         bool compressed = false;
953         const void *eq;
954
955         assert(f);
956         assert(data || size == 0);
957
958         hash = hash64(data, size);
959
960         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
961         if (r < 0)
962                 return r;
963         else if (r > 0) {
964
965                 if (ret)
966                         *ret = o;
967
968                 if (offset)
969                         *offset = p;
970
971                 return 0;
972         }
973
974         osize = offsetof(Object, data.payload) + size;
975         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
976         if (r < 0)
977                 return r;
978
979         o->data.hash = htole64(hash);
980
981 #ifdef HAVE_XZ
982         if (f->compress &&
983             size >= COMPRESSION_SIZE_THRESHOLD) {
984                 uint64_t rsize;
985
986                 compressed = compress_blob(data, size, o->data.payload, &rsize);
987
988                 if (compressed) {
989                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
990                         o->object.flags |= OBJECT_COMPRESSED;
991
992                         log_debug("Compressed data object %"PRIu64" -> %"PRIu64, size, rsize);
993                 }
994         }
995 #endif
996
997         if (!compressed && size > 0)
998                 memcpy(o->data.payload, data, size);
999
1000         r = journal_file_link_data(f, o, p, hash);
1001         if (r < 0)
1002                 return r;
1003
1004         /* The linking might have altered the window, so let's
1005          * refresh our pointer */
1006         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1007         if (r < 0)
1008                 return r;
1009
1010         if (!data)
1011                 eq = NULL;
1012         else
1013                 eq = memchr(data, '=', size);
1014         if (eq && eq > data) {
1015                 Object *fo = NULL;
1016                 uint64_t fp;
1017
1018                 /* Create field object ... */
1019                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1020                 if (r < 0)
1021                         return r;
1022
1023                 /* ... and link it in. */
1024                 o->data.next_field_offset = fo->field.head_data_offset;
1025                 fo->field.head_data_offset = le64toh(p);
1026         }
1027
1028 #ifdef HAVE_GCRYPT
1029         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1030         if (r < 0)
1031                 return r;
1032 #endif
1033
1034         if (ret)
1035                 *ret = o;
1036
1037         if (offset)
1038                 *offset = p;
1039
1040         return 0;
1041 }
1042
1043 uint64_t journal_file_entry_n_items(Object *o) {
1044         assert(o);
1045
1046         if (o->object.type != OBJECT_ENTRY)
1047                 return 0;
1048
1049         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1050 }
1051
1052 uint64_t journal_file_entry_array_n_items(Object *o) {
1053         assert(o);
1054
1055         if (o->object.type != OBJECT_ENTRY_ARRAY)
1056                 return 0;
1057
1058         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1059 }
1060
1061 uint64_t journal_file_hash_table_n_items(Object *o) {
1062         assert(o);
1063
1064         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1065             o->object.type != OBJECT_FIELD_HASH_TABLE)
1066                 return 0;
1067
1068         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1069 }
1070
1071 static int link_entry_into_array(JournalFile *f,
1072                                  le64_t *first,
1073                                  le64_t *idx,
1074                                  uint64_t p) {
1075         int r;
1076         uint64_t n = 0, ap = 0, q, i, a, hidx;
1077         Object *o;
1078
1079         assert(f);
1080         assert(first);
1081         assert(idx);
1082         assert(p > 0);
1083
1084         a = le64toh(*first);
1085         i = hidx = le64toh(*idx);
1086         while (a > 0) {
1087
1088                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1089                 if (r < 0)
1090                         return r;
1091
1092                 n = journal_file_entry_array_n_items(o);
1093                 if (i < n) {
1094                         o->entry_array.items[i] = htole64(p);
1095                         *idx = htole64(hidx + 1);
1096                         return 0;
1097                 }
1098
1099                 i -= n;
1100                 ap = a;
1101                 a = le64toh(o->entry_array.next_entry_array_offset);
1102         }
1103
1104         if (hidx > n)
1105                 n = (hidx+1) * 2;
1106         else
1107                 n = n * 2;
1108
1109         if (n < 4)
1110                 n = 4;
1111
1112         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1113                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1114                                        &o, &q);
1115         if (r < 0)
1116                 return r;
1117
1118 #ifdef HAVE_GCRYPT
1119         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1120         if (r < 0)
1121                 return r;
1122 #endif
1123
1124         o->entry_array.items[i] = htole64(p);
1125
1126         if (ap == 0)
1127                 *first = htole64(q);
1128         else {
1129                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1130                 if (r < 0)
1131                         return r;
1132
1133                 o->entry_array.next_entry_array_offset = htole64(q);
1134         }
1135
1136         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1137                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1138
1139         *idx = htole64(hidx + 1);
1140
1141         return 0;
1142 }
1143
1144 static int link_entry_into_array_plus_one(JournalFile *f,
1145                                           le64_t *extra,
1146                                           le64_t *first,
1147                                           le64_t *idx,
1148                                           uint64_t p) {
1149
1150         int r;
1151
1152         assert(f);
1153         assert(extra);
1154         assert(first);
1155         assert(idx);
1156         assert(p > 0);
1157
1158         if (*idx == 0)
1159                 *extra = htole64(p);
1160         else {
1161                 le64_t i;
1162
1163                 i = htole64(le64toh(*idx) - 1);
1164                 r = link_entry_into_array(f, first, &i, p);
1165                 if (r < 0)
1166                         return r;
1167         }
1168
1169         *idx = htole64(le64toh(*idx) + 1);
1170         return 0;
1171 }
1172
1173 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1174         uint64_t p;
1175         int r;
1176         assert(f);
1177         assert(o);
1178         assert(offset > 0);
1179
1180         p = le64toh(o->entry.items[i].object_offset);
1181         if (p == 0)
1182                 return -EINVAL;
1183
1184         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1185         if (r < 0)
1186                 return r;
1187
1188         return link_entry_into_array_plus_one(f,
1189                                               &o->data.entry_offset,
1190                                               &o->data.entry_array_offset,
1191                                               &o->data.n_entries,
1192                                               offset);
1193 }
1194
1195 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1196         uint64_t n, i;
1197         int r;
1198
1199         assert(f);
1200         assert(o);
1201         assert(offset > 0);
1202
1203         if (o->object.type != OBJECT_ENTRY)
1204                 return -EINVAL;
1205
1206         __sync_synchronize();
1207
1208         /* Link up the entry itself */
1209         r = link_entry_into_array(f,
1210                                   &f->header->entry_array_offset,
1211                                   &f->header->n_entries,
1212                                   offset);
1213         if (r < 0)
1214                 return r;
1215
1216         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1217
1218         if (f->header->head_entry_realtime == 0)
1219                 f->header->head_entry_realtime = o->entry.realtime;
1220
1221         f->header->tail_entry_realtime = o->entry.realtime;
1222         f->header->tail_entry_monotonic = o->entry.monotonic;
1223
1224         f->tail_entry_monotonic_valid = true;
1225
1226         /* Link up the items */
1227         n = journal_file_entry_n_items(o);
1228         for (i = 0; i < n; i++) {
1229                 r = journal_file_link_entry_item(f, o, offset, i);
1230                 if (r < 0)
1231                         return r;
1232         }
1233
1234         return 0;
1235 }
1236
1237 static int journal_file_append_entry_internal(
1238                 JournalFile *f,
1239                 const dual_timestamp *ts,
1240                 uint64_t xor_hash,
1241                 const EntryItem items[], unsigned n_items,
1242                 uint64_t *seqnum,
1243                 Object **ret, uint64_t *offset) {
1244         uint64_t np;
1245         uint64_t osize;
1246         Object *o;
1247         int r;
1248
1249         assert(f);
1250         assert(items || n_items == 0);
1251         assert(ts);
1252
1253         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1254
1255         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1256         if (r < 0)
1257                 return r;
1258
1259         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1260         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1261         o->entry.realtime = htole64(ts->realtime);
1262         o->entry.monotonic = htole64(ts->monotonic);
1263         o->entry.xor_hash = htole64(xor_hash);
1264         o->entry.boot_id = f->header->boot_id;
1265
1266 #ifdef HAVE_GCRYPT
1267         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1268         if (r < 0)
1269                 return r;
1270 #endif
1271
1272         r = journal_file_link_entry(f, o, np);
1273         if (r < 0)
1274                 return r;
1275
1276         if (ret)
1277                 *ret = o;
1278
1279         if (offset)
1280                 *offset = np;
1281
1282         return 0;
1283 }
1284
1285 void journal_file_post_change(JournalFile *f) {
1286         assert(f);
1287
1288         /* inotify() does not receive IN_MODIFY events from file
1289          * accesses done via mmap(). After each access we hence
1290          * trigger IN_MODIFY by truncating the journal file to its
1291          * current size which triggers IN_MODIFY. */
1292
1293         __sync_synchronize();
1294
1295         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1296                 log_error("Failed to truncate file to its own size: %m");
1297 }
1298
1299 static int entry_item_cmp(const void *_a, const void *_b) {
1300         const EntryItem *a = _a, *b = _b;
1301
1302         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1303                 return -1;
1304         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1305                 return 1;
1306         return 0;
1307 }
1308
1309 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1310         unsigned i;
1311         EntryItem *items;
1312         int r;
1313         uint64_t xor_hash = 0;
1314         struct dual_timestamp _ts;
1315
1316         assert(f);
1317         assert(iovec || n_iovec == 0);
1318
1319         if (!ts) {
1320                 dual_timestamp_get(&_ts);
1321                 ts = &_ts;
1322         }
1323
1324         if (f->tail_entry_monotonic_valid &&
1325             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1326                 return -EINVAL;
1327
1328 #ifdef HAVE_GCRYPT
1329         r = journal_file_maybe_append_tag(f, ts->realtime);
1330         if (r < 0)
1331                 return r;
1332 #endif
1333
1334         /* alloca() can't take 0, hence let's allocate at least one */
1335         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1336
1337         for (i = 0; i < n_iovec; i++) {
1338                 uint64_t p;
1339                 Object *o;
1340
1341                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1342                 if (r < 0)
1343                         return r;
1344
1345                 xor_hash ^= le64toh(o->data.hash);
1346                 items[i].object_offset = htole64(p);
1347                 items[i].hash = o->data.hash;
1348         }
1349
1350         /* Order by the position on disk, in order to improve seek
1351          * times for rotating media. */
1352         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1353
1354         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1355
1356         journal_file_post_change(f);
1357
1358         return r;
1359 }
1360
1361 typedef struct ChainCacheItem {
1362         uint64_t first; /* the array at the beginning of the chain */
1363         uint64_t array; /* the cached array */
1364         uint64_t begin; /* the first item in the cached array */
1365         uint64_t total; /* the total number of items in all arrays before this one in the chain */
1366         uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
1367 } ChainCacheItem;
1368
1369 static void chain_cache_put(
1370                 Hashmap *h,
1371                 ChainCacheItem *ci,
1372                 uint64_t first,
1373                 uint64_t array,
1374                 uint64_t begin,
1375                 uint64_t total,
1376                 uint64_t last_index) {
1377
1378         if (!ci) {
1379                 /* If the chain item to cache for this chain is the
1380                  * first one it's not worth caching anything */
1381                 if (array == first)
1382                         return;
1383
1384                 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1385                         ci = hashmap_steal_first(h);
1386                 else {
1387                         ci = new(ChainCacheItem, 1);
1388                         if (!ci)
1389                                 return;
1390                 }
1391
1392                 ci->first = first;
1393
1394                 if (hashmap_put(h, &ci->first, ci) < 0) {
1395                         free(ci);
1396                         return;
1397                 }
1398         } else
1399                 assert(ci->first == first);
1400
1401         ci->array = array;
1402         ci->begin = begin;
1403         ci->total = total;
1404         ci->last_index = last_index;
1405 }
1406
1407 static int generic_array_get(
1408                 JournalFile *f,
1409                 uint64_t first,
1410                 uint64_t i,
1411                 Object **ret, uint64_t *offset) {
1412
1413         Object *o;
1414         uint64_t p = 0, a, t = 0;
1415         int r;
1416         ChainCacheItem *ci;
1417
1418         assert(f);
1419
1420         a = first;
1421
1422         /* Try the chain cache first */
1423         ci = hashmap_get(f->chain_cache, &first);
1424         if (ci && i > ci->total) {
1425                 a = ci->array;
1426                 i -= ci->total;
1427                 t = ci->total;
1428         }
1429
1430         while (a > 0) {
1431                 uint64_t k;
1432
1433                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1434                 if (r < 0)
1435                         return r;
1436
1437                 k = journal_file_entry_array_n_items(o);
1438                 if (i < k) {
1439                         p = le64toh(o->entry_array.items[i]);
1440                         goto found;
1441                 }
1442
1443                 i -= k;
1444                 t += k;
1445                 a = le64toh(o->entry_array.next_entry_array_offset);
1446         }
1447
1448         return 0;
1449
1450 found:
1451         /* Let's cache this item for the next invocation */
1452         chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1453
1454         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1455         if (r < 0)
1456                 return r;
1457
1458         if (ret)
1459                 *ret = o;
1460
1461         if (offset)
1462                 *offset = p;
1463
1464         return 1;
1465 }
1466
1467 static int generic_array_get_plus_one(
1468                 JournalFile *f,
1469                 uint64_t extra,
1470                 uint64_t first,
1471                 uint64_t i,
1472                 Object **ret, uint64_t *offset) {
1473
1474         Object *o;
1475
1476         assert(f);
1477
1478         if (i == 0) {
1479                 int r;
1480
1481                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1482                 if (r < 0)
1483                         return r;
1484
1485                 if (ret)
1486                         *ret = o;
1487
1488                 if (offset)
1489                         *offset = extra;
1490
1491                 return 1;
1492         }
1493
1494         return generic_array_get(f, first, i-1, ret, offset);
1495 }
1496
1497 enum {
1498         TEST_FOUND,
1499         TEST_LEFT,
1500         TEST_RIGHT
1501 };
1502
1503 static int generic_array_bisect(
1504                 JournalFile *f,
1505                 uint64_t first,
1506                 uint64_t n,
1507                 uint64_t needle,
1508                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1509                 direction_t direction,
1510                 Object **ret,
1511                 uint64_t *offset,
1512                 uint64_t *idx) {
1513
1514         uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1515         bool subtract_one = false;
1516         Object *o, *array = NULL;
1517         int r;
1518         ChainCacheItem *ci;
1519
1520         assert(f);
1521         assert(test_object);
1522
1523         /* Start with the first array in the chain */
1524         a = first;
1525
1526         ci = hashmap_get(f->chain_cache, &first);
1527         if (ci && n > ci->total) {
1528                 /* Ah, we have iterated this bisection array chain
1529                  * previously! Let's see if we can skip ahead in the
1530                  * chain, as far as the last time. But we can't jump
1531                  * backwards in the chain, so let's check that
1532                  * first. */
1533
1534                 r = test_object(f, ci->begin, needle);
1535                 if (r < 0)
1536                         return r;
1537
1538                 if (r == TEST_LEFT) {
1539                         /* OK, what we are looking for is right of the
1540                          * begin of this EntryArray, so let's jump
1541                          * straight to previously cached array in the
1542                          * chain */
1543
1544                         a = ci->array;
1545                         n -= ci->total;
1546                         t = ci->total;
1547                         last_index = ci->last_index;
1548                 }
1549         }
1550
1551         while (a > 0) {
1552                 uint64_t left, right, k, lp;
1553
1554                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1555                 if (r < 0)
1556                         return r;
1557
1558                 k = journal_file_entry_array_n_items(array);
1559                 right = MIN(k, n);
1560                 if (right <= 0)
1561                         return 0;
1562
1563                 i = right - 1;
1564                 lp = p = le64toh(array->entry_array.items[i]);
1565                 if (p <= 0)
1566                         return -EBADMSG;
1567
1568                 r = test_object(f, p, needle);
1569                 if (r < 0)
1570                         return r;
1571
1572                 if (r == TEST_FOUND)
1573                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1574
1575                 if (r == TEST_RIGHT) {
1576                         left = 0;
1577                         right -= 1;
1578
1579                         if (last_index != (uint64_t) -1) {
1580                                 assert(last_index <= right);
1581
1582                                 /* If we cached the last index we
1583                                  * looked at, let's try to not to jump
1584                                  * too wildly around and see if we can
1585                                  * limit the range to look at early to
1586                                  * the immediate neighbors of the last
1587                                  * index we looked at. */
1588
1589                                 if (last_index > 0) {
1590                                         uint64_t x = last_index - 1;
1591
1592                                         p = le64toh(array->entry_array.items[x]);
1593                                         if (p <= 0)
1594                                                 return -EBADMSG;
1595
1596                                         r = test_object(f, p, needle);
1597                                         if (r < 0)
1598                                                 return r;
1599
1600                                         if (r == TEST_FOUND)
1601                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1602
1603                                         if (r == TEST_RIGHT)
1604                                                 right = x;
1605                                         else
1606                                                 left = x + 1;
1607                                 }
1608
1609                                 if (last_index < right) {
1610                                         uint64_t y = last_index + 1;
1611
1612                                         p = le64toh(array->entry_array.items[y]);
1613                                         if (p <= 0)
1614                                                 return -EBADMSG;
1615
1616                                         r = test_object(f, p, needle);
1617                                         if (r < 0)
1618                                                 return r;
1619
1620                                         if (r == TEST_FOUND)
1621                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1622
1623                                         if (r == TEST_RIGHT)
1624                                                 right = y;
1625                                         else
1626                                                 left = y + 1;
1627                                 }
1628                         }
1629
1630                         for (;;) {
1631                                 if (left == right) {
1632                                         if (direction == DIRECTION_UP)
1633                                                 subtract_one = true;
1634
1635                                         i = left;
1636                                         goto found;
1637                                 }
1638
1639                                 assert(left < right);
1640                                 i = (left + right) / 2;
1641
1642                                 p = le64toh(array->entry_array.items[i]);
1643                                 if (p <= 0)
1644                                         return -EBADMSG;
1645
1646                                 r = test_object(f, p, needle);
1647                                 if (r < 0)
1648                                         return r;
1649
1650                                 if (r == TEST_FOUND)
1651                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1652
1653                                 if (r == TEST_RIGHT)
1654                                         right = i;
1655                                 else
1656                                         left = i + 1;
1657                         }
1658                 }
1659
1660                 if (k > n) {
1661                         if (direction == DIRECTION_UP) {
1662                                 i = n;
1663                                 subtract_one = true;
1664                                 goto found;
1665                         }
1666
1667                         return 0;
1668                 }
1669
1670                 last_p = lp;
1671
1672                 n -= k;
1673                 t += k;
1674                 last_index = (uint64_t) -1;
1675                 a = le64toh(array->entry_array.next_entry_array_offset);
1676         }
1677
1678         return 0;
1679
1680 found:
1681         if (subtract_one && t == 0 && i == 0)
1682                 return 0;
1683
1684         /* Let's cache this item for the next invocation */
1685         chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1686
1687         if (subtract_one && i == 0)
1688                 p = last_p;
1689         else if (subtract_one)
1690                 p = le64toh(array->entry_array.items[i-1]);
1691         else
1692                 p = le64toh(array->entry_array.items[i]);
1693
1694         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1695         if (r < 0)
1696                 return r;
1697
1698         if (ret)
1699                 *ret = o;
1700
1701         if (offset)
1702                 *offset = p;
1703
1704         if (idx)
1705                 *idx = t + i + (subtract_one ? -1 : 0);
1706
1707         return 1;
1708 }
1709
1710
1711 static int generic_array_bisect_plus_one(
1712                 JournalFile *f,
1713                 uint64_t extra,
1714                 uint64_t first,
1715                 uint64_t n,
1716                 uint64_t needle,
1717                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1718                 direction_t direction,
1719                 Object **ret,
1720                 uint64_t *offset,
1721                 uint64_t *idx) {
1722
1723         int r;
1724         bool step_back = false;
1725         Object *o;
1726
1727         assert(f);
1728         assert(test_object);
1729
1730         if (n <= 0)
1731                 return 0;
1732
1733         /* This bisects the array in object 'first', but first checks
1734          * an extra  */
1735         r = test_object(f, extra, needle);
1736         if (r < 0)
1737                 return r;
1738
1739         if (r == TEST_FOUND)
1740                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1741
1742         /* if we are looking with DIRECTION_UP then we need to first
1743            see if in the actual array there is a matching entry, and
1744            return the last one of that. But if there isn't any we need
1745            to return this one. Hence remember this, and return it
1746            below. */
1747         if (r == TEST_LEFT)
1748                 step_back = direction == DIRECTION_UP;
1749
1750         if (r == TEST_RIGHT) {
1751                 if (direction == DIRECTION_DOWN)
1752                         goto found;
1753                 else
1754                         return 0;
1755         }
1756
1757         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1758
1759         if (r == 0 && step_back)
1760                 goto found;
1761
1762         if (r > 0 && idx)
1763                 (*idx) ++;
1764
1765         return r;
1766
1767 found:
1768         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1769         if (r < 0)
1770                 return r;
1771
1772         if (ret)
1773                 *ret = o;
1774
1775         if (offset)
1776                 *offset = extra;
1777
1778         if (idx)
1779                 *idx = 0;
1780
1781         return 1;
1782 }
1783
1784 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1785         assert(f);
1786         assert(p > 0);
1787
1788         if (p == needle)
1789                 return TEST_FOUND;
1790         else if (p < needle)
1791                 return TEST_LEFT;
1792         else
1793                 return TEST_RIGHT;
1794 }
1795
1796 int journal_file_move_to_entry_by_offset(
1797                 JournalFile *f,
1798                 uint64_t p,
1799                 direction_t direction,
1800                 Object **ret,
1801                 uint64_t *offset) {
1802
1803         return generic_array_bisect(f,
1804                                     le64toh(f->header->entry_array_offset),
1805                                     le64toh(f->header->n_entries),
1806                                     p,
1807                                     test_object_offset,
1808                                     direction,
1809                                     ret, offset, NULL);
1810 }
1811
1812
1813 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1814         Object *o;
1815         int r;
1816
1817         assert(f);
1818         assert(p > 0);
1819
1820         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1821         if (r < 0)
1822                 return r;
1823
1824         if (le64toh(o->entry.seqnum) == needle)
1825                 return TEST_FOUND;
1826         else if (le64toh(o->entry.seqnum) < needle)
1827                 return TEST_LEFT;
1828         else
1829                 return TEST_RIGHT;
1830 }
1831
1832 int journal_file_move_to_entry_by_seqnum(
1833                 JournalFile *f,
1834                 uint64_t seqnum,
1835                 direction_t direction,
1836                 Object **ret,
1837                 uint64_t *offset) {
1838
1839         return generic_array_bisect(f,
1840                                     le64toh(f->header->entry_array_offset),
1841                                     le64toh(f->header->n_entries),
1842                                     seqnum,
1843                                     test_object_seqnum,
1844                                     direction,
1845                                     ret, offset, NULL);
1846 }
1847
1848 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1849         Object *o;
1850         int r;
1851
1852         assert(f);
1853         assert(p > 0);
1854
1855         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1856         if (r < 0)
1857                 return r;
1858
1859         if (le64toh(o->entry.realtime) == needle)
1860                 return TEST_FOUND;
1861         else if (le64toh(o->entry.realtime) < needle)
1862                 return TEST_LEFT;
1863         else
1864                 return TEST_RIGHT;
1865 }
1866
1867 int journal_file_move_to_entry_by_realtime(
1868                 JournalFile *f,
1869                 uint64_t realtime,
1870                 direction_t direction,
1871                 Object **ret,
1872                 uint64_t *offset) {
1873
1874         return generic_array_bisect(f,
1875                                     le64toh(f->header->entry_array_offset),
1876                                     le64toh(f->header->n_entries),
1877                                     realtime,
1878                                     test_object_realtime,
1879                                     direction,
1880                                     ret, offset, NULL);
1881 }
1882
1883 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1884         Object *o;
1885         int r;
1886
1887         assert(f);
1888         assert(p > 0);
1889
1890         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1891         if (r < 0)
1892                 return r;
1893
1894         if (le64toh(o->entry.monotonic) == needle)
1895                 return TEST_FOUND;
1896         else if (le64toh(o->entry.monotonic) < needle)
1897                 return TEST_LEFT;
1898         else
1899                 return TEST_RIGHT;
1900 }
1901
1902 static inline int find_data_object_by_boot_id(
1903                 JournalFile *f,
1904                 sd_id128_t boot_id,
1905                 Object **o,
1906                 uint64_t *b) {
1907         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1908
1909         sd_id128_to_string(boot_id, t + 9);
1910         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1911 }
1912
1913 int journal_file_move_to_entry_by_monotonic(
1914                 JournalFile *f,
1915                 sd_id128_t boot_id,
1916                 uint64_t monotonic,
1917                 direction_t direction,
1918                 Object **ret,
1919                 uint64_t *offset) {
1920
1921         Object *o;
1922         int r;
1923
1924         assert(f);
1925
1926         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1927         if (r < 0)
1928                 return r;
1929         if (r == 0)
1930                 return -ENOENT;
1931
1932         return generic_array_bisect_plus_one(f,
1933                                              le64toh(o->data.entry_offset),
1934                                              le64toh(o->data.entry_array_offset),
1935                                              le64toh(o->data.n_entries),
1936                                              monotonic,
1937                                              test_object_monotonic,
1938                                              direction,
1939                                              ret, offset, NULL);
1940 }
1941
1942 int journal_file_next_entry(
1943                 JournalFile *f,
1944                 Object *o, uint64_t p,
1945                 direction_t direction,
1946                 Object **ret, uint64_t *offset) {
1947
1948         uint64_t i, n, ofs;
1949         int r;
1950
1951         assert(f);
1952         assert(p > 0 || !o);
1953
1954         n = le64toh(f->header->n_entries);
1955         if (n <= 0)
1956                 return 0;
1957
1958         if (!o)
1959                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1960         else {
1961                 if (o->object.type != OBJECT_ENTRY)
1962                         return -EINVAL;
1963
1964                 r = generic_array_bisect(f,
1965                                          le64toh(f->header->entry_array_offset),
1966                                          le64toh(f->header->n_entries),
1967                                          p,
1968                                          test_object_offset,
1969                                          DIRECTION_DOWN,
1970                                          NULL, NULL,
1971                                          &i);
1972                 if (r <= 0)
1973                         return r;
1974
1975                 if (direction == DIRECTION_DOWN) {
1976                         if (i >= n - 1)
1977                                 return 0;
1978
1979                         i++;
1980                 } else {
1981                         if (i <= 0)
1982                                 return 0;
1983
1984                         i--;
1985                 }
1986         }
1987
1988         /* And jump to it */
1989         r = generic_array_get(f,
1990                               le64toh(f->header->entry_array_offset),
1991                               i,
1992                               ret, &ofs);
1993         if (r <= 0)
1994                 return r;
1995
1996         if (p > 0 &&
1997             (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
1998                 log_debug("%s: entry array corrupted at entry %"PRIu64,
1999                           f->path, i);
2000                 return -EBADMSG;
2001         }
2002
2003         if (offset)
2004                 *offset = ofs;
2005
2006         return 1;
2007 }
2008
2009 int journal_file_skip_entry(
2010                 JournalFile *f,
2011                 Object *o, uint64_t p,
2012                 int64_t skip,
2013                 Object **ret, uint64_t *offset) {
2014
2015         uint64_t i, n;
2016         int r;
2017
2018         assert(f);
2019         assert(o);
2020         assert(p > 0);
2021
2022         if (o->object.type != OBJECT_ENTRY)
2023                 return -EINVAL;
2024
2025         r = generic_array_bisect(f,
2026                                  le64toh(f->header->entry_array_offset),
2027                                  le64toh(f->header->n_entries),
2028                                  p,
2029                                  test_object_offset,
2030                                  DIRECTION_DOWN,
2031                                  NULL, NULL,
2032                                  &i);
2033         if (r <= 0)
2034                 return r;
2035
2036         /* Calculate new index */
2037         if (skip < 0) {
2038                 if ((uint64_t) -skip >= i)
2039                         i = 0;
2040                 else
2041                         i = i - (uint64_t) -skip;
2042         } else
2043                 i  += (uint64_t) skip;
2044
2045         n = le64toh(f->header->n_entries);
2046         if (n <= 0)
2047                 return -EBADMSG;
2048
2049         if (i >= n)
2050                 i = n-1;
2051
2052         return generic_array_get(f,
2053                                  le64toh(f->header->entry_array_offset),
2054                                  i,
2055                                  ret, offset);
2056 }
2057
2058 int journal_file_next_entry_for_data(
2059                 JournalFile *f,
2060                 Object *o, uint64_t p,
2061                 uint64_t data_offset,
2062                 direction_t direction,
2063                 Object **ret, uint64_t *offset) {
2064
2065         uint64_t n, i;
2066         int r;
2067         Object *d;
2068
2069         assert(f);
2070         assert(p > 0 || !o);
2071
2072         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2073         if (r < 0)
2074                 return r;
2075
2076         n = le64toh(d->data.n_entries);
2077         if (n <= 0)
2078                 return n;
2079
2080         if (!o)
2081                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2082         else {
2083                 if (o->object.type != OBJECT_ENTRY)
2084                         return -EINVAL;
2085
2086                 r = generic_array_bisect_plus_one(f,
2087                                                   le64toh(d->data.entry_offset),
2088                                                   le64toh(d->data.entry_array_offset),
2089                                                   le64toh(d->data.n_entries),
2090                                                   p,
2091                                                   test_object_offset,
2092                                                   DIRECTION_DOWN,
2093                                                   NULL, NULL,
2094                                                   &i);
2095
2096                 if (r <= 0)
2097                         return r;
2098
2099                 if (direction == DIRECTION_DOWN) {
2100                         if (i >= n - 1)
2101                                 return 0;
2102
2103                         i++;
2104                 } else {
2105                         if (i <= 0)
2106                                 return 0;
2107
2108                         i--;
2109                 }
2110
2111         }
2112
2113         return generic_array_get_plus_one(f,
2114                                           le64toh(d->data.entry_offset),
2115                                           le64toh(d->data.entry_array_offset),
2116                                           i,
2117                                           ret, offset);
2118 }
2119
2120 int journal_file_move_to_entry_by_offset_for_data(
2121                 JournalFile *f,
2122                 uint64_t data_offset,
2123                 uint64_t p,
2124                 direction_t direction,
2125                 Object **ret, uint64_t *offset) {
2126
2127         int r;
2128         Object *d;
2129
2130         assert(f);
2131
2132         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2133         if (r < 0)
2134                 return r;
2135
2136         return generic_array_bisect_plus_one(f,
2137                                              le64toh(d->data.entry_offset),
2138                                              le64toh(d->data.entry_array_offset),
2139                                              le64toh(d->data.n_entries),
2140                                              p,
2141                                              test_object_offset,
2142                                              direction,
2143                                              ret, offset, NULL);
2144 }
2145
2146 int journal_file_move_to_entry_by_monotonic_for_data(
2147                 JournalFile *f,
2148                 uint64_t data_offset,
2149                 sd_id128_t boot_id,
2150                 uint64_t monotonic,
2151                 direction_t direction,
2152                 Object **ret, uint64_t *offset) {
2153
2154         Object *o, *d;
2155         int r;
2156         uint64_t b, z;
2157
2158         assert(f);
2159
2160         /* First, seek by time */
2161         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2162         if (r < 0)
2163                 return r;
2164         if (r == 0)
2165                 return -ENOENT;
2166
2167         r = generic_array_bisect_plus_one(f,
2168                                           le64toh(o->data.entry_offset),
2169                                           le64toh(o->data.entry_array_offset),
2170                                           le64toh(o->data.n_entries),
2171                                           monotonic,
2172                                           test_object_monotonic,
2173                                           direction,
2174                                           NULL, &z, NULL);
2175         if (r <= 0)
2176                 return r;
2177
2178         /* And now, continue seeking until we find an entry that
2179          * exists in both bisection arrays */
2180
2181         for (;;) {
2182                 Object *qo;
2183                 uint64_t p, q;
2184
2185                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2186                 if (r < 0)
2187                         return r;
2188
2189                 r = generic_array_bisect_plus_one(f,
2190                                                   le64toh(d->data.entry_offset),
2191                                                   le64toh(d->data.entry_array_offset),
2192                                                   le64toh(d->data.n_entries),
2193                                                   z,
2194                                                   test_object_offset,
2195                                                   direction,
2196                                                   NULL, &p, NULL);
2197                 if (r <= 0)
2198                         return r;
2199
2200                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2201                 if (r < 0)
2202                         return r;
2203
2204                 r = generic_array_bisect_plus_one(f,
2205                                                   le64toh(o->data.entry_offset),
2206                                                   le64toh(o->data.entry_array_offset),
2207                                                   le64toh(o->data.n_entries),
2208                                                   p,
2209                                                   test_object_offset,
2210                                                   direction,
2211                                                   &qo, &q, NULL);
2212
2213                 if (r <= 0)
2214                         return r;
2215
2216                 if (p == q) {
2217                         if (ret)
2218                                 *ret = qo;
2219                         if (offset)
2220                                 *offset = q;
2221
2222                         return 1;
2223                 }
2224
2225                 z = q;
2226         }
2227 }
2228
2229 int journal_file_move_to_entry_by_seqnum_for_data(
2230                 JournalFile *f,
2231                 uint64_t data_offset,
2232                 uint64_t seqnum,
2233                 direction_t direction,
2234                 Object **ret, uint64_t *offset) {
2235
2236         Object *d;
2237         int r;
2238
2239         assert(f);
2240
2241         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2242         if (r < 0)
2243                 return r;
2244
2245         return generic_array_bisect_plus_one(f,
2246                                              le64toh(d->data.entry_offset),
2247                                              le64toh(d->data.entry_array_offset),
2248                                              le64toh(d->data.n_entries),
2249                                              seqnum,
2250                                              test_object_seqnum,
2251                                              direction,
2252                                              ret, offset, NULL);
2253 }
2254
2255 int journal_file_move_to_entry_by_realtime_for_data(
2256                 JournalFile *f,
2257                 uint64_t data_offset,
2258                 uint64_t realtime,
2259                 direction_t direction,
2260                 Object **ret, uint64_t *offset) {
2261
2262         Object *d;
2263         int r;
2264
2265         assert(f);
2266
2267         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2268         if (r < 0)
2269                 return r;
2270
2271         return generic_array_bisect_plus_one(f,
2272                                              le64toh(d->data.entry_offset),
2273                                              le64toh(d->data.entry_array_offset),
2274                                              le64toh(d->data.n_entries),
2275                                              realtime,
2276                                              test_object_realtime,
2277                                              direction,
2278                                              ret, offset, NULL);
2279 }
2280
2281 void journal_file_dump(JournalFile *f) {
2282         Object *o;
2283         int r;
2284         uint64_t p;
2285
2286         assert(f);
2287
2288         journal_file_print_header(f);
2289
2290         p = le64toh(f->header->header_size);
2291         while (p != 0) {
2292                 r = journal_file_move_to_object(f, -1, p, &o);
2293                 if (r < 0)
2294                         goto fail;
2295
2296                 switch (o->object.type) {
2297
2298                 case OBJECT_UNUSED:
2299                         printf("Type: OBJECT_UNUSED\n");
2300                         break;
2301
2302                 case OBJECT_DATA:
2303                         printf("Type: OBJECT_DATA\n");
2304                         break;
2305
2306                 case OBJECT_FIELD:
2307                         printf("Type: OBJECT_FIELD\n");
2308                         break;
2309
2310                 case OBJECT_ENTRY:
2311                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2312                                le64toh(o->entry.seqnum),
2313                                le64toh(o->entry.monotonic),
2314                                le64toh(o->entry.realtime));
2315                         break;
2316
2317                 case OBJECT_FIELD_HASH_TABLE:
2318                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2319                         break;
2320
2321                 case OBJECT_DATA_HASH_TABLE:
2322                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2323                         break;
2324
2325                 case OBJECT_ENTRY_ARRAY:
2326                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2327                         break;
2328
2329                 case OBJECT_TAG:
2330                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2331                                le64toh(o->tag.seqnum),
2332                                le64toh(o->tag.epoch));
2333                         break;
2334
2335                 default:
2336                         printf("Type: unknown (%u)\n", o->object.type);
2337                         break;
2338                 }
2339
2340                 if (o->object.flags & OBJECT_COMPRESSED)
2341                         printf("Flags: COMPRESSED\n");
2342
2343                 if (p == le64toh(f->header->tail_object_offset))
2344                         p = 0;
2345                 else
2346                         p = p + ALIGN64(le64toh(o->object.size));
2347         }
2348
2349         return;
2350 fail:
2351         log_error("File corrupt");
2352 }
2353
2354 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2355         const char *x;
2356
2357         x = format_timestamp(buf, l, t);
2358         if (x)
2359                 return x;
2360         return " --- ";
2361 }
2362
2363 void journal_file_print_header(JournalFile *f) {
2364         char a[33], b[33], c[33], d[33];
2365         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2366         struct stat st;
2367         char bytes[FORMAT_BYTES_MAX];
2368
2369         assert(f);
2370
2371         printf("File Path: %s\n"
2372                "File ID: %s\n"
2373                "Machine ID: %s\n"
2374                "Boot ID: %s\n"
2375                "Sequential Number ID: %s\n"
2376                "State: %s\n"
2377                "Compatible Flags:%s%s\n"
2378                "Incompatible Flags:%s%s\n"
2379                "Header size: %"PRIu64"\n"
2380                "Arena size: %"PRIu64"\n"
2381                "Data Hash Table Size: %"PRIu64"\n"
2382                "Field Hash Table Size: %"PRIu64"\n"
2383                "Rotate Suggested: %s\n"
2384                "Head Sequential Number: %"PRIu64"\n"
2385                "Tail Sequential Number: %"PRIu64"\n"
2386                "Head Realtime Timestamp: %s\n"
2387                "Tail Realtime Timestamp: %s\n"
2388                "Tail Monotonic Timestamp: %s\n"
2389                "Objects: %"PRIu64"\n"
2390                "Entry Objects: %"PRIu64"\n",
2391                f->path,
2392                sd_id128_to_string(f->header->file_id, a),
2393                sd_id128_to_string(f->header->machine_id, b),
2394                sd_id128_to_string(f->header->boot_id, c),
2395                sd_id128_to_string(f->header->seqnum_id, d),
2396                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2397                f->header->state == STATE_ONLINE ? "ONLINE" :
2398                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2399                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2400                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2401                JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2402                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2403                le64toh(f->header->header_size),
2404                le64toh(f->header->arena_size),
2405                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2406                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2407                yes_no(journal_file_rotate_suggested(f, 0)),
2408                le64toh(f->header->head_entry_seqnum),
2409                le64toh(f->header->tail_entry_seqnum),
2410                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2411                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2412                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2413                le64toh(f->header->n_objects),
2414                le64toh(f->header->n_entries));
2415
2416         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2417                 printf("Data Objects: %"PRIu64"\n"
2418                        "Data Hash Table Fill: %.1f%%\n",
2419                        le64toh(f->header->n_data),
2420                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2421
2422         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2423                 printf("Field Objects: %"PRIu64"\n"
2424                        "Field Hash Table Fill: %.1f%%\n",
2425                        le64toh(f->header->n_fields),
2426                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2427
2428         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2429                 printf("Tag Objects: %"PRIu64"\n",
2430                        le64toh(f->header->n_tags));
2431         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2432                 printf("Entry Array Objects: %"PRIu64"\n",
2433                        le64toh(f->header->n_entry_arrays));
2434
2435         if (fstat(f->fd, &st) >= 0)
2436                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2437 }
2438
2439 int journal_file_open(
2440                 const char *fname,
2441                 int flags,
2442                 mode_t mode,
2443                 bool compress,
2444                 bool seal,
2445                 JournalMetrics *metrics,
2446                 MMapCache *mmap_cache,
2447                 JournalFile *template,
2448                 JournalFile **ret) {
2449
2450         JournalFile *f;
2451         int r;
2452         bool newly_created = false;
2453
2454         assert(fname);
2455         assert(ret);
2456
2457         if ((flags & O_ACCMODE) != O_RDONLY &&
2458             (flags & O_ACCMODE) != O_RDWR)
2459                 return -EINVAL;
2460
2461         if (!endswith(fname, ".journal") &&
2462             !endswith(fname, ".journal~"))
2463                 return -EINVAL;
2464
2465         f = new0(JournalFile, 1);
2466         if (!f)
2467                 return -ENOMEM;
2468
2469         f->fd = -1;
2470         f->mode = mode;
2471
2472         f->flags = flags;
2473         f->prot = prot_from_flags(flags);
2474         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2475 #ifdef HAVE_XZ
2476         f->compress = compress;
2477 #endif
2478 #ifdef HAVE_GCRYPT
2479         f->seal = seal;
2480 #endif
2481
2482         if (mmap_cache)
2483                 f->mmap = mmap_cache_ref(mmap_cache);
2484         else {
2485                 f->mmap = mmap_cache_new();
2486                 if (!f->mmap) {
2487                         r = -ENOMEM;
2488                         goto fail;
2489                 }
2490         }
2491
2492         f->path = strdup(fname);
2493         if (!f->path) {
2494                 r = -ENOMEM;
2495                 goto fail;
2496         }
2497
2498         f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2499         if (!f->chain_cache) {
2500                 r = -ENOMEM;
2501                 goto fail;
2502         }
2503
2504         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2505         if (f->fd < 0) {
2506                 r = -errno;
2507                 goto fail;
2508         }
2509
2510         if (fstat(f->fd, &f->last_stat) < 0) {
2511                 r = -errno;
2512                 goto fail;
2513         }
2514
2515         if (f->last_stat.st_size == 0 && f->writable) {
2516 #ifdef HAVE_XATTR
2517                 uint64_t crtime;
2518
2519                 /* Let's attach the creation time to the journal file,
2520                  * so that the vacuuming code knows the age of this
2521                  * file even if the file might end up corrupted one
2522                  * day... Ideally we'd just use the creation time many
2523                  * file systems maintain for each file, but there is
2524                  * currently no usable API to query this, hence let's
2525                  * emulate this via extended attributes. If extended
2526                  * attributes are not supported we'll just skip this,
2527                  * and rely solely on mtime/atime/ctime of the file.*/
2528
2529                 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2530                 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2531 #endif
2532
2533 #ifdef HAVE_GCRYPT
2534                 /* Try to load the FSPRG state, and if we can't, then
2535                  * just don't do sealing */
2536                 if (f->seal) {
2537                         r = journal_file_fss_load(f);
2538                         if (r < 0)
2539                                 f->seal = false;
2540                 }
2541 #endif
2542
2543                 r = journal_file_init_header(f, template);
2544                 if (r < 0)
2545                         goto fail;
2546
2547                 if (fstat(f->fd, &f->last_stat) < 0) {
2548                         r = -errno;
2549                         goto fail;
2550                 }
2551
2552                 newly_created = true;
2553         }
2554
2555         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2556                 r = -EIO;
2557                 goto fail;
2558         }
2559
2560         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2561         if (f->header == MAP_FAILED) {
2562                 f->header = NULL;
2563                 r = -errno;
2564                 goto fail;
2565         }
2566
2567         if (!newly_created) {
2568                 r = journal_file_verify_header(f);
2569                 if (r < 0)
2570                         goto fail;
2571         }
2572
2573 #ifdef HAVE_GCRYPT
2574         if (!newly_created && f->writable) {
2575                 r = journal_file_fss_load(f);
2576                 if (r < 0)
2577                         goto fail;
2578         }
2579 #endif
2580
2581         if (f->writable) {
2582                 if (metrics) {
2583                         journal_default_metrics(metrics, f->fd);
2584                         f->metrics = *metrics;
2585                 } else if (template)
2586                         f->metrics = template->metrics;
2587
2588                 r = journal_file_refresh_header(f);
2589                 if (r < 0)
2590                         goto fail;
2591         }
2592
2593 #ifdef HAVE_GCRYPT
2594         r = journal_file_hmac_setup(f);
2595         if (r < 0)
2596                 goto fail;
2597 #endif
2598
2599         if (newly_created) {
2600                 r = journal_file_setup_field_hash_table(f);
2601                 if (r < 0)
2602                         goto fail;
2603
2604                 r = journal_file_setup_data_hash_table(f);
2605                 if (r < 0)
2606                         goto fail;
2607
2608 #ifdef HAVE_GCRYPT
2609                 r = journal_file_append_first_tag(f);
2610                 if (r < 0)
2611                         goto fail;
2612 #endif
2613         }
2614
2615         r = journal_file_map_field_hash_table(f);
2616         if (r < 0)
2617                 goto fail;
2618
2619         r = journal_file_map_data_hash_table(f);
2620         if (r < 0)
2621                 goto fail;
2622
2623         *ret = f;
2624         return 0;
2625
2626 fail:
2627         journal_file_close(f);
2628
2629         return r;
2630 }
2631
2632 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2633         _cleanup_free_ char *p = NULL;
2634         size_t l;
2635         JournalFile *old_file, *new_file = NULL;
2636         int r;
2637
2638         assert(f);
2639         assert(*f);
2640
2641         old_file = *f;
2642
2643         if (!old_file->writable)
2644                 return -EINVAL;
2645
2646         if (!endswith(old_file->path, ".journal"))
2647                 return -EINVAL;
2648
2649         l = strlen(old_file->path);
2650         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2651                      (int) l - 8, old_file->path,
2652                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2653                      le64toh((*f)->header->head_entry_seqnum),
2654                      le64toh((*f)->header->head_entry_realtime));
2655         if (r < 0)
2656                 return -ENOMEM;
2657
2658         r = rename(old_file->path, p);
2659         if (r < 0)
2660                 return -errno;
2661
2662         old_file->header->state = STATE_ARCHIVED;
2663
2664         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2665         journal_file_close(old_file);
2666
2667         *f = new_file;
2668         return r;
2669 }
2670
2671 int journal_file_open_reliably(
2672                 const char *fname,
2673                 int flags,
2674                 mode_t mode,
2675                 bool compress,
2676                 bool seal,
2677                 JournalMetrics *metrics,
2678                 MMapCache *mmap_cache,
2679                 JournalFile *template,
2680                 JournalFile **ret) {
2681
2682         int r;
2683         size_t l;
2684         _cleanup_free_ char *p = NULL;
2685
2686         r = journal_file_open(fname, flags, mode, compress, seal,
2687                               metrics, mmap_cache, template, ret);
2688         if (r != -EBADMSG && /* corrupted */
2689             r != -ENODATA && /* truncated */
2690             r != -EHOSTDOWN && /* other machine */
2691             r != -EPROTONOSUPPORT && /* incompatible feature */
2692             r != -EBUSY && /* unclean shutdown */
2693             r != -ESHUTDOWN /* already archived */)
2694                 return r;
2695
2696         if ((flags & O_ACCMODE) == O_RDONLY)
2697                 return r;
2698
2699         if (!(flags & O_CREAT))
2700                 return r;
2701
2702         if (!endswith(fname, ".journal"))
2703                 return r;
2704
2705         /* The file is corrupted. Rotate it away and try it again (but only once) */
2706
2707         l = strlen(fname);
2708         if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2709                      (int) l - 8, fname,
2710                      (unsigned long long) now(CLOCK_REALTIME),
2711                      random_u64()) < 0)
2712                 return -ENOMEM;
2713
2714         r = rename(fname, p);
2715         if (r < 0)
2716                 return -errno;
2717
2718         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2719
2720         return journal_file_open(fname, flags, mode, compress, seal,
2721                                  metrics, mmap_cache, template, ret);
2722 }
2723
2724 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2725         uint64_t i, n;
2726         uint64_t q, xor_hash = 0;
2727         int r;
2728         EntryItem *items;
2729         dual_timestamp ts;
2730
2731         assert(from);
2732         assert(to);
2733         assert(o);
2734         assert(p);
2735
2736         if (!to->writable)
2737                 return -EPERM;
2738
2739         ts.monotonic = le64toh(o->entry.monotonic);
2740         ts.realtime = le64toh(o->entry.realtime);
2741
2742         n = journal_file_entry_n_items(o);
2743         /* alloca() can't take 0, hence let's allocate at least one */
2744         items = alloca(sizeof(EntryItem) * MAX(1u, n));
2745
2746         for (i = 0; i < n; i++) {
2747                 uint64_t l, h;
2748                 le64_t le_hash;
2749                 size_t t;
2750                 void *data;
2751                 Object *u;
2752
2753                 q = le64toh(o->entry.items[i].object_offset);
2754                 le_hash = o->entry.items[i].hash;
2755
2756                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2757                 if (r < 0)
2758                         return r;
2759
2760                 if (le_hash != o->data.hash)
2761                         return -EBADMSG;
2762
2763                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2764                 t = (size_t) l;
2765
2766                 /* We hit the limit on 32bit machines */
2767                 if ((uint64_t) t != l)
2768                         return -E2BIG;
2769
2770                 if (o->object.flags & OBJECT_COMPRESSED) {
2771 #ifdef HAVE_XZ
2772                         uint64_t rsize;
2773
2774                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0))
2775                                 return -EBADMSG;
2776
2777                         data = from->compress_buffer;
2778                         l = rsize;
2779 #else
2780                         return -EPROTONOSUPPORT;
2781 #endif
2782                 } else
2783                         data = o->data.payload;
2784
2785                 r = journal_file_append_data(to, data, l, &u, &h);
2786                 if (r < 0)
2787                         return r;
2788
2789                 xor_hash ^= le64toh(u->data.hash);
2790                 items[i].object_offset = htole64(h);
2791                 items[i].hash = u->data.hash;
2792
2793                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2794                 if (r < 0)
2795                         return r;
2796         }
2797
2798         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2799 }
2800
2801 void journal_default_metrics(JournalMetrics *m, int fd) {
2802         uint64_t fs_size = 0;
2803         struct statvfs ss;
2804         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2805
2806         assert(m);
2807         assert(fd >= 0);
2808
2809         if (fstatvfs(fd, &ss) >= 0)
2810                 fs_size = ss.f_frsize * ss.f_blocks;
2811
2812         if (m->max_use == (uint64_t) -1) {
2813
2814                 if (fs_size > 0) {
2815                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2816
2817                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2818                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2819
2820                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2821                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2822                 } else
2823                         m->max_use = DEFAULT_MAX_USE_LOWER;
2824         } else {
2825                 m->max_use = PAGE_ALIGN(m->max_use);
2826
2827                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2828                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2829         }
2830
2831         if (m->max_size == (uint64_t) -1) {
2832                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2833
2834                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2835                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2836         } else
2837                 m->max_size = PAGE_ALIGN(m->max_size);
2838
2839         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2840                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2841
2842         if (m->max_size*2 > m->max_use)
2843                 m->max_use = m->max_size*2;
2844
2845         if (m->min_size == (uint64_t) -1)
2846                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2847         else {
2848                 m->min_size = PAGE_ALIGN(m->min_size);
2849
2850                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2851                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2852
2853                 if (m->min_size > m->max_size)
2854                         m->max_size = m->min_size;
2855         }
2856
2857         if (m->keep_free == (uint64_t) -1) {
2858
2859                 if (fs_size > 0) {
2860                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2861
2862                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2863                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2864
2865                 } else
2866                         m->keep_free = DEFAULT_KEEP_FREE;
2867         }
2868
2869         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2870                   format_bytes(a, sizeof(a), m->max_use),
2871                   format_bytes(b, sizeof(b), m->max_size),
2872                   format_bytes(c, sizeof(c), m->min_size),
2873                   format_bytes(d, sizeof(d), m->keep_free));
2874 }
2875
2876 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2877         assert(f);
2878         assert(from || to);
2879
2880         if (from) {
2881                 if (f->header->head_entry_realtime == 0)
2882                         return -ENOENT;
2883
2884                 *from = le64toh(f->header->head_entry_realtime);
2885         }
2886
2887         if (to) {
2888                 if (f->header->tail_entry_realtime == 0)
2889                         return -ENOENT;
2890
2891                 *to = le64toh(f->header->tail_entry_realtime);
2892         }
2893
2894         return 1;
2895 }
2896
2897 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2898         Object *o;
2899         uint64_t p;
2900         int r;
2901
2902         assert(f);
2903         assert(from || to);
2904
2905         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2906         if (r <= 0)
2907                 return r;
2908
2909         if (le64toh(o->data.n_entries) <= 0)
2910                 return 0;
2911
2912         if (from) {
2913                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2914                 if (r < 0)
2915                         return r;
2916
2917                 *from = le64toh(o->entry.monotonic);
2918         }
2919
2920         if (to) {
2921                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2922                 if (r < 0)
2923                         return r;
2924
2925                 r = generic_array_get_plus_one(f,
2926                                                le64toh(o->data.entry_offset),
2927                                                le64toh(o->data.entry_array_offset),
2928                                                le64toh(o->data.n_entries)-1,
2929                                                &o, NULL);
2930                 if (r <= 0)
2931                         return r;
2932
2933                 *to = le64toh(o->entry.monotonic);
2934         }
2935
2936         return 1;
2937 }
2938
2939 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2940         assert(f);
2941
2942         /* If we gained new header fields we gained new features,
2943          * hence suggest a rotation */
2944         if (le64toh(f->header->header_size) < sizeof(Header)) {
2945                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2946                 return true;
2947         }
2948
2949         /* Let's check if the hash tables grew over a certain fill
2950          * level (75%, borrowing this value from Java's hash table
2951          * implementation), and if so suggest a rotation. To calculate
2952          * the fill level we need the n_data field, which only exists
2953          * in newer versions. */
2954
2955         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2956                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2957                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2958                                   f->path,
2959                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2960                                   le64toh(f->header->n_data),
2961                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2962                                   (unsigned long long) f->last_stat.st_size,
2963                                   f->last_stat.st_size / le64toh(f->header->n_data));
2964                         return true;
2965                 }
2966
2967         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2968                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2969                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2970                                   f->path,
2971                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2972                                   le64toh(f->header->n_fields),
2973                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2974                         return true;
2975                 }
2976
2977         /* Are the data objects properly indexed by field objects? */
2978         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2979             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2980             le64toh(f->header->n_data) > 0 &&
2981             le64toh(f->header->n_fields) == 0)
2982                 return true;
2983
2984         if (max_file_usec > 0) {
2985                 usec_t t, h;
2986
2987                 h = le64toh(f->header->head_entry_realtime);
2988                 t = now(CLOCK_REALTIME);
2989
2990                 if (h > 0 && t > h + max_file_usec)
2991                         return true;
2992         }
2993
2994         return false;
2995 }