chiark / gitweb /
5876733598c46326134463eb6dea3e2a9d5790d2
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #ifdef HAVE_XATTR
31 #include <attr/xattr.h>
32 #endif
33
34 #include "journal-def.h"
35 #include "journal-file.h"
36 #include "journal-authenticate.h"
37 #include "lookup3.h"
38 #include "compress.h"
39 #include "fsprg.h"
40
41 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
42 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
43
44 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
45
46 /* This is the minimum journal file size */
47 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL)           /* 4 MiB */
48
49 /* These are the lower and upper bounds if we deduce the max_use value
50  * from the file system size */
51 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
52 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
53
54 /* This is the upper bound if we deduce max_size from max_use */
55 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
56
57 /* This is the upper bound if we deduce the keep_free value from the
58  * file system size */
59 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
60
61 /* This is the keep_free value when we can't determine the system
62  * size */
63 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MB */
64
65 /* n_data was the first entry we added after the initial file format design */
66 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
67
68 /* How many entries to keep in the entry array chain cache at max */
69 #define CHAIN_CACHE_MAX 20
70
71 /* How much to increase the journal file size at once each time we allocate something new. */
72 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8MB */
73
74 static int journal_file_set_online(JournalFile *f) {
75         assert(f);
76
77         if (!f->writable)
78                 return -EPERM;
79
80         if (!(f->fd >= 0 && f->header))
81                 return -EINVAL;
82
83         switch(f->header->state) {
84                 case STATE_ONLINE:
85                         return 0;
86
87                 case STATE_OFFLINE:
88                         f->header->state = STATE_ONLINE;
89                         fsync(f->fd);
90                         return 0;
91
92                 default:
93                         return -EINVAL;
94         }
95 }
96
97 int journal_file_set_offline(JournalFile *f) {
98         assert(f);
99
100         if (!f->writable)
101                 return -EPERM;
102
103         if (!(f->fd >= 0 && f->header))
104                 return -EINVAL;
105
106         if (f->header->state != STATE_ONLINE)
107                 return 0;
108
109         fsync(f->fd);
110
111         f->header->state = STATE_OFFLINE;
112
113         fsync(f->fd);
114
115         return 0;
116 }
117
118 void journal_file_close(JournalFile *f) {
119         assert(f);
120
121 #ifdef HAVE_GCRYPT
122         /* Write the final tag */
123         if (f->seal && f->writable)
124                 journal_file_append_tag(f);
125 #endif
126
127         /* Sync everything to disk, before we mark the file offline */
128         if (f->mmap && f->fd >= 0)
129                 mmap_cache_close_fd(f->mmap, f->fd);
130
131         journal_file_set_offline(f);
132
133         if (f->header)
134                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
135
136         if (f->fd >= 0)
137                 close_nointr_nofail(f->fd);
138
139         free(f->path);
140
141         if (f->mmap)
142                 mmap_cache_unref(f->mmap);
143
144         hashmap_free_free(f->chain_cache);
145
146 #ifdef HAVE_XZ
147         free(f->compress_buffer);
148 #endif
149
150 #ifdef HAVE_GCRYPT
151         if (f->fss_file)
152                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
153         else if (f->fsprg_state)
154                 free(f->fsprg_state);
155
156         free(f->fsprg_seed);
157
158         if (f->hmac)
159                 gcry_md_close(f->hmac);
160 #endif
161
162         free(f);
163 }
164
165 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
166         Header h;
167         ssize_t k;
168         int r;
169
170         assert(f);
171
172         zero(h);
173         memcpy(h.signature, HEADER_SIGNATURE, 8);
174         h.header_size = htole64(ALIGN64(sizeof(h)));
175
176         h.incompatible_flags =
177                 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
178
179         h.compatible_flags =
180                 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
181
182         r = sd_id128_randomize(&h.file_id);
183         if (r < 0)
184                 return r;
185
186         if (template) {
187                 h.seqnum_id = template->header->seqnum_id;
188                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
189         } else
190                 h.seqnum_id = h.file_id;
191
192         k = pwrite(f->fd, &h, sizeof(h), 0);
193         if (k < 0)
194                 return -errno;
195
196         if (k != sizeof(h))
197                 return -EIO;
198
199         return 0;
200 }
201
202 static int journal_file_refresh_header(JournalFile *f) {
203         int r;
204         sd_id128_t boot_id;
205
206         assert(f);
207
208         r = sd_id128_get_machine(&f->header->machine_id);
209         if (r < 0)
210                 return r;
211
212         r = sd_id128_get_boot(&boot_id);
213         if (r < 0)
214                 return r;
215
216         if (sd_id128_equal(boot_id, f->header->boot_id))
217                 f->tail_entry_monotonic_valid = true;
218
219         f->header->boot_id = boot_id;
220
221         journal_file_set_online(f);
222
223         /* Sync the online state to disk */
224         fsync(f->fd);
225
226         return 0;
227 }
228
229 static int journal_file_verify_header(JournalFile *f) {
230         assert(f);
231
232         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
233                 return -EBADMSG;
234
235         /* In both read and write mode we refuse to open files with
236          * incompatible flags we don't know */
237 #ifdef HAVE_XZ
238         if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
239                 return -EPROTONOSUPPORT;
240 #else
241         if (f->header->incompatible_flags != 0)
242                 return -EPROTONOSUPPORT;
243 #endif
244
245         /* When open for writing we refuse to open files with
246          * compatible flags, too */
247         if (f->writable) {
248 #ifdef HAVE_GCRYPT
249                 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
250                         return -EPROTONOSUPPORT;
251 #else
252                 if (f->header->compatible_flags != 0)
253                         return -EPROTONOSUPPORT;
254 #endif
255         }
256
257         if (f->header->state >= _STATE_MAX)
258                 return -EBADMSG;
259
260         /* The first addition was n_data, so check that we are at least this large */
261         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
262                 return -EBADMSG;
263
264         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
265                 return -EBADMSG;
266
267         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
268                 return -ENODATA;
269
270         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
271                 return -ENODATA;
272
273         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
274             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
275             !VALID64(le64toh(f->header->tail_object_offset)) ||
276             !VALID64(le64toh(f->header->entry_array_offset)))
277                 return -ENODATA;
278
279         if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
280             le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
281             le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
282             le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
283                 return -ENODATA;
284
285         if (f->writable) {
286                 uint8_t state;
287                 sd_id128_t machine_id;
288                 int r;
289
290                 r = sd_id128_get_machine(&machine_id);
291                 if (r < 0)
292                         return r;
293
294                 if (!sd_id128_equal(machine_id, f->header->machine_id))
295                         return -EHOSTDOWN;
296
297                 state = f->header->state;
298
299                 if (state == STATE_ONLINE) {
300                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
301                         return -EBUSY;
302                 } else if (state == STATE_ARCHIVED)
303                         return -ESHUTDOWN;
304                 else if (state != STATE_OFFLINE) {
305                         log_debug("Journal file %s has unknown state %u.", f->path, state);
306                         return -EBUSY;
307                 }
308         }
309
310         f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
311
312         f->seal = JOURNAL_HEADER_SEALED(f->header);
313
314         return 0;
315 }
316
317 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
318         uint64_t old_size, new_size;
319         int r;
320
321         assert(f);
322
323         /* We assume that this file is not sparse, and we know that
324          * for sure, since we always call posix_fallocate()
325          * ourselves */
326
327         old_size =
328                 le64toh(f->header->header_size) +
329                 le64toh(f->header->arena_size);
330
331         new_size = PAGE_ALIGN(offset + size);
332         if (new_size < le64toh(f->header->header_size))
333                 new_size = le64toh(f->header->header_size);
334
335         if (new_size <= old_size)
336                 return 0;
337
338         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
339                 return -E2BIG;
340
341         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
342                 struct statvfs svfs;
343
344                 if (fstatvfs(f->fd, &svfs) >= 0) {
345                         uint64_t available;
346
347                         available = svfs.f_bfree * svfs.f_bsize;
348
349                         if (available >= f->metrics.keep_free)
350                                 available -= f->metrics.keep_free;
351                         else
352                                 available = 0;
353
354                         if (new_size - old_size > available)
355                                 return -E2BIG;
356                 }
357         }
358
359         /* Increase by larger blocks at once */
360         new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
361         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
362                 new_size = f->metrics.max_size;
363
364         /* Note that the glibc fallocate() fallback is very
365            inefficient, hence we try to minimize the allocation area
366            as we can. */
367         r = posix_fallocate(f->fd, old_size, new_size - old_size);
368         if (r != 0)
369                 return -r;
370
371         if (fstat(f->fd, &f->last_stat) < 0)
372                 return -errno;
373
374         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
375
376         return 0;
377 }
378
379 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
380         assert(f);
381         assert(ret);
382
383         if (size <= 0)
384                 return -EINVAL;
385
386         /* Avoid SIGBUS on invalid accesses */
387         if (offset + size > (uint64_t) f->last_stat.st_size) {
388                 /* Hmm, out of range? Let's refresh the fstat() data
389                  * first, before we trust that check. */
390
391                 if (fstat(f->fd, &f->last_stat) < 0 ||
392                     offset + size > (uint64_t) f->last_stat.st_size)
393                         return -EADDRNOTAVAIL;
394         }
395
396         return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
397 }
398
399 static uint64_t minimum_header_size(Object *o) {
400
401         static const uint64_t table[] = {
402                 [OBJECT_DATA] = sizeof(DataObject),
403                 [OBJECT_FIELD] = sizeof(FieldObject),
404                 [OBJECT_ENTRY] = sizeof(EntryObject),
405                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
406                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
407                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
408                 [OBJECT_TAG] = sizeof(TagObject),
409         };
410
411         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
412                 return sizeof(ObjectHeader);
413
414         return table[o->object.type];
415 }
416
417 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
418         int r;
419         void *t;
420         Object *o;
421         uint64_t s;
422
423         assert(f);
424         assert(ret);
425
426         /* Objects may only be located at multiple of 64 bit */
427         if (!VALID64(offset))
428                 return -EFAULT;
429
430
431         r = journal_file_move_to(f, type_to_context(type), false, offset, sizeof(ObjectHeader), &t);
432         if (r < 0)
433                 return r;
434
435         o = (Object*) t;
436         s = le64toh(o->object.size);
437
438         if (s < sizeof(ObjectHeader))
439                 return -EBADMSG;
440
441         if (o->object.type <= OBJECT_UNUSED)
442                 return -EBADMSG;
443
444         if (s < minimum_header_size(o))
445                 return -EBADMSG;
446
447         if (type > 0 && o->object.type != type)
448                 return -EBADMSG;
449
450         if (s > sizeof(ObjectHeader)) {
451                 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
452                 if (r < 0)
453                         return r;
454
455                 o = (Object*) t;
456         }
457
458         *ret = o;
459         return 0;
460 }
461
462 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
463         uint64_t r;
464
465         assert(f);
466
467         r = le64toh(f->header->tail_entry_seqnum) + 1;
468
469         if (seqnum) {
470                 /* If an external seqnum counter was passed, we update
471                  * both the local and the external one, and set it to
472                  * the maximum of both */
473
474                 if (*seqnum + 1 > r)
475                         r = *seqnum + 1;
476
477                 *seqnum = r;
478         }
479
480         f->header->tail_entry_seqnum = htole64(r);
481
482         if (f->header->head_entry_seqnum == 0)
483                 f->header->head_entry_seqnum = htole64(r);
484
485         return r;
486 }
487
488 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
489         int r;
490         uint64_t p;
491         Object *tail, *o;
492         void *t;
493
494         assert(f);
495         assert(type > 0 && type < _OBJECT_TYPE_MAX);
496         assert(size >= sizeof(ObjectHeader));
497         assert(offset);
498         assert(ret);
499
500         r = journal_file_set_online(f);
501         if (r < 0)
502                 return r;
503
504         p = le64toh(f->header->tail_object_offset);
505         if (p == 0)
506                 p = le64toh(f->header->header_size);
507         else {
508                 r = journal_file_move_to_object(f, -1, p, &tail);
509                 if (r < 0)
510                         return r;
511
512                 p += ALIGN64(le64toh(tail->object.size));
513         }
514
515         r = journal_file_allocate(f, p, size);
516         if (r < 0)
517                 return r;
518
519         r = journal_file_move_to(f, type, false, p, size, &t);
520         if (r < 0)
521                 return r;
522
523         o = (Object*) t;
524
525         zero(o->object);
526         o->object.type = type;
527         o->object.size = htole64(size);
528
529         f->header->tail_object_offset = htole64(p);
530         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
531
532         *ret = o;
533         *offset = p;
534
535         return 0;
536 }
537
538 static int journal_file_setup_data_hash_table(JournalFile *f) {
539         uint64_t s, p;
540         Object *o;
541         int r;
542
543         assert(f);
544
545         /* We estimate that we need 1 hash table entry per 768 of
546            journal file and we want to make sure we never get beyond
547            75% fill level. Calculate the hash table size for the
548            maximum file size based on these metrics. */
549
550         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
551         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
552                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
553
554         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
555
556         r = journal_file_append_object(f,
557                                        OBJECT_DATA_HASH_TABLE,
558                                        offsetof(Object, hash_table.items) + s,
559                                        &o, &p);
560         if (r < 0)
561                 return r;
562
563         memzero(o->hash_table.items, s);
564
565         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
566         f->header->data_hash_table_size = htole64(s);
567
568         return 0;
569 }
570
571 static int journal_file_setup_field_hash_table(JournalFile *f) {
572         uint64_t s, p;
573         Object *o;
574         int r;
575
576         assert(f);
577
578         /* We use a fixed size hash table for the fields as this
579          * number should grow very slowly only */
580
581         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
582         r = journal_file_append_object(f,
583                                        OBJECT_FIELD_HASH_TABLE,
584                                        offsetof(Object, hash_table.items) + s,
585                                        &o, &p);
586         if (r < 0)
587                 return r;
588
589         memzero(o->hash_table.items, s);
590
591         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
592         f->header->field_hash_table_size = htole64(s);
593
594         return 0;
595 }
596
597 static int journal_file_map_data_hash_table(JournalFile *f) {
598         uint64_t s, p;
599         void *t;
600         int r;
601
602         assert(f);
603
604         p = le64toh(f->header->data_hash_table_offset);
605         s = le64toh(f->header->data_hash_table_size);
606
607         r = journal_file_move_to(f,
608                                  OBJECT_DATA_HASH_TABLE,
609                                  true,
610                                  p, s,
611                                  &t);
612         if (r < 0)
613                 return r;
614
615         f->data_hash_table = t;
616         return 0;
617 }
618
619 static int journal_file_map_field_hash_table(JournalFile *f) {
620         uint64_t s, p;
621         void *t;
622         int r;
623
624         assert(f);
625
626         p = le64toh(f->header->field_hash_table_offset);
627         s = le64toh(f->header->field_hash_table_size);
628
629         r = journal_file_move_to(f,
630                                  OBJECT_FIELD_HASH_TABLE,
631                                  true,
632                                  p, s,
633                                  &t);
634         if (r < 0)
635                 return r;
636
637         f->field_hash_table = t;
638         return 0;
639 }
640
641 static int journal_file_link_field(
642                 JournalFile *f,
643                 Object *o,
644                 uint64_t offset,
645                 uint64_t hash) {
646
647         uint64_t p, h;
648         int r;
649
650         assert(f);
651         assert(o);
652         assert(offset > 0);
653
654         if (o->object.type != OBJECT_FIELD)
655                 return -EINVAL;
656
657         /* This might alter the window we are looking at */
658
659         o->field.next_hash_offset = o->field.head_data_offset = 0;
660
661         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
662         p = le64toh(f->field_hash_table[h].tail_hash_offset);
663         if (p == 0)
664                 f->field_hash_table[h].head_hash_offset = htole64(offset);
665         else {
666                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
667                 if (r < 0)
668                         return r;
669
670                 o->field.next_hash_offset = htole64(offset);
671         }
672
673         f->field_hash_table[h].tail_hash_offset = htole64(offset);
674
675         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
676                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
677
678         return 0;
679 }
680
681 static int journal_file_link_data(
682                 JournalFile *f,
683                 Object *o,
684                 uint64_t offset,
685                 uint64_t hash) {
686
687         uint64_t p, h;
688         int r;
689
690         assert(f);
691         assert(o);
692         assert(offset > 0);
693
694         if (o->object.type != OBJECT_DATA)
695                 return -EINVAL;
696
697         /* This might alter the window we are looking at */
698
699         o->data.next_hash_offset = o->data.next_field_offset = 0;
700         o->data.entry_offset = o->data.entry_array_offset = 0;
701         o->data.n_entries = 0;
702
703         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
704         p = le64toh(f->data_hash_table[h].tail_hash_offset);
705         if (p == 0)
706                 /* Only entry in the hash table is easy */
707                 f->data_hash_table[h].head_hash_offset = htole64(offset);
708         else {
709                 /* Move back to the previous data object, to patch in
710                  * pointer */
711
712                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
713                 if (r < 0)
714                         return r;
715
716                 o->data.next_hash_offset = htole64(offset);
717         }
718
719         f->data_hash_table[h].tail_hash_offset = htole64(offset);
720
721         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
722                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
723
724         return 0;
725 }
726
727 int journal_file_find_field_object_with_hash(
728                 JournalFile *f,
729                 const void *field, uint64_t size, uint64_t hash,
730                 Object **ret, uint64_t *offset) {
731
732         uint64_t p, osize, h;
733         int r;
734
735         assert(f);
736         assert(field && size > 0);
737
738         osize = offsetof(Object, field.payload) + size;
739
740         if (f->header->field_hash_table_size == 0)
741                 return -EBADMSG;
742
743         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
744         p = le64toh(f->field_hash_table[h].head_hash_offset);
745
746         while (p > 0) {
747                 Object *o;
748
749                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
750                 if (r < 0)
751                         return r;
752
753                 if (le64toh(o->field.hash) == hash &&
754                     le64toh(o->object.size) == osize &&
755                     memcmp(o->field.payload, field, size) == 0) {
756
757                         if (ret)
758                                 *ret = o;
759                         if (offset)
760                                 *offset = p;
761
762                         return 1;
763                 }
764
765                 p = le64toh(o->field.next_hash_offset);
766         }
767
768         return 0;
769 }
770
771 int journal_file_find_field_object(
772                 JournalFile *f,
773                 const void *field, uint64_t size,
774                 Object **ret, uint64_t *offset) {
775
776         uint64_t hash;
777
778         assert(f);
779         assert(field && size > 0);
780
781         hash = hash64(field, size);
782
783         return journal_file_find_field_object_with_hash(f,
784                                                         field, size, hash,
785                                                         ret, offset);
786 }
787
788 int journal_file_find_data_object_with_hash(
789                 JournalFile *f,
790                 const void *data, uint64_t size, uint64_t hash,
791                 Object **ret, uint64_t *offset) {
792
793         uint64_t p, osize, h;
794         int r;
795
796         assert(f);
797         assert(data || size == 0);
798
799         osize = offsetof(Object, data.payload) + size;
800
801         if (f->header->data_hash_table_size == 0)
802                 return -EBADMSG;
803
804         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
805         p = le64toh(f->data_hash_table[h].head_hash_offset);
806
807         while (p > 0) {
808                 Object *o;
809
810                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
811                 if (r < 0)
812                         return r;
813
814                 if (le64toh(o->data.hash) != hash)
815                         goto next;
816
817                 if (o->object.flags & OBJECT_COMPRESSED) {
818 #ifdef HAVE_XZ
819                         uint64_t l, rsize;
820
821                         l = le64toh(o->object.size);
822                         if (l <= offsetof(Object, data.payload))
823                                 return -EBADMSG;
824
825                         l -= offsetof(Object, data.payload);
826
827                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0))
828                                 return -EBADMSG;
829
830                         if (rsize == size &&
831                             memcmp(f->compress_buffer, data, size) == 0) {
832
833                                 if (ret)
834                                         *ret = o;
835
836                                 if (offset)
837                                         *offset = p;
838
839                                 return 1;
840                         }
841 #else
842                         return -EPROTONOSUPPORT;
843 #endif
844
845                 } else if (le64toh(o->object.size) == osize &&
846                            memcmp(o->data.payload, data, size) == 0) {
847
848                         if (ret)
849                                 *ret = o;
850
851                         if (offset)
852                                 *offset = p;
853
854                         return 1;
855                 }
856
857         next:
858                 p = le64toh(o->data.next_hash_offset);
859         }
860
861         return 0;
862 }
863
864 int journal_file_find_data_object(
865                 JournalFile *f,
866                 const void *data, uint64_t size,
867                 Object **ret, uint64_t *offset) {
868
869         uint64_t hash;
870
871         assert(f);
872         assert(data || size == 0);
873
874         hash = hash64(data, size);
875
876         return journal_file_find_data_object_with_hash(f,
877                                                        data, size, hash,
878                                                        ret, offset);
879 }
880
881 static int journal_file_append_field(
882                 JournalFile *f,
883                 const void *field, uint64_t size,
884                 Object **ret, uint64_t *offset) {
885
886         uint64_t hash, p;
887         uint64_t osize;
888         Object *o;
889         int r;
890
891         assert(f);
892         assert(field && size > 0);
893
894         hash = hash64(field, size);
895
896         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
897         if (r < 0)
898                 return r;
899         else if (r > 0) {
900
901                 if (ret)
902                         *ret = o;
903
904                 if (offset)
905                         *offset = p;
906
907                 return 0;
908         }
909
910         osize = offsetof(Object, field.payload) + size;
911         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
912         if (r < 0)
913                 return r;
914
915         o->field.hash = htole64(hash);
916         memcpy(o->field.payload, field, size);
917
918         r = journal_file_link_field(f, o, p, hash);
919         if (r < 0)
920                 return r;
921
922         /* The linking might have altered the window, so let's
923          * refresh our pointer */
924         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
925         if (r < 0)
926                 return r;
927
928 #ifdef HAVE_GCRYPT
929         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
930         if (r < 0)
931                 return r;
932 #endif
933
934         if (ret)
935                 *ret = o;
936
937         if (offset)
938                 *offset = p;
939
940         return 0;
941 }
942
943 static int journal_file_append_data(
944                 JournalFile *f,
945                 const void *data, uint64_t size,
946                 Object **ret, uint64_t *offset) {
947
948         uint64_t hash, p;
949         uint64_t osize;
950         Object *o;
951         int r;
952         bool compressed = false;
953         const void *eq;
954
955         assert(f);
956         assert(data || size == 0);
957
958         hash = hash64(data, size);
959
960         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
961         if (r < 0)
962                 return r;
963         else if (r > 0) {
964
965                 if (ret)
966                         *ret = o;
967
968                 if (offset)
969                         *offset = p;
970
971                 return 0;
972         }
973
974         osize = offsetof(Object, data.payload) + size;
975         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
976         if (r < 0)
977                 return r;
978
979         o->data.hash = htole64(hash);
980
981 #ifdef HAVE_XZ
982         if (f->compress &&
983             size >= COMPRESSION_SIZE_THRESHOLD) {
984                 uint64_t rsize;
985
986                 compressed = compress_blob(data, size, o->data.payload, &rsize);
987
988                 if (compressed) {
989                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
990                         o->object.flags |= OBJECT_COMPRESSED;
991
992                         log_debug("Compressed data object %"PRIu64" -> %"PRIu64, size, rsize);
993                 }
994         }
995 #endif
996
997         if (!compressed && size > 0)
998                 memcpy(o->data.payload, data, size);
999
1000         r = journal_file_link_data(f, o, p, hash);
1001         if (r < 0)
1002                 return r;
1003
1004         /* The linking might have altered the window, so let's
1005          * refresh our pointer */
1006         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1007         if (r < 0)
1008                 return r;
1009
1010         if (!data)
1011                 eq = NULL;
1012         else
1013                 eq = memchr(data, '=', size);
1014         if (eq && eq > data) {
1015                 Object *fo = NULL;
1016                 uint64_t fp;
1017
1018                 /* Create field object ... */
1019                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1020                 if (r < 0)
1021                         return r;
1022
1023                 /* ... and link it in. */
1024                 o->data.next_field_offset = fo->field.head_data_offset;
1025                 fo->field.head_data_offset = le64toh(p);
1026         }
1027
1028 #ifdef HAVE_GCRYPT
1029         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1030         if (r < 0)
1031                 return r;
1032 #endif
1033
1034         if (ret)
1035                 *ret = o;
1036
1037         if (offset)
1038                 *offset = p;
1039
1040         return 0;
1041 }
1042
1043 uint64_t journal_file_entry_n_items(Object *o) {
1044         assert(o);
1045
1046         if (o->object.type != OBJECT_ENTRY)
1047                 return 0;
1048
1049         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1050 }
1051
1052 uint64_t journal_file_entry_array_n_items(Object *o) {
1053         assert(o);
1054
1055         if (o->object.type != OBJECT_ENTRY_ARRAY)
1056                 return 0;
1057
1058         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1059 }
1060
1061 uint64_t journal_file_hash_table_n_items(Object *o) {
1062         assert(o);
1063
1064         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1065             o->object.type != OBJECT_FIELD_HASH_TABLE)
1066                 return 0;
1067
1068         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1069 }
1070
1071 static int link_entry_into_array(JournalFile *f,
1072                                  le64_t *first,
1073                                  le64_t *idx,
1074                                  uint64_t p) {
1075         int r;
1076         uint64_t n = 0, ap = 0, q, i, a, hidx;
1077         Object *o;
1078
1079         assert(f);
1080         assert(first);
1081         assert(idx);
1082         assert(p > 0);
1083
1084         a = le64toh(*first);
1085         i = hidx = le64toh(*idx);
1086         while (a > 0) {
1087
1088                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1089                 if (r < 0)
1090                         return r;
1091
1092                 n = journal_file_entry_array_n_items(o);
1093                 if (i < n) {
1094                         o->entry_array.items[i] = htole64(p);
1095                         *idx = htole64(hidx + 1);
1096                         return 0;
1097                 }
1098
1099                 i -= n;
1100                 ap = a;
1101                 a = le64toh(o->entry_array.next_entry_array_offset);
1102         }
1103
1104         if (hidx > n)
1105                 n = (hidx+1) * 2;
1106         else
1107                 n = n * 2;
1108
1109         if (n < 4)
1110                 n = 4;
1111
1112         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1113                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1114                                        &o, &q);
1115         if (r < 0)
1116                 return r;
1117
1118 #ifdef HAVE_GCRYPT
1119         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1120         if (r < 0)
1121                 return r;
1122 #endif
1123
1124         o->entry_array.items[i] = htole64(p);
1125
1126         if (ap == 0)
1127                 *first = htole64(q);
1128         else {
1129                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1130                 if (r < 0)
1131                         return r;
1132
1133                 o->entry_array.next_entry_array_offset = htole64(q);
1134         }
1135
1136         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1137                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1138
1139         *idx = htole64(hidx + 1);
1140
1141         return 0;
1142 }
1143
1144 static int link_entry_into_array_plus_one(JournalFile *f,
1145                                           le64_t *extra,
1146                                           le64_t *first,
1147                                           le64_t *idx,
1148                                           uint64_t p) {
1149
1150         int r;
1151
1152         assert(f);
1153         assert(extra);
1154         assert(first);
1155         assert(idx);
1156         assert(p > 0);
1157
1158         if (*idx == 0)
1159                 *extra = htole64(p);
1160         else {
1161                 le64_t i;
1162
1163                 i = htole64(le64toh(*idx) - 1);
1164                 r = link_entry_into_array(f, first, &i, p);
1165                 if (r < 0)
1166                         return r;
1167         }
1168
1169         *idx = htole64(le64toh(*idx) + 1);
1170         return 0;
1171 }
1172
1173 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1174         uint64_t p;
1175         int r;
1176         assert(f);
1177         assert(o);
1178         assert(offset > 0);
1179
1180         p = le64toh(o->entry.items[i].object_offset);
1181         if (p == 0)
1182                 return -EINVAL;
1183
1184         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1185         if (r < 0)
1186                 return r;
1187
1188         return link_entry_into_array_plus_one(f,
1189                                               &o->data.entry_offset,
1190                                               &o->data.entry_array_offset,
1191                                               &o->data.n_entries,
1192                                               offset);
1193 }
1194
1195 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1196         uint64_t n, i;
1197         int r;
1198
1199         assert(f);
1200         assert(o);
1201         assert(offset > 0);
1202
1203         if (o->object.type != OBJECT_ENTRY)
1204                 return -EINVAL;
1205
1206         __sync_synchronize();
1207
1208         /* Link up the entry itself */
1209         r = link_entry_into_array(f,
1210                                   &f->header->entry_array_offset,
1211                                   &f->header->n_entries,
1212                                   offset);
1213         if (r < 0)
1214                 return r;
1215
1216         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1217
1218         if (f->header->head_entry_realtime == 0)
1219                 f->header->head_entry_realtime = o->entry.realtime;
1220
1221         f->header->tail_entry_realtime = o->entry.realtime;
1222         f->header->tail_entry_monotonic = o->entry.monotonic;
1223
1224         f->tail_entry_monotonic_valid = true;
1225
1226         /* Link up the items */
1227         n = journal_file_entry_n_items(o);
1228         for (i = 0; i < n; i++) {
1229                 r = journal_file_link_entry_item(f, o, offset, i);
1230                 if (r < 0)
1231                         return r;
1232         }
1233
1234         return 0;
1235 }
1236
1237 static int journal_file_append_entry_internal(
1238                 JournalFile *f,
1239                 const dual_timestamp *ts,
1240                 uint64_t xor_hash,
1241                 const EntryItem items[], unsigned n_items,
1242                 uint64_t *seqnum,
1243                 Object **ret, uint64_t *offset) {
1244         uint64_t np;
1245         uint64_t osize;
1246         Object *o;
1247         int r;
1248
1249         assert(f);
1250         assert(items || n_items == 0);
1251         assert(ts);
1252
1253         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1254
1255         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1256         if (r < 0)
1257                 return r;
1258
1259         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1260         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1261         o->entry.realtime = htole64(ts->realtime);
1262         o->entry.monotonic = htole64(ts->monotonic);
1263         o->entry.xor_hash = htole64(xor_hash);
1264         o->entry.boot_id = f->header->boot_id;
1265
1266 #ifdef HAVE_GCRYPT
1267         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1268         if (r < 0)
1269                 return r;
1270 #endif
1271
1272         r = journal_file_link_entry(f, o, np);
1273         if (r < 0)
1274                 return r;
1275
1276         if (ret)
1277                 *ret = o;
1278
1279         if (offset)
1280                 *offset = np;
1281
1282         return 0;
1283 }
1284
1285 void journal_file_post_change(JournalFile *f) {
1286         assert(f);
1287
1288         /* inotify() does not receive IN_MODIFY events from file
1289          * accesses done via mmap(). After each access we hence
1290          * trigger IN_MODIFY by truncating the journal file to its
1291          * current size which triggers IN_MODIFY. */
1292
1293         __sync_synchronize();
1294
1295         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1296                 log_error("Failed to truncate file to its own size: %m");
1297 }
1298
1299 static int entry_item_cmp(const void *_a, const void *_b) {
1300         const EntryItem *a = _a, *b = _b;
1301
1302         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1303                 return -1;
1304         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1305                 return 1;
1306         return 0;
1307 }
1308
1309 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1310         unsigned i;
1311         EntryItem *items;
1312         int r;
1313         uint64_t xor_hash = 0;
1314         struct dual_timestamp _ts;
1315
1316         assert(f);
1317         assert(iovec || n_iovec == 0);
1318
1319         if (!ts) {
1320                 dual_timestamp_get(&_ts);
1321                 ts = &_ts;
1322         }
1323
1324         if (f->tail_entry_monotonic_valid &&
1325             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1326                 return -EINVAL;
1327
1328 #ifdef HAVE_GCRYPT
1329         r = journal_file_maybe_append_tag(f, ts->realtime);
1330         if (r < 0)
1331                 return r;
1332 #endif
1333
1334         /* alloca() can't take 0, hence let's allocate at least one */
1335         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1336
1337         for (i = 0; i < n_iovec; i++) {
1338                 uint64_t p;
1339                 Object *o;
1340
1341                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1342                 if (r < 0)
1343                         return r;
1344
1345                 xor_hash ^= le64toh(o->data.hash);
1346                 items[i].object_offset = htole64(p);
1347                 items[i].hash = o->data.hash;
1348         }
1349
1350         /* Order by the position on disk, in order to improve seek
1351          * times for rotating media. */
1352         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1353
1354         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1355
1356         journal_file_post_change(f);
1357
1358         return r;
1359 }
1360
1361 typedef struct ChainCacheItem {
1362         uint64_t first; /* the array at the begin of the chain */
1363         uint64_t array; /* the cached array */
1364         uint64_t begin; /* the first item in the cached array */
1365         uint64_t total; /* the total number of items in all arrays before this one in the chain */
1366         uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
1367 } ChainCacheItem;
1368
1369 static void chain_cache_put(
1370                 Hashmap *h,
1371                 ChainCacheItem *ci,
1372                 uint64_t first,
1373                 uint64_t array,
1374                 uint64_t begin,
1375                 uint64_t total,
1376                 uint64_t last_index) {
1377
1378         if (!ci) {
1379                 /* If the chain item to cache for this chain is the
1380                  * first one it's not worth caching anything */
1381                 if (array == first)
1382                         return;
1383
1384                 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1385                         ci = hashmap_steal_first(h);
1386                 else {
1387                         ci = new(ChainCacheItem, 1);
1388                         if (!ci)
1389                                 return;
1390                 }
1391
1392                 ci->first = first;
1393
1394                 if (hashmap_put(h, &ci->first, ci) < 0) {
1395                         free(ci);
1396                         return;
1397                 }
1398         } else
1399                 assert(ci->first == first);
1400
1401         ci->array = array;
1402         ci->begin = begin;
1403         ci->total = total;
1404         ci->last_index = last_index;
1405 }
1406
1407 static int generic_array_get(
1408                 JournalFile *f,
1409                 uint64_t first,
1410                 uint64_t i,
1411                 Object **ret, uint64_t *offset) {
1412
1413         Object *o;
1414         uint64_t p = 0, a, t = 0;
1415         int r;
1416         ChainCacheItem *ci;
1417
1418         assert(f);
1419
1420         a = first;
1421
1422         /* Try the chain cache first */
1423         ci = hashmap_get(f->chain_cache, &first);
1424         if (ci && i > ci->total) {
1425                 a = ci->array;
1426                 i -= ci->total;
1427                 t = ci->total;
1428         }
1429
1430         while (a > 0) {
1431                 uint64_t k;
1432
1433                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1434                 if (r < 0)
1435                         return r;
1436
1437                 k = journal_file_entry_array_n_items(o);
1438                 if (i < k) {
1439                         p = le64toh(o->entry_array.items[i]);
1440                         goto found;
1441                 }
1442
1443                 i -= k;
1444                 t += k;
1445                 a = le64toh(o->entry_array.next_entry_array_offset);
1446         }
1447
1448         return 0;
1449
1450 found:
1451         /* Let's cache this item for the next invocation */
1452         chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1453
1454         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1455         if (r < 0)
1456                 return r;
1457
1458         if (ret)
1459                 *ret = o;
1460
1461         if (offset)
1462                 *offset = p;
1463
1464         return 1;
1465 }
1466
1467 static int generic_array_get_plus_one(
1468                 JournalFile *f,
1469                 uint64_t extra,
1470                 uint64_t first,
1471                 uint64_t i,
1472                 Object **ret, uint64_t *offset) {
1473
1474         Object *o;
1475
1476         assert(f);
1477
1478         if (i == 0) {
1479                 int r;
1480
1481                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1482                 if (r < 0)
1483                         return r;
1484
1485                 if (ret)
1486                         *ret = o;
1487
1488                 if (offset)
1489                         *offset = extra;
1490
1491                 return 1;
1492         }
1493
1494         return generic_array_get(f, first, i-1, ret, offset);
1495 }
1496
1497 enum {
1498         TEST_FOUND,
1499         TEST_LEFT,
1500         TEST_RIGHT
1501 };
1502
1503 static int generic_array_bisect(
1504                 JournalFile *f,
1505                 uint64_t first,
1506                 uint64_t n,
1507                 uint64_t needle,
1508                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1509                 direction_t direction,
1510                 Object **ret,
1511                 uint64_t *offset,
1512                 uint64_t *idx) {
1513
1514         uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1515         bool subtract_one = false;
1516         Object *o, *array = NULL;
1517         int r;
1518         ChainCacheItem *ci;
1519
1520         assert(f);
1521         assert(test_object);
1522
1523         /* Start with the first array in the chain */
1524         a = first;
1525
1526         ci = hashmap_get(f->chain_cache, &first);
1527         if (ci && n > ci->total) {
1528                 /* Ah, we have iterated this bisection array chain
1529                  * previously! Let's see if we can skip ahead in the
1530                  * chain, as far as the last time. But we can't jump
1531                  * backwards in the chain, so let's check that
1532                  * first. */
1533
1534                 r = test_object(f, ci->begin, needle);
1535                 if (r < 0)
1536                         return r;
1537
1538                 if (r == TEST_LEFT) {
1539                         /* OK, what we are looking for is right of the
1540                          * begin of this EntryArray, so let's jump
1541                          * straight to previously cached array in the
1542                          * chain */
1543
1544                         a = ci->array;
1545                         n -= ci->total;
1546                         t = ci->total;
1547                         last_index = ci->last_index;
1548                 }
1549         }
1550
1551         while (a > 0) {
1552                 uint64_t left, right, k, lp;
1553
1554                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1555                 if (r < 0)
1556                         return r;
1557
1558                 k = journal_file_entry_array_n_items(array);
1559                 right = MIN(k, n);
1560                 if (right <= 0)
1561                         return 0;
1562
1563                 i = right - 1;
1564                 lp = p = le64toh(array->entry_array.items[i]);
1565                 if (p <= 0)
1566                         return -EBADMSG;
1567
1568                 r = test_object(f, p, needle);
1569                 if (r < 0)
1570                         return r;
1571
1572                 if (r == TEST_FOUND)
1573                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1574
1575                 if (r == TEST_RIGHT) {
1576                         left = 0;
1577                         right -= 1;
1578
1579                         if (last_index != (uint64_t) -1) {
1580                                 assert(last_index <= right);
1581
1582                                 /* If we cached the last index we
1583                                  * looked at, let's try to not to jump
1584                                  * too wildly around and see if we can
1585                                  * limit the range to look at early to
1586                                  * the immediate neighbors of the last
1587                                  * index we looked at. */
1588
1589                                 if (last_index > 0) {
1590                                         uint64_t x = last_index - 1;
1591
1592                                         p = le64toh(array->entry_array.items[x]);
1593                                         if (p <= 0)
1594                                                 return -EBADMSG;
1595
1596                                         r = test_object(f, p, needle);
1597                                         if (r < 0)
1598                                                 return r;
1599
1600                                         if (r == TEST_FOUND)
1601                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1602
1603                                         if (r == TEST_RIGHT)
1604                                                 right = x;
1605                                         else
1606                                                 left = x + 1;
1607                                 }
1608
1609                                 if (last_index < right) {
1610                                         uint64_t y = last_index + 1;
1611
1612                                         p = le64toh(array->entry_array.items[y]);
1613                                         if (p <= 0)
1614                                                 return -EBADMSG;
1615
1616                                         r = test_object(f, p, needle);
1617                                         if (r < 0)
1618                                                 return r;
1619
1620                                         if (r == TEST_FOUND)
1621                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1622
1623                                         if (r == TEST_RIGHT)
1624                                                 right = y;
1625                                         else
1626                                                 left = y + 1;
1627                                 }
1628                         }
1629
1630                         for (;;) {
1631                                 if (left == right) {
1632                                         if (direction == DIRECTION_UP)
1633                                                 subtract_one = true;
1634
1635                                         i = left;
1636                                         goto found;
1637                                 }
1638
1639                                 assert(left < right);
1640                                 i = (left + right) / 2;
1641
1642                                 p = le64toh(array->entry_array.items[i]);
1643                                 if (p <= 0)
1644                                         return -EBADMSG;
1645
1646                                 r = test_object(f, p, needle);
1647                                 if (r < 0)
1648                                         return r;
1649
1650                                 if (r == TEST_FOUND)
1651                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1652
1653                                 if (r == TEST_RIGHT)
1654                                         right = i;
1655                                 else
1656                                         left = i + 1;
1657                         }
1658                 }
1659
1660                 if (k > n) {
1661                         if (direction == DIRECTION_UP) {
1662                                 i = n;
1663                                 subtract_one = true;
1664                                 goto found;
1665                         }
1666
1667                         return 0;
1668                 }
1669
1670                 last_p = lp;
1671
1672                 n -= k;
1673                 t += k;
1674                 last_index = (uint64_t) -1;
1675                 a = le64toh(array->entry_array.next_entry_array_offset);
1676         }
1677
1678         return 0;
1679
1680 found:
1681         if (subtract_one && t == 0 && i == 0)
1682                 return 0;
1683
1684         /* Let's cache this item for the next invocation */
1685         chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1686
1687         if (subtract_one && i == 0)
1688                 p = last_p;
1689         else if (subtract_one)
1690                 p = le64toh(array->entry_array.items[i-1]);
1691         else
1692                 p = le64toh(array->entry_array.items[i]);
1693
1694         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1695         if (r < 0)
1696                 return r;
1697
1698         if (ret)
1699                 *ret = o;
1700
1701         if (offset)
1702                 *offset = p;
1703
1704         if (idx)
1705                 *idx = t + i + (subtract_one ? -1 : 0);
1706
1707         return 1;
1708 }
1709
1710
1711 static int generic_array_bisect_plus_one(
1712                 JournalFile *f,
1713                 uint64_t extra,
1714                 uint64_t first,
1715                 uint64_t n,
1716                 uint64_t needle,
1717                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1718                 direction_t direction,
1719                 Object **ret,
1720                 uint64_t *offset,
1721                 uint64_t *idx) {
1722
1723         int r;
1724         bool step_back = false;
1725         Object *o;
1726
1727         assert(f);
1728         assert(test_object);
1729
1730         if (n <= 0)
1731                 return 0;
1732
1733         /* This bisects the array in object 'first', but first checks
1734          * an extra  */
1735         r = test_object(f, extra, needle);
1736         if (r < 0)
1737                 return r;
1738
1739         if (r == TEST_FOUND)
1740                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1741
1742         /* if we are looking with DIRECTION_UP then we need to first
1743            see if in the actual array there is a matching entry, and
1744            return the last one of that. But if there isn't any we need
1745            to return this one. Hence remember this, and return it
1746            below. */
1747         if (r == TEST_LEFT)
1748                 step_back = direction == DIRECTION_UP;
1749
1750         if (r == TEST_RIGHT) {
1751                 if (direction == DIRECTION_DOWN)
1752                         goto found;
1753                 else
1754                         return 0;
1755         }
1756
1757         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1758
1759         if (r == 0 && step_back)
1760                 goto found;
1761
1762         if (r > 0 && idx)
1763                 (*idx) ++;
1764
1765         return r;
1766
1767 found:
1768         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1769         if (r < 0)
1770                 return r;
1771
1772         if (ret)
1773                 *ret = o;
1774
1775         if (offset)
1776                 *offset = extra;
1777
1778         if (idx)
1779                 *idx = 0;
1780
1781         return 1;
1782 }
1783
1784 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1785         assert(f);
1786         assert(p > 0);
1787
1788         if (p == needle)
1789                 return TEST_FOUND;
1790         else if (p < needle)
1791                 return TEST_LEFT;
1792         else
1793                 return TEST_RIGHT;
1794 }
1795
1796 int journal_file_move_to_entry_by_offset(
1797                 JournalFile *f,
1798                 uint64_t p,
1799                 direction_t direction,
1800                 Object **ret,
1801                 uint64_t *offset) {
1802
1803         return generic_array_bisect(f,
1804                                     le64toh(f->header->entry_array_offset),
1805                                     le64toh(f->header->n_entries),
1806                                     p,
1807                                     test_object_offset,
1808                                     direction,
1809                                     ret, offset, NULL);
1810 }
1811
1812
1813 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1814         Object *o;
1815         int r;
1816
1817         assert(f);
1818         assert(p > 0);
1819
1820         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1821         if (r < 0)
1822                 return r;
1823
1824         if (le64toh(o->entry.seqnum) == needle)
1825                 return TEST_FOUND;
1826         else if (le64toh(o->entry.seqnum) < needle)
1827                 return TEST_LEFT;
1828         else
1829                 return TEST_RIGHT;
1830 }
1831
1832 int journal_file_move_to_entry_by_seqnum(
1833                 JournalFile *f,
1834                 uint64_t seqnum,
1835                 direction_t direction,
1836                 Object **ret,
1837                 uint64_t *offset) {
1838
1839         return generic_array_bisect(f,
1840                                     le64toh(f->header->entry_array_offset),
1841                                     le64toh(f->header->n_entries),
1842                                     seqnum,
1843                                     test_object_seqnum,
1844                                     direction,
1845                                     ret, offset, NULL);
1846 }
1847
1848 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1849         Object *o;
1850         int r;
1851
1852         assert(f);
1853         assert(p > 0);
1854
1855         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1856         if (r < 0)
1857                 return r;
1858
1859         if (le64toh(o->entry.realtime) == needle)
1860                 return TEST_FOUND;
1861         else if (le64toh(o->entry.realtime) < needle)
1862                 return TEST_LEFT;
1863         else
1864                 return TEST_RIGHT;
1865 }
1866
1867 int journal_file_move_to_entry_by_realtime(
1868                 JournalFile *f,
1869                 uint64_t realtime,
1870                 direction_t direction,
1871                 Object **ret,
1872                 uint64_t *offset) {
1873
1874         return generic_array_bisect(f,
1875                                     le64toh(f->header->entry_array_offset),
1876                                     le64toh(f->header->n_entries),
1877                                     realtime,
1878                                     test_object_realtime,
1879                                     direction,
1880                                     ret, offset, NULL);
1881 }
1882
1883 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1884         Object *o;
1885         int r;
1886
1887         assert(f);
1888         assert(p > 0);
1889
1890         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1891         if (r < 0)
1892                 return r;
1893
1894         if (le64toh(o->entry.monotonic) == needle)
1895                 return TEST_FOUND;
1896         else if (le64toh(o->entry.monotonic) < needle)
1897                 return TEST_LEFT;
1898         else
1899                 return TEST_RIGHT;
1900 }
1901
1902 static inline int find_data_object_by_boot_id(
1903                 JournalFile *f,
1904                 sd_id128_t boot_id,
1905                 Object **o,
1906                 uint64_t *b) {
1907         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1908
1909         sd_id128_to_string(boot_id, t + 9);
1910         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1911 }
1912
1913 int journal_file_move_to_entry_by_monotonic(
1914                 JournalFile *f,
1915                 sd_id128_t boot_id,
1916                 uint64_t monotonic,
1917                 direction_t direction,
1918                 Object **ret,
1919                 uint64_t *offset) {
1920
1921         Object *o;
1922         int r;
1923
1924         assert(f);
1925
1926         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1927         if (r < 0)
1928                 return r;
1929         if (r == 0)
1930                 return -ENOENT;
1931
1932         return generic_array_bisect_plus_one(f,
1933                                              le64toh(o->data.entry_offset),
1934                                              le64toh(o->data.entry_array_offset),
1935                                              le64toh(o->data.n_entries),
1936                                              monotonic,
1937                                              test_object_monotonic,
1938                                              direction,
1939                                              ret, offset, NULL);
1940 }
1941
1942 int journal_file_next_entry(
1943                 JournalFile *f,
1944                 Object *o, uint64_t p,
1945                 direction_t direction,
1946                 Object **ret, uint64_t *offset) {
1947
1948         uint64_t i, n;
1949         int r;
1950
1951         assert(f);
1952         assert(p > 0 || !o);
1953
1954         n = le64toh(f->header->n_entries);
1955         if (n <= 0)
1956                 return 0;
1957
1958         if (!o)
1959                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1960         else {
1961                 if (o->object.type != OBJECT_ENTRY)
1962                         return -EINVAL;
1963
1964                 r = generic_array_bisect(f,
1965                                          le64toh(f->header->entry_array_offset),
1966                                          le64toh(f->header->n_entries),
1967                                          p,
1968                                          test_object_offset,
1969                                          DIRECTION_DOWN,
1970                                          NULL, NULL,
1971                                          &i);
1972                 if (r <= 0)
1973                         return r;
1974
1975                 if (direction == DIRECTION_DOWN) {
1976                         if (i >= n - 1)
1977                                 return 0;
1978
1979                         i++;
1980                 } else {
1981                         if (i <= 0)
1982                                 return 0;
1983
1984                         i--;
1985                 }
1986         }
1987
1988         /* And jump to it */
1989         return generic_array_get(f,
1990                                  le64toh(f->header->entry_array_offset),
1991                                  i,
1992                                  ret, offset);
1993 }
1994
1995 int journal_file_skip_entry(
1996                 JournalFile *f,
1997                 Object *o, uint64_t p,
1998                 int64_t skip,
1999                 Object **ret, uint64_t *offset) {
2000
2001         uint64_t i, n;
2002         int r;
2003
2004         assert(f);
2005         assert(o);
2006         assert(p > 0);
2007
2008         if (o->object.type != OBJECT_ENTRY)
2009                 return -EINVAL;
2010
2011         r = generic_array_bisect(f,
2012                                  le64toh(f->header->entry_array_offset),
2013                                  le64toh(f->header->n_entries),
2014                                  p,
2015                                  test_object_offset,
2016                                  DIRECTION_DOWN,
2017                                  NULL, NULL,
2018                                  &i);
2019         if (r <= 0)
2020                 return r;
2021
2022         /* Calculate new index */
2023         if (skip < 0) {
2024                 if ((uint64_t) -skip >= i)
2025                         i = 0;
2026                 else
2027                         i = i - (uint64_t) -skip;
2028         } else
2029                 i  += (uint64_t) skip;
2030
2031         n = le64toh(f->header->n_entries);
2032         if (n <= 0)
2033                 return -EBADMSG;
2034
2035         if (i >= n)
2036                 i = n-1;
2037
2038         return generic_array_get(f,
2039                                  le64toh(f->header->entry_array_offset),
2040                                  i,
2041                                  ret, offset);
2042 }
2043
2044 int journal_file_next_entry_for_data(
2045                 JournalFile *f,
2046                 Object *o, uint64_t p,
2047                 uint64_t data_offset,
2048                 direction_t direction,
2049                 Object **ret, uint64_t *offset) {
2050
2051         uint64_t n, i;
2052         int r;
2053         Object *d;
2054
2055         assert(f);
2056         assert(p > 0 || !o);
2057
2058         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2059         if (r < 0)
2060                 return r;
2061
2062         n = le64toh(d->data.n_entries);
2063         if (n <= 0)
2064                 return n;
2065
2066         if (!o)
2067                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2068         else {
2069                 if (o->object.type != OBJECT_ENTRY)
2070                         return -EINVAL;
2071
2072                 r = generic_array_bisect_plus_one(f,
2073                                                   le64toh(d->data.entry_offset),
2074                                                   le64toh(d->data.entry_array_offset),
2075                                                   le64toh(d->data.n_entries),
2076                                                   p,
2077                                                   test_object_offset,
2078                                                   DIRECTION_DOWN,
2079                                                   NULL, NULL,
2080                                                   &i);
2081
2082                 if (r <= 0)
2083                         return r;
2084
2085                 if (direction == DIRECTION_DOWN) {
2086                         if (i >= n - 1)
2087                                 return 0;
2088
2089                         i++;
2090                 } else {
2091                         if (i <= 0)
2092                                 return 0;
2093
2094                         i--;
2095                 }
2096
2097         }
2098
2099         return generic_array_get_plus_one(f,
2100                                           le64toh(d->data.entry_offset),
2101                                           le64toh(d->data.entry_array_offset),
2102                                           i,
2103                                           ret, offset);
2104 }
2105
2106 int journal_file_move_to_entry_by_offset_for_data(
2107                 JournalFile *f,
2108                 uint64_t data_offset,
2109                 uint64_t p,
2110                 direction_t direction,
2111                 Object **ret, uint64_t *offset) {
2112
2113         int r;
2114         Object *d;
2115
2116         assert(f);
2117
2118         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2119         if (r < 0)
2120                 return r;
2121
2122         return generic_array_bisect_plus_one(f,
2123                                              le64toh(d->data.entry_offset),
2124                                              le64toh(d->data.entry_array_offset),
2125                                              le64toh(d->data.n_entries),
2126                                              p,
2127                                              test_object_offset,
2128                                              direction,
2129                                              ret, offset, NULL);
2130 }
2131
2132 int journal_file_move_to_entry_by_monotonic_for_data(
2133                 JournalFile *f,
2134                 uint64_t data_offset,
2135                 sd_id128_t boot_id,
2136                 uint64_t monotonic,
2137                 direction_t direction,
2138                 Object **ret, uint64_t *offset) {
2139
2140         Object *o, *d;
2141         int r;
2142         uint64_t b, z;
2143
2144         assert(f);
2145
2146         /* First, seek by time */
2147         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2148         if (r < 0)
2149                 return r;
2150         if (r == 0)
2151                 return -ENOENT;
2152
2153         r = generic_array_bisect_plus_one(f,
2154                                           le64toh(o->data.entry_offset),
2155                                           le64toh(o->data.entry_array_offset),
2156                                           le64toh(o->data.n_entries),
2157                                           monotonic,
2158                                           test_object_monotonic,
2159                                           direction,
2160                                           NULL, &z, NULL);
2161         if (r <= 0)
2162                 return r;
2163
2164         /* And now, continue seeking until we find an entry that
2165          * exists in both bisection arrays */
2166
2167         for (;;) {
2168                 Object *qo;
2169                 uint64_t p, q;
2170
2171                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2172                 if (r < 0)
2173                         return r;
2174
2175                 r = generic_array_bisect_plus_one(f,
2176                                                   le64toh(d->data.entry_offset),
2177                                                   le64toh(d->data.entry_array_offset),
2178                                                   le64toh(d->data.n_entries),
2179                                                   z,
2180                                                   test_object_offset,
2181                                                   direction,
2182                                                   NULL, &p, NULL);
2183                 if (r <= 0)
2184                         return r;
2185
2186                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2187                 if (r < 0)
2188                         return r;
2189
2190                 r = generic_array_bisect_plus_one(f,
2191                                                   le64toh(o->data.entry_offset),
2192                                                   le64toh(o->data.entry_array_offset),
2193                                                   le64toh(o->data.n_entries),
2194                                                   p,
2195                                                   test_object_offset,
2196                                                   direction,
2197                                                   &qo, &q, NULL);
2198
2199                 if (r <= 0)
2200                         return r;
2201
2202                 if (p == q) {
2203                         if (ret)
2204                                 *ret = qo;
2205                         if (offset)
2206                                 *offset = q;
2207
2208                         return 1;
2209                 }
2210
2211                 z = q;
2212         }
2213 }
2214
2215 int journal_file_move_to_entry_by_seqnum_for_data(
2216                 JournalFile *f,
2217                 uint64_t data_offset,
2218                 uint64_t seqnum,
2219                 direction_t direction,
2220                 Object **ret, uint64_t *offset) {
2221
2222         Object *d;
2223         int r;
2224
2225         assert(f);
2226
2227         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2228         if (r < 0)
2229                 return r;
2230
2231         return generic_array_bisect_plus_one(f,
2232                                              le64toh(d->data.entry_offset),
2233                                              le64toh(d->data.entry_array_offset),
2234                                              le64toh(d->data.n_entries),
2235                                              seqnum,
2236                                              test_object_seqnum,
2237                                              direction,
2238                                              ret, offset, NULL);
2239 }
2240
2241 int journal_file_move_to_entry_by_realtime_for_data(
2242                 JournalFile *f,
2243                 uint64_t data_offset,
2244                 uint64_t realtime,
2245                 direction_t direction,
2246                 Object **ret, uint64_t *offset) {
2247
2248         Object *d;
2249         int r;
2250
2251         assert(f);
2252
2253         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2254         if (r < 0)
2255                 return r;
2256
2257         return generic_array_bisect_plus_one(f,
2258                                              le64toh(d->data.entry_offset),
2259                                              le64toh(d->data.entry_array_offset),
2260                                              le64toh(d->data.n_entries),
2261                                              realtime,
2262                                              test_object_realtime,
2263                                              direction,
2264                                              ret, offset, NULL);
2265 }
2266
2267 void journal_file_dump(JournalFile *f) {
2268         Object *o;
2269         int r;
2270         uint64_t p;
2271
2272         assert(f);
2273
2274         journal_file_print_header(f);
2275
2276         p = le64toh(f->header->header_size);
2277         while (p != 0) {
2278                 r = journal_file_move_to_object(f, -1, p, &o);
2279                 if (r < 0)
2280                         goto fail;
2281
2282                 switch (o->object.type) {
2283
2284                 case OBJECT_UNUSED:
2285                         printf("Type: OBJECT_UNUSED\n");
2286                         break;
2287
2288                 case OBJECT_DATA:
2289                         printf("Type: OBJECT_DATA\n");
2290                         break;
2291
2292                 case OBJECT_FIELD:
2293                         printf("Type: OBJECT_FIELD\n");
2294                         break;
2295
2296                 case OBJECT_ENTRY:
2297                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2298                                le64toh(o->entry.seqnum),
2299                                le64toh(o->entry.monotonic),
2300                                le64toh(o->entry.realtime));
2301                         break;
2302
2303                 case OBJECT_FIELD_HASH_TABLE:
2304                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2305                         break;
2306
2307                 case OBJECT_DATA_HASH_TABLE:
2308                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2309                         break;
2310
2311                 case OBJECT_ENTRY_ARRAY:
2312                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2313                         break;
2314
2315                 case OBJECT_TAG:
2316                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2317                                le64toh(o->tag.seqnum),
2318                                le64toh(o->tag.epoch));
2319                         break;
2320
2321                 default:
2322                         printf("Type: unknown (%u)\n", o->object.type);
2323                         break;
2324                 }
2325
2326                 if (o->object.flags & OBJECT_COMPRESSED)
2327                         printf("Flags: COMPRESSED\n");
2328
2329                 if (p == le64toh(f->header->tail_object_offset))
2330                         p = 0;
2331                 else
2332                         p = p + ALIGN64(le64toh(o->object.size));
2333         }
2334
2335         return;
2336 fail:
2337         log_error("File corrupt");
2338 }
2339
2340 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2341         const char *x;
2342
2343         x = format_timestamp(buf, l, t);
2344         if (x)
2345                 return x;
2346         return " --- ";
2347 }
2348
2349 void journal_file_print_header(JournalFile *f) {
2350         char a[33], b[33], c[33], d[33];
2351         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2352         struct stat st;
2353         char bytes[FORMAT_BYTES_MAX];
2354
2355         assert(f);
2356
2357         printf("File Path: %s\n"
2358                "File ID: %s\n"
2359                "Machine ID: %s\n"
2360                "Boot ID: %s\n"
2361                "Sequential Number ID: %s\n"
2362                "State: %s\n"
2363                "Compatible Flags:%s%s\n"
2364                "Incompatible Flags:%s%s\n"
2365                "Header size: %"PRIu64"\n"
2366                "Arena size: %"PRIu64"\n"
2367                "Data Hash Table Size: %"PRIu64"\n"
2368                "Field Hash Table Size: %"PRIu64"\n"
2369                "Rotate Suggested: %s\n"
2370                "Head Sequential Number: %"PRIu64"\n"
2371                "Tail Sequential Number: %"PRIu64"\n"
2372                "Head Realtime Timestamp: %s\n"
2373                "Tail Realtime Timestamp: %s\n"
2374                "Tail Monotonic Timestamp: %s\n"
2375                "Objects: %"PRIu64"\n"
2376                "Entry Objects: %"PRIu64"\n",
2377                f->path,
2378                sd_id128_to_string(f->header->file_id, a),
2379                sd_id128_to_string(f->header->machine_id, b),
2380                sd_id128_to_string(f->header->boot_id, c),
2381                sd_id128_to_string(f->header->seqnum_id, d),
2382                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2383                f->header->state == STATE_ONLINE ? "ONLINE" :
2384                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2385                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2386                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2387                JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2388                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2389                le64toh(f->header->header_size),
2390                le64toh(f->header->arena_size),
2391                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2392                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2393                yes_no(journal_file_rotate_suggested(f, 0)),
2394                le64toh(f->header->head_entry_seqnum),
2395                le64toh(f->header->tail_entry_seqnum),
2396                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2397                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2398                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2399                le64toh(f->header->n_objects),
2400                le64toh(f->header->n_entries));
2401
2402         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2403                 printf("Data Objects: %"PRIu64"\n"
2404                        "Data Hash Table Fill: %.1f%%\n",
2405                        le64toh(f->header->n_data),
2406                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2407
2408         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2409                 printf("Field Objects: %"PRIu64"\n"
2410                        "Field Hash Table Fill: %.1f%%\n",
2411                        le64toh(f->header->n_fields),
2412                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2413
2414         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2415                 printf("Tag Objects: %"PRIu64"\n",
2416                        le64toh(f->header->n_tags));
2417         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2418                 printf("Entry Array Objects: %"PRIu64"\n",
2419                        le64toh(f->header->n_entry_arrays));
2420
2421         if (fstat(f->fd, &st) >= 0)
2422                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2423 }
2424
2425 int journal_file_open(
2426                 const char *fname,
2427                 int flags,
2428                 mode_t mode,
2429                 bool compress,
2430                 bool seal,
2431                 JournalMetrics *metrics,
2432                 MMapCache *mmap_cache,
2433                 JournalFile *template,
2434                 JournalFile **ret) {
2435
2436         JournalFile *f;
2437         int r;
2438         bool newly_created = false;
2439
2440         assert(fname);
2441         assert(ret);
2442
2443         if ((flags & O_ACCMODE) != O_RDONLY &&
2444             (flags & O_ACCMODE) != O_RDWR)
2445                 return -EINVAL;
2446
2447         if (!endswith(fname, ".journal") &&
2448             !endswith(fname, ".journal~"))
2449                 return -EINVAL;
2450
2451         f = new0(JournalFile, 1);
2452         if (!f)
2453                 return -ENOMEM;
2454
2455         f->fd = -1;
2456         f->mode = mode;
2457
2458         f->flags = flags;
2459         f->prot = prot_from_flags(flags);
2460         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2461 #ifdef HAVE_XZ
2462         f->compress = compress;
2463 #endif
2464 #ifdef HAVE_GCRYPT
2465         f->seal = seal;
2466 #endif
2467
2468         if (mmap_cache)
2469                 f->mmap = mmap_cache_ref(mmap_cache);
2470         else {
2471                 f->mmap = mmap_cache_new();
2472                 if (!f->mmap) {
2473                         r = -ENOMEM;
2474                         goto fail;
2475                 }
2476         }
2477
2478         f->path = strdup(fname);
2479         if (!f->path) {
2480                 r = -ENOMEM;
2481                 goto fail;
2482         }
2483
2484         f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2485         if (!f->chain_cache) {
2486                 r = -ENOMEM;
2487                 goto fail;
2488         }
2489
2490         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2491         if (f->fd < 0) {
2492                 r = -errno;
2493                 goto fail;
2494         }
2495
2496         if (fstat(f->fd, &f->last_stat) < 0) {
2497                 r = -errno;
2498                 goto fail;
2499         }
2500
2501         if (f->last_stat.st_size == 0 && f->writable) {
2502 #ifdef HAVE_XATTR
2503                 uint64_t crtime;
2504
2505                 /* Let's attach the creation time to the journal file,
2506                  * so that the vacuuming code knows the age of this
2507                  * file even if the file might end up corrupted one
2508                  * day... Ideally we'd just use the creation time many
2509                  * file systems maintain for each file, but there is
2510                  * currently no usable API to query this, hence let's
2511                  * emulate this via extended attributes. If extended
2512                  * attributes are not supported we'll just skip this,
2513                  * and rely solely on mtime/atime/ctime of the file.*/
2514
2515                 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2516                 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2517 #endif
2518
2519 #ifdef HAVE_GCRYPT
2520                 /* Try to load the FSPRG state, and if we can't, then
2521                  * just don't do sealing */
2522                 if (f->seal) {
2523                         r = journal_file_fss_load(f);
2524                         if (r < 0)
2525                                 f->seal = false;
2526                 }
2527 #endif
2528
2529                 r = journal_file_init_header(f, template);
2530                 if (r < 0)
2531                         goto fail;
2532
2533                 if (fstat(f->fd, &f->last_stat) < 0) {
2534                         r = -errno;
2535                         goto fail;
2536                 }
2537
2538                 newly_created = true;
2539         }
2540
2541         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2542                 r = -EIO;
2543                 goto fail;
2544         }
2545
2546         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2547         if (f->header == MAP_FAILED) {
2548                 f->header = NULL;
2549                 r = -errno;
2550                 goto fail;
2551         }
2552
2553         if (!newly_created) {
2554                 r = journal_file_verify_header(f);
2555                 if (r < 0)
2556                         goto fail;
2557         }
2558
2559 #ifdef HAVE_GCRYPT
2560         if (!newly_created && f->writable) {
2561                 r = journal_file_fss_load(f);
2562                 if (r < 0)
2563                         goto fail;
2564         }
2565 #endif
2566
2567         if (f->writable) {
2568                 if (metrics) {
2569                         journal_default_metrics(metrics, f->fd);
2570                         f->metrics = *metrics;
2571                 } else if (template)
2572                         f->metrics = template->metrics;
2573
2574                 r = journal_file_refresh_header(f);
2575                 if (r < 0)
2576                         goto fail;
2577         }
2578
2579 #ifdef HAVE_GCRYPT
2580         r = journal_file_hmac_setup(f);
2581         if (r < 0)
2582                 goto fail;
2583 #endif
2584
2585         if (newly_created) {
2586                 r = journal_file_setup_field_hash_table(f);
2587                 if (r < 0)
2588                         goto fail;
2589
2590                 r = journal_file_setup_data_hash_table(f);
2591                 if (r < 0)
2592                         goto fail;
2593
2594 #ifdef HAVE_GCRYPT
2595                 r = journal_file_append_first_tag(f);
2596                 if (r < 0)
2597                         goto fail;
2598 #endif
2599         }
2600
2601         r = journal_file_map_field_hash_table(f);
2602         if (r < 0)
2603                 goto fail;
2604
2605         r = journal_file_map_data_hash_table(f);
2606         if (r < 0)
2607                 goto fail;
2608
2609         *ret = f;
2610         return 0;
2611
2612 fail:
2613         journal_file_close(f);
2614
2615         return r;
2616 }
2617
2618 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2619         _cleanup_free_ char *p = NULL;
2620         size_t l;
2621         JournalFile *old_file, *new_file = NULL;
2622         int r;
2623
2624         assert(f);
2625         assert(*f);
2626
2627         old_file = *f;
2628
2629         if (!old_file->writable)
2630                 return -EINVAL;
2631
2632         if (!endswith(old_file->path, ".journal"))
2633                 return -EINVAL;
2634
2635         l = strlen(old_file->path);
2636         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2637                      (int) l - 8, old_file->path,
2638                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2639                      le64toh((*f)->header->head_entry_seqnum),
2640                      le64toh((*f)->header->head_entry_realtime));
2641         if (r < 0)
2642                 return -ENOMEM;
2643
2644         r = rename(old_file->path, p);
2645         if (r < 0)
2646                 return -errno;
2647
2648         old_file->header->state = STATE_ARCHIVED;
2649
2650         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2651         journal_file_close(old_file);
2652
2653         *f = new_file;
2654         return r;
2655 }
2656
2657 int journal_file_open_reliably(
2658                 const char *fname,
2659                 int flags,
2660                 mode_t mode,
2661                 bool compress,
2662                 bool seal,
2663                 JournalMetrics *metrics,
2664                 MMapCache *mmap_cache,
2665                 JournalFile *template,
2666                 JournalFile **ret) {
2667
2668         int r;
2669         size_t l;
2670         _cleanup_free_ char *p = NULL;
2671
2672         r = journal_file_open(fname, flags, mode, compress, seal,
2673                               metrics, mmap_cache, template, ret);
2674         if (r != -EBADMSG && /* corrupted */
2675             r != -ENODATA && /* truncated */
2676             r != -EHOSTDOWN && /* other machine */
2677             r != -EPROTONOSUPPORT && /* incompatible feature */
2678             r != -EBUSY && /* unclean shutdown */
2679             r != -ESHUTDOWN /* already archived */)
2680                 return r;
2681
2682         if ((flags & O_ACCMODE) == O_RDONLY)
2683                 return r;
2684
2685         if (!(flags & O_CREAT))
2686                 return r;
2687
2688         if (!endswith(fname, ".journal"))
2689                 return r;
2690
2691         /* The file is corrupted. Rotate it away and try it again (but only once) */
2692
2693         l = strlen(fname);
2694         if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2695                      (int) l - 8, fname,
2696                      (unsigned long long) now(CLOCK_REALTIME),
2697                      random_u64()) < 0)
2698                 return -ENOMEM;
2699
2700         r = rename(fname, p);
2701         if (r < 0)
2702                 return -errno;
2703
2704         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2705
2706         return journal_file_open(fname, flags, mode, compress, seal,
2707                                  metrics, mmap_cache, template, ret);
2708 }
2709
2710 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2711         uint64_t i, n;
2712         uint64_t q, xor_hash = 0;
2713         int r;
2714         EntryItem *items;
2715         dual_timestamp ts;
2716
2717         assert(from);
2718         assert(to);
2719         assert(o);
2720         assert(p);
2721
2722         if (!to->writable)
2723                 return -EPERM;
2724
2725         ts.monotonic = le64toh(o->entry.monotonic);
2726         ts.realtime = le64toh(o->entry.realtime);
2727
2728         n = journal_file_entry_n_items(o);
2729         /* alloca() can't take 0, hence let's allocate at least one */
2730         items = alloca(sizeof(EntryItem) * MAX(1u, n));
2731
2732         for (i = 0; i < n; i++) {
2733                 uint64_t l, h;
2734                 le64_t le_hash;
2735                 size_t t;
2736                 void *data;
2737                 Object *u;
2738
2739                 q = le64toh(o->entry.items[i].object_offset);
2740                 le_hash = o->entry.items[i].hash;
2741
2742                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2743                 if (r < 0)
2744                         return r;
2745
2746                 if (le_hash != o->data.hash)
2747                         return -EBADMSG;
2748
2749                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2750                 t = (size_t) l;
2751
2752                 /* We hit the limit on 32bit machines */
2753                 if ((uint64_t) t != l)
2754                         return -E2BIG;
2755
2756                 if (o->object.flags & OBJECT_COMPRESSED) {
2757 #ifdef HAVE_XZ
2758                         uint64_t rsize;
2759
2760                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0))
2761                                 return -EBADMSG;
2762
2763                         data = from->compress_buffer;
2764                         l = rsize;
2765 #else
2766                         return -EPROTONOSUPPORT;
2767 #endif
2768                 } else
2769                         data = o->data.payload;
2770
2771                 r = journal_file_append_data(to, data, l, &u, &h);
2772                 if (r < 0)
2773                         return r;
2774
2775                 xor_hash ^= le64toh(u->data.hash);
2776                 items[i].object_offset = htole64(h);
2777                 items[i].hash = u->data.hash;
2778
2779                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2780                 if (r < 0)
2781                         return r;
2782         }
2783
2784         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2785 }
2786
2787 void journal_default_metrics(JournalMetrics *m, int fd) {
2788         uint64_t fs_size = 0;
2789         struct statvfs ss;
2790         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2791
2792         assert(m);
2793         assert(fd >= 0);
2794
2795         if (fstatvfs(fd, &ss) >= 0)
2796                 fs_size = ss.f_frsize * ss.f_blocks;
2797
2798         if (m->max_use == (uint64_t) -1) {
2799
2800                 if (fs_size > 0) {
2801                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2802
2803                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2804                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2805
2806                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2807                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2808                 } else
2809                         m->max_use = DEFAULT_MAX_USE_LOWER;
2810         } else {
2811                 m->max_use = PAGE_ALIGN(m->max_use);
2812
2813                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2814                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2815         }
2816
2817         if (m->max_size == (uint64_t) -1) {
2818                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2819
2820                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2821                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2822         } else
2823                 m->max_size = PAGE_ALIGN(m->max_size);
2824
2825         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2826                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2827
2828         if (m->max_size*2 > m->max_use)
2829                 m->max_use = m->max_size*2;
2830
2831         if (m->min_size == (uint64_t) -1)
2832                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2833         else {
2834                 m->min_size = PAGE_ALIGN(m->min_size);
2835
2836                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2837                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2838
2839                 if (m->min_size > m->max_size)
2840                         m->max_size = m->min_size;
2841         }
2842
2843         if (m->keep_free == (uint64_t) -1) {
2844
2845                 if (fs_size > 0) {
2846                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2847
2848                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2849                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2850
2851                 } else
2852                         m->keep_free = DEFAULT_KEEP_FREE;
2853         }
2854
2855         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2856                   format_bytes(a, sizeof(a), m->max_use),
2857                   format_bytes(b, sizeof(b), m->max_size),
2858                   format_bytes(c, sizeof(c), m->min_size),
2859                   format_bytes(d, sizeof(d), m->keep_free));
2860 }
2861
2862 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2863         assert(f);
2864         assert(from || to);
2865
2866         if (from) {
2867                 if (f->header->head_entry_realtime == 0)
2868                         return -ENOENT;
2869
2870                 *from = le64toh(f->header->head_entry_realtime);
2871         }
2872
2873         if (to) {
2874                 if (f->header->tail_entry_realtime == 0)
2875                         return -ENOENT;
2876
2877                 *to = le64toh(f->header->tail_entry_realtime);
2878         }
2879
2880         return 1;
2881 }
2882
2883 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2884         Object *o;
2885         uint64_t p;
2886         int r;
2887
2888         assert(f);
2889         assert(from || to);
2890
2891         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2892         if (r <= 0)
2893                 return r;
2894
2895         if (le64toh(o->data.n_entries) <= 0)
2896                 return 0;
2897
2898         if (from) {
2899                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2900                 if (r < 0)
2901                         return r;
2902
2903                 *from = le64toh(o->entry.monotonic);
2904         }
2905
2906         if (to) {
2907                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2908                 if (r < 0)
2909                         return r;
2910
2911                 r = generic_array_get_plus_one(f,
2912                                                le64toh(o->data.entry_offset),
2913                                                le64toh(o->data.entry_array_offset),
2914                                                le64toh(o->data.n_entries)-1,
2915                                                &o, NULL);
2916                 if (r <= 0)
2917                         return r;
2918
2919                 *to = le64toh(o->entry.monotonic);
2920         }
2921
2922         return 1;
2923 }
2924
2925 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2926         assert(f);
2927
2928         /* If we gained new header fields we gained new features,
2929          * hence suggest a rotation */
2930         if (le64toh(f->header->header_size) < sizeof(Header)) {
2931                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2932                 return true;
2933         }
2934
2935         /* Let's check if the hash tables grew over a certain fill
2936          * level (75%, borrowing this value from Java's hash table
2937          * implementation), and if so suggest a rotation. To calculate
2938          * the fill level we need the n_data field, which only exists
2939          * in newer versions. */
2940
2941         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2942                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2943                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2944                                   f->path,
2945                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2946                                   le64toh(f->header->n_data),
2947                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2948                                   (unsigned long long) f->last_stat.st_size,
2949                                   f->last_stat.st_size / le64toh(f->header->n_data));
2950                         return true;
2951                 }
2952
2953         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2954                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2955                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2956                                   f->path,
2957                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2958                                   le64toh(f->header->n_fields),
2959                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2960                         return true;
2961                 }
2962
2963         /* Are the data objects properly indexed by field objects? */
2964         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2965             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2966             le64toh(f->header->n_data) > 0 &&
2967             le64toh(f->header->n_fields) == 0)
2968                 return true;
2969
2970         if (max_file_usec > 0) {
2971                 usec_t t, h;
2972
2973                 h = le64toh(f->header->head_entry_realtime);
2974                 t = now(CLOCK_REALTIME);
2975
2976                 if (h > 0 && t > h + max_file_usec)
2977                         return true;
2978         }
2979
2980         return false;
2981 }