chiark / gitweb /
journal: introduce entry array chain cache
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #ifdef HAVE_XATTR
31 #include <attr/xattr.h>
32 #endif
33
34 #include "journal-def.h"
35 #include "journal-file.h"
36 #include "journal-authenticate.h"
37 #include "lookup3.h"
38 #include "compress.h"
39 #include "fsprg.h"
40
/* Default sizes, in bytes, of the data and field hash tables. The
 * data value serves as the lower bound when the data hash table is
 * sized from max_size, the field value is used as-is. */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Data payloads of at least this many bytes are candidates for
 * compression when compression is enabled for the file */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL)                  /* 64 KiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the file
 * system size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */

/* n_data was the first entry we added after the initial file format
 * design, hence a valid header must at least reach up to that field */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20
70
71 void journal_file_close(JournalFile *f) {
72         assert(f);
73
74 #ifdef HAVE_GCRYPT
75         /* Write the final tag */
76         if (f->seal && f->writable)
77                 journal_file_append_tag(f);
78 #endif
79
80         /* Sync everything to disk, before we mark the file offline */
81         if (f->mmap && f->fd >= 0)
82                 mmap_cache_close_fd(f->mmap, f->fd);
83
84         if (f->writable && f->fd >= 0)
85                 fdatasync(f->fd);
86
87         if (f->header) {
88                 /* Mark the file offline. Don't override the archived state if it already is set */
89                 if (f->writable && f->header->state == STATE_ONLINE)
90                         f->header->state = STATE_OFFLINE;
91
92                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
93         }
94
95         if (f->fd >= 0)
96                 close_nointr_nofail(f->fd);
97
98         free(f->path);
99
100         if (f->mmap)
101                 mmap_cache_unref(f->mmap);
102
103         hashmap_free_free(f->chain_cache);
104
105 #ifdef HAVE_XZ
106         free(f->compress_buffer);
107 #endif
108
109 #ifdef HAVE_GCRYPT
110         if (f->fss_file)
111                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
112         else if (f->fsprg_state)
113                 free(f->fsprg_state);
114
115         free(f->fsprg_seed);
116
117         if (f->hmac)
118                 gcry_md_close(f->hmac);
119 #endif
120
121         free(f);
122 }
123
124 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
125         Header h;
126         ssize_t k;
127         int r;
128
129         assert(f);
130
131         zero(h);
132         memcpy(h.signature, HEADER_SIGNATURE, 8);
133         h.header_size = htole64(ALIGN64(sizeof(h)));
134
135         h.incompatible_flags =
136                 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
137
138         h.compatible_flags =
139                 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
140
141         r = sd_id128_randomize(&h.file_id);
142         if (r < 0)
143                 return r;
144
145         if (template) {
146                 h.seqnum_id = template->header->seqnum_id;
147                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
148         } else
149                 h.seqnum_id = h.file_id;
150
151         k = pwrite(f->fd, &h, sizeof(h), 0);
152         if (k < 0)
153                 return -errno;
154
155         if (k != sizeof(h))
156                 return -EIO;
157
158         return 0;
159 }
160
161 static int journal_file_refresh_header(JournalFile *f) {
162         int r;
163         sd_id128_t boot_id;
164
165         assert(f);
166
167         r = sd_id128_get_machine(&f->header->machine_id);
168         if (r < 0)
169                 return r;
170
171         r = sd_id128_get_boot(&boot_id);
172         if (r < 0)
173                 return r;
174
175         if (sd_id128_equal(boot_id, f->header->boot_id))
176                 f->tail_entry_monotonic_valid = true;
177
178         f->header->boot_id = boot_id;
179
180         f->header->state = STATE_ONLINE;
181
182         /* Sync the online state to disk */
183         msync(f->header, PAGE_ALIGN(sizeof(Header)), MS_SYNC);
184         fdatasync(f->fd);
185
186         return 0;
187 }
188
189 static int journal_file_verify_header(JournalFile *f) {
190         assert(f);
191
192         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
193                 return -EBADMSG;
194
195         /* In both read and write mode we refuse to open files with
196          * incompatible flags we don't know */
197 #ifdef HAVE_XZ
198         if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
199                 return -EPROTONOSUPPORT;
200 #else
201         if (f->header->incompatible_flags != 0)
202                 return -EPROTONOSUPPORT;
203 #endif
204
205         /* When open for writing we refuse to open files with
206          * compatible flags, too */
207         if (f->writable) {
208 #ifdef HAVE_GCRYPT
209                 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
210                         return -EPROTONOSUPPORT;
211 #else
212                 if (f->header->compatible_flags != 0)
213                         return -EPROTONOSUPPORT;
214 #endif
215         }
216
217         if (f->header->state >= _STATE_MAX)
218                 return -EBADMSG;
219
220         /* The first addition was n_data, so check that we are at least this large */
221         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
222                 return -EBADMSG;
223
224         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
225                 return -EBADMSG;
226
227         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
228                 return -ENODATA;
229
230         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
231                 return -ENODATA;
232
233         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
234             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
235             !VALID64(le64toh(f->header->tail_object_offset)) ||
236             !VALID64(le64toh(f->header->entry_array_offset)))
237                 return -ENODATA;
238
239         if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
240             le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
241             le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
242             le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
243                 return -ENODATA;
244
245         if (f->writable) {
246                 uint8_t state;
247                 sd_id128_t machine_id;
248                 int r;
249
250                 r = sd_id128_get_machine(&machine_id);
251                 if (r < 0)
252                         return r;
253
254                 if (!sd_id128_equal(machine_id, f->header->machine_id))
255                         return -EHOSTDOWN;
256
257                 state = f->header->state;
258
259                 if (state == STATE_ONLINE) {
260                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
261                         return -EBUSY;
262                 } else if (state == STATE_ARCHIVED)
263                         return -ESHUTDOWN;
264                 else if (state != STATE_OFFLINE) {
265                         log_debug("Journal file %s has unknown state %u.", f->path, state);
266                         return -EBUSY;
267                 }
268         }
269
270         f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
271
272         f->seal = JOURNAL_HEADER_SEALED(f->header);
273
274         return 0;
275 }
276
277 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
278         uint64_t old_size, new_size;
279         int r;
280
281         assert(f);
282
283         /* We assume that this file is not sparse, and we know that
284          * for sure, since we always call posix_fallocate()
285          * ourselves */
286
287         old_size =
288                 le64toh(f->header->header_size) +
289                 le64toh(f->header->arena_size);
290
291         new_size = PAGE_ALIGN(offset + size);
292         if (new_size < le64toh(f->header->header_size))
293                 new_size = le64toh(f->header->header_size);
294
295         if (new_size <= old_size)
296                 return 0;
297
298         if (f->metrics.max_size > 0 &&
299             new_size > f->metrics.max_size)
300                 return -E2BIG;
301
302         if (new_size > f->metrics.min_size &&
303             f->metrics.keep_free > 0) {
304                 struct statvfs svfs;
305
306                 if (fstatvfs(f->fd, &svfs) >= 0) {
307                         uint64_t available;
308
309                         available = svfs.f_bfree * svfs.f_bsize;
310
311                         if (available >= f->metrics.keep_free)
312                                 available -= f->metrics.keep_free;
313                         else
314                                 available = 0;
315
316                         if (new_size - old_size > available)
317                                 return -E2BIG;
318                 }
319         }
320
321         /* Note that the glibc fallocate() fallback is very
322            inefficient, hence we try to minimize the allocation area
323            as we can. */
324         r = posix_fallocate(f->fd, old_size, new_size - old_size);
325         if (r != 0)
326                 return -r;
327
328         if (fstat(f->fd, &f->last_stat) < 0)
329                 return -errno;
330
331         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
332
333         return 0;
334 }
335
336 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
337         assert(f);
338         assert(ret);
339
340         if (size <= 0)
341                 return -EINVAL;
342
343         /* Avoid SIGBUS on invalid accesses */
344         if (offset + size > (uint64_t) f->last_stat.st_size) {
345                 /* Hmm, out of range? Let's refresh the fstat() data
346                  * first, before we trust that check. */
347
348                 if (fstat(f->fd, &f->last_stat) < 0 ||
349                     offset + size > (uint64_t) f->last_stat.st_size)
350                         return -EADDRNOTAVAIL;
351         }
352
353         return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
354 }
355
356 static uint64_t minimum_header_size(Object *o) {
357
358         static uint64_t table[] = {
359                 [OBJECT_DATA] = sizeof(DataObject),
360                 [OBJECT_FIELD] = sizeof(FieldObject),
361                 [OBJECT_ENTRY] = sizeof(EntryObject),
362                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
363                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
364                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
365                 [OBJECT_TAG] = sizeof(TagObject),
366         };
367
368         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
369                 return sizeof(ObjectHeader);
370
371         return table[o->object.type];
372 }
373
374 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
375         int r;
376         void *t;
377         Object *o;
378         uint64_t s;
379         unsigned context;
380
381         assert(f);
382         assert(ret);
383
384         /* Objects may only be located at multiple of 64 bit */
385         if (!VALID64(offset))
386                 return -EFAULT;
387
388         /* One context for each type, plus one catch-all for the rest */
389         context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
390
391         r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
392         if (r < 0)
393                 return r;
394
395         o = (Object*) t;
396         s = le64toh(o->object.size);
397
398         if (s < sizeof(ObjectHeader))
399                 return -EBADMSG;
400
401         if (o->object.type <= OBJECT_UNUSED)
402                 return -EBADMSG;
403
404         if (s < minimum_header_size(o))
405                 return -EBADMSG;
406
407         if (type > 0 && o->object.type != type)
408                 return -EBADMSG;
409
410         if (s > sizeof(ObjectHeader)) {
411                 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
412                 if (r < 0)
413                         return r;
414
415                 o = (Object*) t;
416         }
417
418         *ret = o;
419         return 0;
420 }
421
422 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
423         uint64_t r;
424
425         assert(f);
426
427         r = le64toh(f->header->tail_entry_seqnum) + 1;
428
429         if (seqnum) {
430                 /* If an external seqnum counter was passed, we update
431                  * both the local and the external one, and set it to
432                  * the maximum of both */
433
434                 if (*seqnum + 1 > r)
435                         r = *seqnum + 1;
436
437                 *seqnum = r;
438         }
439
440         f->header->tail_entry_seqnum = htole64(r);
441
442         if (f->header->head_entry_seqnum == 0)
443                 f->header->head_entry_seqnum = htole64(r);
444
445         return r;
446 }
447
448 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
449         int r;
450         uint64_t p;
451         Object *tail, *o;
452         void *t;
453
454         assert(f);
455         assert(type > 0 && type < _OBJECT_TYPE_MAX);
456         assert(size >= sizeof(ObjectHeader));
457         assert(offset);
458         assert(ret);
459
460         p = le64toh(f->header->tail_object_offset);
461         if (p == 0)
462                 p = le64toh(f->header->header_size);
463         else {
464                 r = journal_file_move_to_object(f, -1, p, &tail);
465                 if (r < 0)
466                         return r;
467
468                 p += ALIGN64(le64toh(tail->object.size));
469         }
470
471         r = journal_file_allocate(f, p, size);
472         if (r < 0)
473                 return r;
474
475         r = journal_file_move_to(f, type, false, p, size, &t);
476         if (r < 0)
477                 return r;
478
479         o = (Object*) t;
480
481         zero(o->object);
482         o->object.type = type;
483         o->object.size = htole64(size);
484
485         f->header->tail_object_offset = htole64(p);
486         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
487
488         *ret = o;
489         *offset = p;
490
491         return 0;
492 }
493
494 static int journal_file_setup_data_hash_table(JournalFile *f) {
495         uint64_t s, p;
496         Object *o;
497         int r;
498
499         assert(f);
500
501         /* We estimate that we need 1 hash table entry per 768 of
502            journal file and we want to make sure we never get beyond
503            75% fill level. Calculate the hash table size for the
504            maximum file size based on these metrics. */
505
506         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
507         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
508                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
509
510         log_debug("Reserving %llu entries in hash table.", (unsigned long long) (s / sizeof(HashItem)));
511
512         r = journal_file_append_object(f,
513                                        OBJECT_DATA_HASH_TABLE,
514                                        offsetof(Object, hash_table.items) + s,
515                                        &o, &p);
516         if (r < 0)
517                 return r;
518
519         memset(o->hash_table.items, 0, s);
520
521         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
522         f->header->data_hash_table_size = htole64(s);
523
524         return 0;
525 }
526
527 static int journal_file_setup_field_hash_table(JournalFile *f) {
528         uint64_t s, p;
529         Object *o;
530         int r;
531
532         assert(f);
533
534         /* We use a fixed size hash table for the fields as this
535          * number should grow very slowly only */
536
537         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
538         r = journal_file_append_object(f,
539                                        OBJECT_FIELD_HASH_TABLE,
540                                        offsetof(Object, hash_table.items) + s,
541                                        &o, &p);
542         if (r < 0)
543                 return r;
544
545         memset(o->hash_table.items, 0, s);
546
547         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
548         f->header->field_hash_table_size = htole64(s);
549
550         return 0;
551 }
552
553 static int journal_file_map_data_hash_table(JournalFile *f) {
554         uint64_t s, p;
555         void *t;
556         int r;
557
558         assert(f);
559
560         p = le64toh(f->header->data_hash_table_offset);
561         s = le64toh(f->header->data_hash_table_size);
562
563         r = journal_file_move_to(f,
564                                  OBJECT_DATA_HASH_TABLE,
565                                  true,
566                                  p, s,
567                                  &t);
568         if (r < 0)
569                 return r;
570
571         f->data_hash_table = t;
572         return 0;
573 }
574
575 static int journal_file_map_field_hash_table(JournalFile *f) {
576         uint64_t s, p;
577         void *t;
578         int r;
579
580         assert(f);
581
582         p = le64toh(f->header->field_hash_table_offset);
583         s = le64toh(f->header->field_hash_table_size);
584
585         r = journal_file_move_to(f,
586                                  OBJECT_FIELD_HASH_TABLE,
587                                  true,
588                                  p, s,
589                                  &t);
590         if (r < 0)
591                 return r;
592
593         f->field_hash_table = t;
594         return 0;
595 }
596
597 static int journal_file_link_field(
598                 JournalFile *f,
599                 Object *o,
600                 uint64_t offset,
601                 uint64_t hash) {
602
603         uint64_t p, h;
604         int r;
605
606         assert(f);
607         assert(o);
608         assert(offset > 0);
609
610         if (o->object.type != OBJECT_FIELD)
611                 return -EINVAL;
612
613         /* This might alter the window we are looking at */
614
615         o->field.next_hash_offset = o->field.head_data_offset = 0;
616
617         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
618         p = le64toh(f->field_hash_table[h].tail_hash_offset);
619         if (p == 0)
620                 f->field_hash_table[h].head_hash_offset = htole64(offset);
621         else {
622                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
623                 if (r < 0)
624                         return r;
625
626                 o->field.next_hash_offset = htole64(offset);
627         }
628
629         f->field_hash_table[h].tail_hash_offset = htole64(offset);
630
631         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
632                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
633
634         return 0;
635 }
636
637 static int journal_file_link_data(
638                 JournalFile *f,
639                 Object *o,
640                 uint64_t offset,
641                 uint64_t hash) {
642
643         uint64_t p, h;
644         int r;
645
646         assert(f);
647         assert(o);
648         assert(offset > 0);
649
650         if (o->object.type != OBJECT_DATA)
651                 return -EINVAL;
652
653         /* This might alter the window we are looking at */
654
655         o->data.next_hash_offset = o->data.next_field_offset = 0;
656         o->data.entry_offset = o->data.entry_array_offset = 0;
657         o->data.n_entries = 0;
658
659         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
660         p = le64toh(f->data_hash_table[h].tail_hash_offset);
661         if (p == 0)
662                 /* Only entry in the hash table is easy */
663                 f->data_hash_table[h].head_hash_offset = htole64(offset);
664         else {
665                 /* Move back to the previous data object, to patch in
666                  * pointer */
667
668                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
669                 if (r < 0)
670                         return r;
671
672                 o->data.next_hash_offset = htole64(offset);
673         }
674
675         f->data_hash_table[h].tail_hash_offset = htole64(offset);
676
677         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
678                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
679
680         return 0;
681 }
682
683 int journal_file_find_field_object_with_hash(
684                 JournalFile *f,
685                 const void *field, uint64_t size, uint64_t hash,
686                 Object **ret, uint64_t *offset) {
687
688         uint64_t p, osize, h;
689         int r;
690
691         assert(f);
692         assert(field && size > 0);
693
694         osize = offsetof(Object, field.payload) + size;
695
696         if (f->header->field_hash_table_size == 0)
697                 return -EBADMSG;
698
699         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
700         p = le64toh(f->field_hash_table[h].head_hash_offset);
701
702         while (p > 0) {
703                 Object *o;
704
705                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
706                 if (r < 0)
707                         return r;
708
709                 if (le64toh(o->field.hash) == hash &&
710                     le64toh(o->object.size) == osize &&
711                     memcmp(o->field.payload, field, size) == 0) {
712
713                         if (ret)
714                                 *ret = o;
715                         if (offset)
716                                 *offset = p;
717
718                         return 1;
719                 }
720
721                 p = le64toh(o->field.next_hash_offset);
722         }
723
724         return 0;
725 }
726
727 int journal_file_find_field_object(
728                 JournalFile *f,
729                 const void *field, uint64_t size,
730                 Object **ret, uint64_t *offset) {
731
732         uint64_t hash;
733
734         assert(f);
735         assert(field && size > 0);
736
737         hash = hash64(field, size);
738
739         return journal_file_find_field_object_with_hash(f,
740                                                         field, size, hash,
741                                                         ret, offset);
742 }
743
744 int journal_file_find_data_object_with_hash(
745                 JournalFile *f,
746                 const void *data, uint64_t size, uint64_t hash,
747                 Object **ret, uint64_t *offset) {
748
749         uint64_t p, osize, h;
750         int r;
751
752         assert(f);
753         assert(data || size == 0);
754
755         osize = offsetof(Object, data.payload) + size;
756
757         if (f->header->data_hash_table_size == 0)
758                 return -EBADMSG;
759
760         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
761         p = le64toh(f->data_hash_table[h].head_hash_offset);
762
763         while (p > 0) {
764                 Object *o;
765
766                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
767                 if (r < 0)
768                         return r;
769
770                 if (le64toh(o->data.hash) != hash)
771                         goto next;
772
773                 if (o->object.flags & OBJECT_COMPRESSED) {
774 #ifdef HAVE_XZ
775                         uint64_t l, rsize;
776
777                         l = le64toh(o->object.size);
778                         if (l <= offsetof(Object, data.payload))
779                                 return -EBADMSG;
780
781                         l -= offsetof(Object, data.payload);
782
783                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
784                                 return -EBADMSG;
785
786                         if (rsize == size &&
787                             memcmp(f->compress_buffer, data, size) == 0) {
788
789                                 if (ret)
790                                         *ret = o;
791
792                                 if (offset)
793                                         *offset = p;
794
795                                 return 1;
796                         }
797 #else
798                         return -EPROTONOSUPPORT;
799 #endif
800
801                 } else if (le64toh(o->object.size) == osize &&
802                            memcmp(o->data.payload, data, size) == 0) {
803
804                         if (ret)
805                                 *ret = o;
806
807                         if (offset)
808                                 *offset = p;
809
810                         return 1;
811                 }
812
813         next:
814                 p = le64toh(o->data.next_hash_offset);
815         }
816
817         return 0;
818 }
819
820 int journal_file_find_data_object(
821                 JournalFile *f,
822                 const void *data, uint64_t size,
823                 Object **ret, uint64_t *offset) {
824
825         uint64_t hash;
826
827         assert(f);
828         assert(data || size == 0);
829
830         hash = hash64(data, size);
831
832         return journal_file_find_data_object_with_hash(f,
833                                                        data, size, hash,
834                                                        ret, offset);
835 }
836
837 static int journal_file_append_field(
838                 JournalFile *f,
839                 const void *field, uint64_t size,
840                 Object **ret, uint64_t *offset) {
841
842         uint64_t hash, p;
843         uint64_t osize;
844         Object *o;
845         int r;
846
847         assert(f);
848         assert(field && size > 0);
849
850         hash = hash64(field, size);
851
852         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
853         if (r < 0)
854                 return r;
855         else if (r > 0) {
856
857                 if (ret)
858                         *ret = o;
859
860                 if (offset)
861                         *offset = p;
862
863                 return 0;
864         }
865
866         osize = offsetof(Object, field.payload) + size;
867         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
868
869         o->field.hash = htole64(hash);
870         memcpy(o->field.payload, field, size);
871
872         r = journal_file_link_field(f, o, p, hash);
873         if (r < 0)
874                 return r;
875
876         /* The linking might have altered the window, so let's
877          * refresh our pointer */
878         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
879         if (r < 0)
880                 return r;
881
882 #ifdef HAVE_GCRYPT
883         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
884         if (r < 0)
885                 return r;
886 #endif
887
888         if (ret)
889                 *ret = o;
890
891         if (offset)
892                 *offset = p;
893
894         return 0;
895 }
896
897 static int journal_file_append_data(
898                 JournalFile *f,
899                 const void *data, uint64_t size,
900                 Object **ret, uint64_t *offset) {
901
902         uint64_t hash, p;
903         uint64_t osize;
904         Object *o;
905         int r;
906         bool compressed = false;
907         const void *eq;
908
909         assert(f);
910         assert(data || size == 0);
911
912         hash = hash64(data, size);
913
914         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
915         if (r < 0)
916                 return r;
917         else if (r > 0) {
918
919                 if (ret)
920                         *ret = o;
921
922                 if (offset)
923                         *offset = p;
924
925                 return 0;
926         }
927
928         osize = offsetof(Object, data.payload) + size;
929         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
930         if (r < 0)
931                 return r;
932
933         o->data.hash = htole64(hash);
934
935 #ifdef HAVE_XZ
936         if (f->compress &&
937             size >= COMPRESSION_SIZE_THRESHOLD) {
938                 uint64_t rsize;
939
940                 compressed = compress_blob(data, size, o->data.payload, &rsize);
941
942                 if (compressed) {
943                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
944                         o->object.flags |= OBJECT_COMPRESSED;
945
946                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
947                 }
948         }
949 #endif
950
951         if (!compressed && size > 0)
952                 memcpy(o->data.payload, data, size);
953
954         r = journal_file_link_data(f, o, p, hash);
955         if (r < 0)
956                 return r;
957
958         /* The linking might have altered the window, so let's
959          * refresh our pointer */
960         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
961         if (r < 0)
962                 return r;
963
964         eq = memchr(data, '=', size);
965         if (eq && eq > data) {
966                 uint64_t fp;
967                 Object *fo;
968
969                 /* Create field object ... */
970                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
971                 if (r < 0)
972                         return r;
973
974                 /* ... and link it in. */
975                 o->data.next_field_offset = fo->field.head_data_offset;
976                 fo->field.head_data_offset = le64toh(p);
977         }
978
979 #ifdef HAVE_GCRYPT
980         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
981         if (r < 0)
982                 return r;
983 #endif
984
985         if (ret)
986                 *ret = o;
987
988         if (offset)
989                 *offset = p;
990
991         return 0;
992 }
993
994 uint64_t journal_file_entry_n_items(Object *o) {
995         assert(o);
996
997         if (o->object.type != OBJECT_ENTRY)
998                 return 0;
999
1000         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1001 }
1002
1003 uint64_t journal_file_entry_array_n_items(Object *o) {
1004         assert(o);
1005
1006         if (o->object.type != OBJECT_ENTRY_ARRAY)
1007                 return 0;
1008
1009         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1010 }
1011
1012 uint64_t journal_file_hash_table_n_items(Object *o) {
1013         assert(o);
1014
1015         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1016             o->object.type != OBJECT_FIELD_HASH_TABLE)
1017                 return 0;
1018
1019         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1020 }
1021
1022 static int link_entry_into_array(JournalFile *f,
1023                                  le64_t *first,
1024                                  le64_t *idx,
1025                                  uint64_t p) {
1026         int r;
1027         uint64_t n = 0, ap = 0, q, i, a, hidx;
1028         Object *o;
1029
1030         assert(f);
1031         assert(first);
1032         assert(idx);
1033         assert(p > 0);
1034
1035         a = le64toh(*first);
1036         i = hidx = le64toh(*idx);
1037         while (a > 0) {
1038
1039                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1040                 if (r < 0)
1041                         return r;
1042
1043                 n = journal_file_entry_array_n_items(o);
1044                 if (i < n) {
1045                         o->entry_array.items[i] = htole64(p);
1046                         *idx = htole64(hidx + 1);
1047                         return 0;
1048                 }
1049
1050                 i -= n;
1051                 ap = a;
1052                 a = le64toh(o->entry_array.next_entry_array_offset);
1053         }
1054
1055         if (hidx > n)
1056                 n = (hidx+1) * 2;
1057         else
1058                 n = n * 2;
1059
1060         if (n < 4)
1061                 n = 4;
1062
1063         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1064                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1065                                        &o, &q);
1066         if (r < 0)
1067                 return r;
1068
1069 #ifdef HAVE_GCRYPT
1070         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1071         if (r < 0)
1072                 return r;
1073 #endif
1074
1075         o->entry_array.items[i] = htole64(p);
1076
1077         if (ap == 0)
1078                 *first = htole64(q);
1079         else {
1080                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1081                 if (r < 0)
1082                         return r;
1083
1084                 o->entry_array.next_entry_array_offset = htole64(q);
1085         }
1086
1087         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1088                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1089
1090         *idx = htole64(hidx + 1);
1091
1092         return 0;
1093 }
1094
1095 static int link_entry_into_array_plus_one(JournalFile *f,
1096                                           le64_t *extra,
1097                                           le64_t *first,
1098                                           le64_t *idx,
1099                                           uint64_t p) {
1100
1101         int r;
1102
1103         assert(f);
1104         assert(extra);
1105         assert(first);
1106         assert(idx);
1107         assert(p > 0);
1108
1109         if (*idx == 0)
1110                 *extra = htole64(p);
1111         else {
1112                 le64_t i;
1113
1114                 i = htole64(le64toh(*idx) - 1);
1115                 r = link_entry_into_array(f, first, &i, p);
1116                 if (r < 0)
1117                         return r;
1118         }
1119
1120         *idx = htole64(le64toh(*idx) + 1);
1121         return 0;
1122 }
1123
1124 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1125         uint64_t p;
1126         int r;
1127         assert(f);
1128         assert(o);
1129         assert(offset > 0);
1130
1131         p = le64toh(o->entry.items[i].object_offset);
1132         if (p == 0)
1133                 return -EINVAL;
1134
1135         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1136         if (r < 0)
1137                 return r;
1138
1139         return link_entry_into_array_plus_one(f,
1140                                               &o->data.entry_offset,
1141                                               &o->data.entry_array_offset,
1142                                               &o->data.n_entries,
1143                                               offset);
1144 }
1145
1146 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1147         uint64_t n, i;
1148         int r;
1149
1150         assert(f);
1151         assert(o);
1152         assert(offset > 0);
1153
1154         if (o->object.type != OBJECT_ENTRY)
1155                 return -EINVAL;
1156
1157         __sync_synchronize();
1158
1159         /* Link up the entry itself */
1160         r = link_entry_into_array(f,
1161                                   &f->header->entry_array_offset,
1162                                   &f->header->n_entries,
1163                                   offset);
1164         if (r < 0)
1165                 return r;
1166
1167         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
1168
1169         if (f->header->head_entry_realtime == 0)
1170                 f->header->head_entry_realtime = o->entry.realtime;
1171
1172         f->header->tail_entry_realtime = o->entry.realtime;
1173         f->header->tail_entry_monotonic = o->entry.monotonic;
1174
1175         f->tail_entry_monotonic_valid = true;
1176
1177         /* Link up the items */
1178         n = journal_file_entry_n_items(o);
1179         for (i = 0; i < n; i++) {
1180                 r = journal_file_link_entry_item(f, o, offset, i);
1181                 if (r < 0)
1182                         return r;
1183         }
1184
1185         return 0;
1186 }
1187
1188 static int journal_file_append_entry_internal(
1189                 JournalFile *f,
1190                 const dual_timestamp *ts,
1191                 uint64_t xor_hash,
1192                 const EntryItem items[], unsigned n_items,
1193                 uint64_t *seqnum,
1194                 Object **ret, uint64_t *offset) {
1195         uint64_t np;
1196         uint64_t osize;
1197         Object *o;
1198         int r;
1199
1200         assert(f);
1201         assert(items || n_items == 0);
1202         assert(ts);
1203
1204         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1205
1206         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1207         if (r < 0)
1208                 return r;
1209
1210         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1211         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1212         o->entry.realtime = htole64(ts->realtime);
1213         o->entry.monotonic = htole64(ts->monotonic);
1214         o->entry.xor_hash = htole64(xor_hash);
1215         o->entry.boot_id = f->header->boot_id;
1216
1217 #ifdef HAVE_GCRYPT
1218         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1219         if (r < 0)
1220                 return r;
1221 #endif
1222
1223         r = journal_file_link_entry(f, o, np);
1224         if (r < 0)
1225                 return r;
1226
1227         if (ret)
1228                 *ret = o;
1229
1230         if (offset)
1231                 *offset = np;
1232
1233         return 0;
1234 }
1235
1236 void journal_file_post_change(JournalFile *f) {
1237         assert(f);
1238
1239         /* inotify() does not receive IN_MODIFY events from file
1240          * accesses done via mmap(). After each access we hence
1241          * trigger IN_MODIFY by truncating the journal file to its
1242          * current size which triggers IN_MODIFY. */
1243
1244         __sync_synchronize();
1245
1246         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1247                 log_error("Failed to truncate file to its own size: %m");
1248 }
1249
1250 static int entry_item_cmp(const void *_a, const void *_b) {
1251         const EntryItem *a = _a, *b = _b;
1252
1253         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1254                 return -1;
1255         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1256                 return 1;
1257         return 0;
1258 }
1259
1260 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1261         unsigned i;
1262         EntryItem *items;
1263         int r;
1264         uint64_t xor_hash = 0;
1265         struct dual_timestamp _ts;
1266
1267         assert(f);
1268         assert(iovec || n_iovec == 0);
1269
1270         if (!f->writable)
1271                 return -EPERM;
1272
1273         if (!ts) {
1274                 dual_timestamp_get(&_ts);
1275                 ts = &_ts;
1276         }
1277
1278         if (f->tail_entry_monotonic_valid &&
1279             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1280                 return -EINVAL;
1281
1282 #ifdef HAVE_GCRYPT
1283         r = journal_file_maybe_append_tag(f, ts->realtime);
1284         if (r < 0)
1285                 return r;
1286 #endif
1287
1288         /* alloca() can't take 0, hence let's allocate at least one */
1289         items = alloca(sizeof(EntryItem) * MAX(1, n_iovec));
1290
1291         for (i = 0; i < n_iovec; i++) {
1292                 uint64_t p;
1293                 Object *o;
1294
1295                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1296                 if (r < 0)
1297                         return r;
1298
1299                 xor_hash ^= le64toh(o->data.hash);
1300                 items[i].object_offset = htole64(p);
1301                 items[i].hash = o->data.hash;
1302         }
1303
1304         /* Order by the position on disk, in order to improve seek
1305          * times for rotating media. */
1306         qsort(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1307
1308         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1309
1310         journal_file_post_change(f);
1311
1312         return r;
1313 }
1314
typedef struct ChainCacheItem ChainCacheItem;

/* One cache entry remembering how far into an entry array chain a
 * previous lookup got. */
struct ChainCacheItem {
        /* Offset of the array at the beginning of the chain; also used as hashmap key */
        uint64_t first;

        /* Offset of the cached array within that chain */
        uint64_t array;

        /* The first item stored in the cached array */
        uint64_t begin;

        /* Total number of items in all arrays of the chain before the cached one */
        uint64_t total;
};
1321
1322 static void chain_cache_put(
1323                 Hashmap *h,
1324                 ChainCacheItem *ci,
1325                 uint64_t first,
1326                 uint64_t array,
1327                 uint64_t begin,
1328                 uint64_t total) {
1329
1330         if (!ci) {
1331                 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1332                         ci = hashmap_steal_first(h);
1333                 else {
1334                         ci = new(ChainCacheItem, 1);
1335                         if (!ci)
1336                                 return;
1337                 }
1338
1339                 ci->first = first;
1340
1341                 if (hashmap_put(h, &ci->first, ci) < 0) {
1342                         free(ci);
1343                         return;
1344                 }
1345         } else
1346                 assert(ci->first == first);
1347
1348         ci->array = array;
1349         ci->begin = begin;
1350         ci->total = total;
1351 }
1352
1353 static int generic_array_get(JournalFile *f,
1354                              uint64_t first,
1355                              uint64_t i,
1356                              Object **ret, uint64_t *offset) {
1357
1358         Object *o;
1359         uint64_t p = 0, a, t = 0;
1360         int r;
1361         ChainCacheItem *ci;
1362
1363         assert(f);
1364
1365         a = first;
1366
1367         /* Try the chain cache first */
1368         ci = hashmap_get(f->chain_cache, &first);
1369         if (ci && i > ci->total) {
1370                 a = ci->array;
1371                 i -= ci->total;
1372                 t = ci->total;
1373         }
1374
1375         while (a > 0) {
1376                 uint64_t k;
1377
1378                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1379                 if (r < 0)
1380                         return r;
1381
1382                 k = journal_file_entry_array_n_items(o);
1383                 if (i < k) {
1384                         p = le64toh(o->entry_array.items[i]);
1385                         goto found;
1386                 }
1387
1388                 i -= k;
1389                 t += k;
1390                 a = le64toh(o->entry_array.next_entry_array_offset);
1391         }
1392
1393         return 0;
1394
1395 found:
1396         /* Let's cache this item for the next invocation */
1397         chain_cache_put(f->chain_cache, ci, first, a, o->entry_array.items[0], t);
1398
1399         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1400         if (r < 0)
1401                 return r;
1402
1403         if (ret)
1404                 *ret = o;
1405
1406         if (offset)
1407                 *offset = p;
1408
1409         return 1;
1410 }
1411
1412 static int generic_array_get_plus_one(JournalFile *f,
1413                                       uint64_t extra,
1414                                       uint64_t first,
1415                                       uint64_t i,
1416                                       Object **ret, uint64_t *offset) {
1417
1418         Object *o;
1419
1420         assert(f);
1421
1422         if (i == 0) {
1423                 int r;
1424
1425                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1426                 if (r < 0)
1427                         return r;
1428
1429                 if (ret)
1430                         *ret = o;
1431
1432                 if (offset)
1433                         *offset = extra;
1434
1435                 return 1;
1436         }
1437
1438         return generic_array_get(f, first, i-1, ret, offset);
1439 }
1440
/* Results of the test_object callbacks used for bisection */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle */
        TEST_LEFT  = 1, /* the tested object is smaller than the needle */
        TEST_RIGHT = 2  /* the tested object is larger than the needle */
};
1446
1447 static int generic_array_bisect(JournalFile *f,
1448                                 uint64_t first,
1449                                 uint64_t n,
1450                                 uint64_t needle,
1451                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1452                                 direction_t direction,
1453                                 Object **ret,
1454                                 uint64_t *offset,
1455                                 uint64_t *idx) {
1456
1457         uint64_t a, p, t = 0, i = 0, last_p = 0;
1458         bool subtract_one = false;
1459         Object *o, *array = NULL;
1460         int r;
1461         ChainCacheItem *ci;
1462
1463         assert(f);
1464         assert(test_object);
1465
1466         /* Start with the first array in the chain */
1467         a = first;
1468
1469         ci = hashmap_get(f->chain_cache, &first);
1470         if (ci && n > ci->total) {
1471                 /* Ah, we have iterated this bisection array chain
1472                  * previously! Let's see if we can skip ahead in the
1473                  * chain, as far as the last time. But we can't jump
1474                  * backwards in the chain, so let's check that
1475                  * first. */
1476
1477                 r = test_object(f, ci->begin, needle);
1478                 if (r < 0)
1479                         return r;
1480
1481                 if (r == TEST_LEFT) {
1482                         /* OK, what we are looking for is right of th
1483                          * begin of this EntryArray, so let's jump
1484                          * straight to previously cached array in the
1485                          * chain */
1486
1487                         a = ci->array;
1488                         n -= ci->total;
1489                         t = ci->total;
1490                 }
1491         }
1492
1493         while (a > 0) {
1494                 uint64_t left, right, k, lp;
1495
1496                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1497                 if (r < 0)
1498                         return r;
1499
1500                 k = journal_file_entry_array_n_items(array);
1501                 right = MIN(k, n);
1502                 if (right <= 0)
1503                         return 0;
1504
1505                 i = right - 1;
1506                 lp = p = le64toh(array->entry_array.items[i]);
1507                 if (p <= 0)
1508                         return -EBADMSG;
1509
1510                 r = test_object(f, p, needle);
1511                 if (r < 0)
1512                         return r;
1513
1514                 if (r == TEST_FOUND)
1515                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1516
1517                 if (r == TEST_RIGHT) {
1518                         left = 0;
1519                         right -= 1;
1520                         for (;;) {
1521                                 if (left == right) {
1522                                         if (direction == DIRECTION_UP)
1523                                                 subtract_one = true;
1524
1525                                         i = left;
1526                                         goto found;
1527                                 }
1528
1529                                 assert(left < right);
1530
1531                                 i = (left + right) / 2;
1532                                 p = le64toh(array->entry_array.items[i]);
1533                                 if (p <= 0)
1534                                         return -EBADMSG;
1535
1536                                 r = test_object(f, p, needle);
1537                                 if (r < 0)
1538                                         return r;
1539
1540                                 if (r == TEST_FOUND)
1541                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1542
1543                                 if (r == TEST_RIGHT)
1544                                         right = i;
1545                                 else
1546                                         left = i + 1;
1547                         }
1548                 }
1549
1550                 if (k > n) {
1551                         if (direction == DIRECTION_UP) {
1552                                 i = n;
1553                                 subtract_one = true;
1554                                 goto found;
1555                         }
1556
1557                         return 0;
1558                 }
1559
1560                 last_p = lp;
1561
1562                 n -= k;
1563                 t += k;
1564                 a = le64toh(array->entry_array.next_entry_array_offset);
1565         }
1566
1567         return 0;
1568
1569 found:
1570         if (subtract_one && t == 0 && i == 0)
1571                 return 0;
1572
1573         /* Let's cache this item for the next invocation */
1574         chain_cache_put(f->chain_cache, ci, first, a, array->entry_array.items[0], t);
1575
1576         if (subtract_one && i == 0)
1577                 p = last_p;
1578         else if (subtract_one)
1579                 p = le64toh(array->entry_array.items[i-1]);
1580         else
1581                 p = le64toh(array->entry_array.items[i]);
1582
1583         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1584         if (r < 0)
1585                 return r;
1586
1587         if (ret)
1588                 *ret = o;
1589
1590         if (offset)
1591                 *offset = p;
1592
1593         if (idx)
1594                 *idx = t + i + (subtract_one ? -1 : 0);
1595
1596         return 1;
1597 }
1598
1599 static int generic_array_bisect_plus_one(JournalFile *f,
1600                                          uint64_t extra,
1601                                          uint64_t first,
1602                                          uint64_t n,
1603                                          uint64_t needle,
1604                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1605                                          direction_t direction,
1606                                          Object **ret,
1607                                          uint64_t *offset,
1608                                          uint64_t *idx) {
1609
1610         int r;
1611         bool step_back = false;
1612         Object *o;
1613
1614         assert(f);
1615         assert(test_object);
1616
1617         if (n <= 0)
1618                 return 0;
1619
1620         /* This bisects the array in object 'first', but first checks
1621          * an extra  */
1622         r = test_object(f, extra, needle);
1623         if (r < 0)
1624                 return r;
1625
1626         if (r == TEST_FOUND)
1627                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1628
1629         /* if we are looking with DIRECTION_UP then we need to first
1630            see if in the actual array there is a matching entry, and
1631            return the last one of that. But if there isn't any we need
1632            to return this one. Hence remember this, and return it
1633            below. */
1634         if (r == TEST_LEFT)
1635                 step_back = direction == DIRECTION_UP;
1636
1637         if (r == TEST_RIGHT) {
1638                 if (direction == DIRECTION_DOWN)
1639                         goto found;
1640                 else
1641                         return 0;
1642         }
1643
1644         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1645
1646         if (r == 0 && step_back)
1647                 goto found;
1648
1649         if (r > 0 && idx)
1650                 (*idx) ++;
1651
1652         return r;
1653
1654 found:
1655         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1656         if (r < 0)
1657                 return r;
1658
1659         if (ret)
1660                 *ret = o;
1661
1662         if (offset)
1663                 *offset = extra;
1664
1665         if (idx)
1666                 *idx = 0;
1667
1668         return 1;
1669 }
1670
1671 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1672         assert(f);
1673         assert(p > 0);
1674
1675         if (p == needle)
1676                 return TEST_FOUND;
1677         else if (p < needle)
1678                 return TEST_LEFT;
1679         else
1680                 return TEST_RIGHT;
1681 }
1682
1683 int journal_file_move_to_entry_by_offset(
1684                 JournalFile *f,
1685                 uint64_t p,
1686                 direction_t direction,
1687                 Object **ret,
1688                 uint64_t *offset) {
1689
1690         return generic_array_bisect(f,
1691                                     le64toh(f->header->entry_array_offset),
1692                                     le64toh(f->header->n_entries),
1693                                     p,
1694                                     test_object_offset,
1695                                     direction,
1696                                     ret, offset, NULL);
1697 }
1698
1699
1700 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1701         Object *o;
1702         int r;
1703
1704         assert(f);
1705         assert(p > 0);
1706
1707         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1708         if (r < 0)
1709                 return r;
1710
1711         if (le64toh(o->entry.seqnum) == needle)
1712                 return TEST_FOUND;
1713         else if (le64toh(o->entry.seqnum) < needle)
1714                 return TEST_LEFT;
1715         else
1716                 return TEST_RIGHT;
1717 }
1718
1719 int journal_file_move_to_entry_by_seqnum(
1720                 JournalFile *f,
1721                 uint64_t seqnum,
1722                 direction_t direction,
1723                 Object **ret,
1724                 uint64_t *offset) {
1725
1726         return generic_array_bisect(f,
1727                                     le64toh(f->header->entry_array_offset),
1728                                     le64toh(f->header->n_entries),
1729                                     seqnum,
1730                                     test_object_seqnum,
1731                                     direction,
1732                                     ret, offset, NULL);
1733 }
1734
1735 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1736         Object *o;
1737         int r;
1738
1739         assert(f);
1740         assert(p > 0);
1741
1742         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1743         if (r < 0)
1744                 return r;
1745
1746         if (le64toh(o->entry.realtime) == needle)
1747                 return TEST_FOUND;
1748         else if (le64toh(o->entry.realtime) < needle)
1749                 return TEST_LEFT;
1750         else
1751                 return TEST_RIGHT;
1752 }
1753
1754 int journal_file_move_to_entry_by_realtime(
1755                 JournalFile *f,
1756                 uint64_t realtime,
1757                 direction_t direction,
1758                 Object **ret,
1759                 uint64_t *offset) {
1760
1761         return generic_array_bisect(f,
1762                                     le64toh(f->header->entry_array_offset),
1763                                     le64toh(f->header->n_entries),
1764                                     realtime,
1765                                     test_object_realtime,
1766                                     direction,
1767                                     ret, offset, NULL);
1768 }
1769
1770 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1771         Object *o;
1772         int r;
1773
1774         assert(f);
1775         assert(p > 0);
1776
1777         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1778         if (r < 0)
1779                 return r;
1780
1781         if (le64toh(o->entry.monotonic) == needle)
1782                 return TEST_FOUND;
1783         else if (le64toh(o->entry.monotonic) < needle)
1784                 return TEST_LEFT;
1785         else
1786                 return TEST_RIGHT;
1787 }
1788
1789 int journal_file_move_to_entry_by_monotonic(
1790                 JournalFile *f,
1791                 sd_id128_t boot_id,
1792                 uint64_t monotonic,
1793                 direction_t direction,
1794                 Object **ret,
1795                 uint64_t *offset) {
1796
1797         char t[9+32+1] = "_BOOT_ID=";
1798         Object *o;
1799         int r;
1800
1801         assert(f);
1802
1803         sd_id128_to_string(boot_id, t + 9);
1804         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1805         if (r < 0)
1806                 return r;
1807         if (r == 0)
1808                 return -ENOENT;
1809
1810         return generic_array_bisect_plus_one(f,
1811                                              le64toh(o->data.entry_offset),
1812                                              le64toh(o->data.entry_array_offset),
1813                                              le64toh(o->data.n_entries),
1814                                              monotonic,
1815                                              test_object_monotonic,
1816                                              direction,
1817                                              ret, offset, NULL);
1818 }
1819
1820 int journal_file_next_entry(
1821                 JournalFile *f,
1822                 Object *o, uint64_t p,
1823                 direction_t direction,
1824                 Object **ret, uint64_t *offset) {
1825
1826         uint64_t i, n;
1827         int r;
1828
1829         assert(f);
1830         assert(p > 0 || !o);
1831
1832         n = le64toh(f->header->n_entries);
1833         if (n <= 0)
1834                 return 0;
1835
1836         if (!o)
1837                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1838         else {
1839                 if (o->object.type != OBJECT_ENTRY)
1840                         return -EINVAL;
1841
1842                 r = generic_array_bisect(f,
1843                                          le64toh(f->header->entry_array_offset),
1844                                          le64toh(f->header->n_entries),
1845                                          p,
1846                                          test_object_offset,
1847                                          DIRECTION_DOWN,
1848                                          NULL, NULL,
1849                                          &i);
1850                 if (r <= 0)
1851                         return r;
1852
1853                 if (direction == DIRECTION_DOWN) {
1854                         if (i >= n - 1)
1855                                 return 0;
1856
1857                         i++;
1858                 } else {
1859                         if (i <= 0)
1860                                 return 0;
1861
1862                         i--;
1863                 }
1864         }
1865
1866         /* And jump to it */
1867         return generic_array_get(f,
1868                                  le64toh(f->header->entry_array_offset),
1869                                  i,
1870                                  ret, offset);
1871 }
1872
1873 int journal_file_skip_entry(
1874                 JournalFile *f,
1875                 Object *o, uint64_t p,
1876                 int64_t skip,
1877                 Object **ret, uint64_t *offset) {
1878
1879         uint64_t i, n;
1880         int r;
1881
1882         assert(f);
1883         assert(o);
1884         assert(p > 0);
1885
1886         if (o->object.type != OBJECT_ENTRY)
1887                 return -EINVAL;
1888
1889         r = generic_array_bisect(f,
1890                                  le64toh(f->header->entry_array_offset),
1891                                  le64toh(f->header->n_entries),
1892                                  p,
1893                                  test_object_offset,
1894                                  DIRECTION_DOWN,
1895                                  NULL, NULL,
1896                                  &i);
1897         if (r <= 0)
1898                 return r;
1899
1900         /* Calculate new index */
1901         if (skip < 0) {
1902                 if ((uint64_t) -skip >= i)
1903                         i = 0;
1904                 else
1905                         i = i - (uint64_t) -skip;
1906         } else
1907                 i  += (uint64_t) skip;
1908
1909         n = le64toh(f->header->n_entries);
1910         if (n <= 0)
1911                 return -EBADMSG;
1912
1913         if (i >= n)
1914                 i = n-1;
1915
1916         return generic_array_get(f,
1917                                  le64toh(f->header->entry_array_offset),
1918                                  i,
1919                                  ret, offset);
1920 }
1921
1922 int journal_file_next_entry_for_data(
1923                 JournalFile *f,
1924                 Object *o, uint64_t p,
1925                 uint64_t data_offset,
1926                 direction_t direction,
1927                 Object **ret, uint64_t *offset) {
1928
1929         uint64_t n, i;
1930         int r;
1931         Object *d;
1932
1933         assert(f);
1934         assert(p > 0 || !o);
1935
1936         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1937         if (r < 0)
1938                 return r;
1939
1940         n = le64toh(d->data.n_entries);
1941         if (n <= 0)
1942                 return n;
1943
1944         if (!o)
1945                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1946         else {
1947                 if (o->object.type != OBJECT_ENTRY)
1948                         return -EINVAL;
1949
1950                 r = generic_array_bisect_plus_one(f,
1951                                                   le64toh(d->data.entry_offset),
1952                                                   le64toh(d->data.entry_array_offset),
1953                                                   le64toh(d->data.n_entries),
1954                                                   p,
1955                                                   test_object_offset,
1956                                                   DIRECTION_DOWN,
1957                                                   NULL, NULL,
1958                                                   &i);
1959
1960                 if (r <= 0)
1961                         return r;
1962
1963                 if (direction == DIRECTION_DOWN) {
1964                         if (i >= n - 1)
1965                                 return 0;
1966
1967                         i++;
1968                 } else {
1969                         if (i <= 0)
1970                                 return 0;
1971
1972                         i--;
1973                 }
1974
1975         }
1976
1977         return generic_array_get_plus_one(f,
1978                                           le64toh(d->data.entry_offset),
1979                                           le64toh(d->data.entry_array_offset),
1980                                           i,
1981                                           ret, offset);
1982 }
1983
1984 int journal_file_move_to_entry_by_offset_for_data(
1985                 JournalFile *f,
1986                 uint64_t data_offset,
1987                 uint64_t p,
1988                 direction_t direction,
1989                 Object **ret, uint64_t *offset) {
1990
1991         int r;
1992         Object *d;
1993
1994         assert(f);
1995
1996         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1997         if (r < 0)
1998                 return r;
1999
2000         return generic_array_bisect_plus_one(f,
2001                                              le64toh(d->data.entry_offset),
2002                                              le64toh(d->data.entry_array_offset),
2003                                              le64toh(d->data.n_entries),
2004                                              p,
2005                                              test_object_offset,
2006                                              direction,
2007                                              ret, offset, NULL);
2008 }
2009
2010 int journal_file_move_to_entry_by_monotonic_for_data(
2011                 JournalFile *f,
2012                 uint64_t data_offset,
2013                 sd_id128_t boot_id,
2014                 uint64_t monotonic,
2015                 direction_t direction,
2016                 Object **ret, uint64_t *offset) {
2017
2018         char t[9+32+1] = "_BOOT_ID=";
2019         Object *o, *d;
2020         int r;
2021         uint64_t b, z;
2022
2023         assert(f);
2024
2025         /* First, seek by time */
2026         sd_id128_to_string(boot_id, t + 9);
2027         r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
2028         if (r < 0)
2029                 return r;
2030         if (r == 0)
2031                 return -ENOENT;
2032
2033         r = generic_array_bisect_plus_one(f,
2034                                           le64toh(o->data.entry_offset),
2035                                           le64toh(o->data.entry_array_offset),
2036                                           le64toh(o->data.n_entries),
2037                                           monotonic,
2038                                           test_object_monotonic,
2039                                           direction,
2040                                           NULL, &z, NULL);
2041         if (r <= 0)
2042                 return r;
2043
2044         /* And now, continue seeking until we find an entry that
2045          * exists in both bisection arrays */
2046
2047         for (;;) {
2048                 Object *qo;
2049                 uint64_t p, q;
2050
2051                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2052                 if (r < 0)
2053                         return r;
2054
2055                 r = generic_array_bisect_plus_one(f,
2056                                                   le64toh(d->data.entry_offset),
2057                                                   le64toh(d->data.entry_array_offset),
2058                                                   le64toh(d->data.n_entries),
2059                                                   z,
2060                                                   test_object_offset,
2061                                                   direction,
2062                                                   NULL, &p, NULL);
2063                 if (r <= 0)
2064                         return r;
2065
2066                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2067                 if (r < 0)
2068                         return r;
2069
2070                 r = generic_array_bisect_plus_one(f,
2071                                                   le64toh(o->data.entry_offset),
2072                                                   le64toh(o->data.entry_array_offset),
2073                                                   le64toh(o->data.n_entries),
2074                                                   p,
2075                                                   test_object_offset,
2076                                                   direction,
2077                                                   &qo, &q, NULL);
2078
2079                 if (r <= 0)
2080                         return r;
2081
2082                 if (p == q) {
2083                         if (ret)
2084                                 *ret = qo;
2085                         if (offset)
2086                                 *offset = q;
2087
2088                         return 1;
2089                 }
2090
2091                 z = q;
2092         }
2093
2094         return 0;
2095 }
2096
2097 int journal_file_move_to_entry_by_seqnum_for_data(
2098                 JournalFile *f,
2099                 uint64_t data_offset,
2100                 uint64_t seqnum,
2101                 direction_t direction,
2102                 Object **ret, uint64_t *offset) {
2103
2104         Object *d;
2105         int r;
2106
2107         assert(f);
2108
2109         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2110         if (r < 0)
2111                 return r;
2112
2113         return generic_array_bisect_plus_one(f,
2114                                              le64toh(d->data.entry_offset),
2115                                              le64toh(d->data.entry_array_offset),
2116                                              le64toh(d->data.n_entries),
2117                                              seqnum,
2118                                              test_object_seqnum,
2119                                              direction,
2120                                              ret, offset, NULL);
2121 }
2122
2123 int journal_file_move_to_entry_by_realtime_for_data(
2124                 JournalFile *f,
2125                 uint64_t data_offset,
2126                 uint64_t realtime,
2127                 direction_t direction,
2128                 Object **ret, uint64_t *offset) {
2129
2130         Object *d;
2131         int r;
2132
2133         assert(f);
2134
2135         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2136         if (r < 0)
2137                 return r;
2138
2139         return generic_array_bisect_plus_one(f,
2140                                              le64toh(d->data.entry_offset),
2141                                              le64toh(d->data.entry_array_offset),
2142                                              le64toh(d->data.n_entries),
2143                                              realtime,
2144                                              test_object_realtime,
2145                                              direction,
2146                                              ret, offset, NULL);
2147 }
2148
2149 void journal_file_dump(JournalFile *f) {
2150         Object *o;
2151         int r;
2152         uint64_t p;
2153
2154         assert(f);
2155
2156         journal_file_print_header(f);
2157
2158         p = le64toh(f->header->header_size);
2159         while (p != 0) {
2160                 r = journal_file_move_to_object(f, -1, p, &o);
2161                 if (r < 0)
2162                         goto fail;
2163
2164                 switch (o->object.type) {
2165
2166                 case OBJECT_UNUSED:
2167                         printf("Type: OBJECT_UNUSED\n");
2168                         break;
2169
2170                 case OBJECT_DATA:
2171                         printf("Type: OBJECT_DATA\n");
2172                         break;
2173
2174                 case OBJECT_FIELD:
2175                         printf("Type: OBJECT_FIELD\n");
2176                         break;
2177
2178                 case OBJECT_ENTRY:
2179                         printf("Type: OBJECT_ENTRY seqnum=%llu monotonic=%llu realtime=%llu\n",
2180                                (unsigned long long) le64toh(o->entry.seqnum),
2181                                (unsigned long long) le64toh(o->entry.monotonic),
2182                                (unsigned long long) le64toh(o->entry.realtime));
2183                         break;
2184
2185                 case OBJECT_FIELD_HASH_TABLE:
2186                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2187                         break;
2188
2189                 case OBJECT_DATA_HASH_TABLE:
2190                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2191                         break;
2192
2193                 case OBJECT_ENTRY_ARRAY:
2194                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2195                         break;
2196
2197                 case OBJECT_TAG:
2198                         printf("Type: OBJECT_TAG seqnum=%llu epoch=%llu\n",
2199                                (unsigned long long) le64toh(o->tag.seqnum),
2200                                (unsigned long long) le64toh(o->tag.epoch));
2201                         break;
2202
2203                 default:
2204                         printf("Type: unknown (%u)\n", o->object.type);
2205                         break;
2206                 }
2207
2208                 if (o->object.flags & OBJECT_COMPRESSED)
2209                         printf("Flags: COMPRESSED\n");
2210
2211                 if (p == le64toh(f->header->tail_object_offset))
2212                         p = 0;
2213                 else
2214                         p = p + ALIGN64(le64toh(o->object.size));
2215         }
2216
2217         return;
2218 fail:
2219         log_error("File corrupt");
2220 }
2221
2222 void journal_file_print_header(JournalFile *f) {
2223         char a[33], b[33], c[33];
2224         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
2225         struct stat st;
2226         char bytes[FORMAT_BYTES_MAX];
2227
2228         assert(f);
2229
2230         printf("File Path: %s\n"
2231                "File ID: %s\n"
2232                "Machine ID: %s\n"
2233                "Boot ID: %s\n"
2234                "Sequential Number ID: %s\n"
2235                "State: %s\n"
2236                "Compatible Flags:%s%s\n"
2237                "Incompatible Flags:%s%s\n"
2238                "Header size: %llu\n"
2239                "Arena size: %llu\n"
2240                "Data Hash Table Size: %llu\n"
2241                "Field Hash Table Size: %llu\n"
2242                "Rotate Suggested: %s\n"
2243                "Head Sequential Number: %llu\n"
2244                "Tail Sequential Number: %llu\n"
2245                "Head Realtime Timestamp: %s\n"
2246                "Tail Realtime Timestamp: %s\n"
2247                "Objects: %llu\n"
2248                "Entry Objects: %llu\n",
2249                f->path,
2250                sd_id128_to_string(f->header->file_id, a),
2251                sd_id128_to_string(f->header->machine_id, b),
2252                sd_id128_to_string(f->header->boot_id, c),
2253                sd_id128_to_string(f->header->seqnum_id, c),
2254                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2255                f->header->state == STATE_ONLINE ? "ONLINE" :
2256                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2257                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2258                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2259                JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2260                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2261                (unsigned long long) le64toh(f->header->header_size),
2262                (unsigned long long) le64toh(f->header->arena_size),
2263                (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2264                (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2265                yes_no(journal_file_rotate_suggested(f, 0)),
2266                (unsigned long long) le64toh(f->header->head_entry_seqnum),
2267                (unsigned long long) le64toh(f->header->tail_entry_seqnum),
2268                format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2269                format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2270                (unsigned long long) le64toh(f->header->n_objects),
2271                (unsigned long long) le64toh(f->header->n_entries));
2272
2273         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2274                 printf("Data Objects: %llu\n"
2275                        "Data Hash Table Fill: %.1f%%\n",
2276                        (unsigned long long) le64toh(f->header->n_data),
2277                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2278
2279         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2280                 printf("Field Objects: %llu\n"
2281                        "Field Hash Table Fill: %.1f%%\n",
2282                        (unsigned long long) le64toh(f->header->n_fields),
2283                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2284
2285         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2286                 printf("Tag Objects: %llu\n",
2287                        (unsigned long long) le64toh(f->header->n_tags));
2288         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2289                 printf("Entry Array Objects: %llu\n",
2290                        (unsigned long long) le64toh(f->header->n_entry_arrays));
2291
2292         if (fstat(f->fd, &st) >= 0)
2293                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2294 }
2295
2296 int journal_file_open(
2297                 const char *fname,
2298                 int flags,
2299                 mode_t mode,
2300                 bool compress,
2301                 bool seal,
2302                 JournalMetrics *metrics,
2303                 MMapCache *mmap_cache,
2304                 JournalFile *template,
2305                 JournalFile **ret) {
2306
2307         JournalFile *f;
2308         int r;
2309         bool newly_created = false;
2310
2311         assert(fname);
2312         assert(ret);
2313
2314         if ((flags & O_ACCMODE) != O_RDONLY &&
2315             (flags & O_ACCMODE) != O_RDWR)
2316                 return -EINVAL;
2317
2318         if (!endswith(fname, ".journal") &&
2319             !endswith(fname, ".journal~"))
2320                 return -EINVAL;
2321
2322         f = new0(JournalFile, 1);
2323         if (!f)
2324                 return -ENOMEM;
2325
2326         f->fd = -1;
2327         f->mode = mode;
2328
2329         f->flags = flags;
2330         f->prot = prot_from_flags(flags);
2331         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2332 #ifdef HAVE_XZ
2333         f->compress = compress;
2334 #endif
2335 #ifdef HAVE_GCRYPT
2336         f->seal = seal;
2337 #endif
2338
2339         if (mmap_cache)
2340                 f->mmap = mmap_cache_ref(mmap_cache);
2341         else {
2342                 f->mmap = mmap_cache_new();
2343                 if (!f->mmap) {
2344                         r = -ENOMEM;
2345                         goto fail;
2346                 }
2347         }
2348
2349         f->path = strdup(fname);
2350         if (!f->path) {
2351                 r = -ENOMEM;
2352                 goto fail;
2353         }
2354
2355         f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2356         if (!f->chain_cache) {
2357                 r = -ENOMEM;
2358                 goto fail;
2359         }
2360
2361         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2362         if (f->fd < 0) {
2363                 r = -errno;
2364                 goto fail;
2365         }
2366
2367         if (fstat(f->fd, &f->last_stat) < 0) {
2368                 r = -errno;
2369                 goto fail;
2370         }
2371
2372         if (f->last_stat.st_size == 0 && f->writable) {
2373 #ifdef HAVE_XATTR
2374                 uint64_t crtime;
2375
2376                 /* Let's attach the creation time to the journal file,
2377                  * so that the vacuuming code knows the age of this
2378                  * file even if the file might end up corrupted one
2379                  * day... Ideally we'd just use the creation time many
2380                  * file systems maintain for each file, but there is
2381                  * currently no usable API to query this, hence let's
2382                  * emulate this via extended attributes. If extended
2383                  * attributes are not supported we'll just skip this,
2384                  * and rely solely on mtime/atime/ctime of the file.*/
2385
2386                 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2387                 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2388 #endif
2389
2390 #ifdef HAVE_GCRYPT
2391                 /* Try to load the FSPRG state, and if we can't, then
2392                  * just don't do sealing */
2393                 if (f->seal) {
2394                         r = journal_file_fss_load(f);
2395                         if (r < 0)
2396                                 f->seal = false;
2397                 }
2398 #endif
2399
2400                 r = journal_file_init_header(f, template);
2401                 if (r < 0)
2402                         goto fail;
2403
2404                 if (fstat(f->fd, &f->last_stat) < 0) {
2405                         r = -errno;
2406                         goto fail;
2407                 }
2408
2409                 newly_created = true;
2410         }
2411
2412         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2413                 r = -EIO;
2414                 goto fail;
2415         }
2416
2417         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2418         if (f->header == MAP_FAILED) {
2419                 f->header = NULL;
2420                 r = -errno;
2421                 goto fail;
2422         }
2423
2424         if (!newly_created) {
2425                 r = journal_file_verify_header(f);
2426                 if (r < 0)
2427                         goto fail;
2428         }
2429
2430 #ifdef HAVE_GCRYPT
2431         if (!newly_created && f->writable) {
2432                 r = journal_file_fss_load(f);
2433                 if (r < 0)
2434                         goto fail;
2435         }
2436 #endif
2437
2438         if (f->writable) {
2439                 if (metrics) {
2440                         journal_default_metrics(metrics, f->fd);
2441                         f->metrics = *metrics;
2442                 } else if (template)
2443                         f->metrics = template->metrics;
2444
2445                 r = journal_file_refresh_header(f);
2446                 if (r < 0)
2447                         goto fail;
2448         }
2449
2450 #ifdef HAVE_GCRYPT
2451         r = journal_file_hmac_setup(f);
2452         if (r < 0)
2453                 goto fail;
2454 #endif
2455
2456         if (newly_created) {
2457                 r = journal_file_setup_field_hash_table(f);
2458                 if (r < 0)
2459                         goto fail;
2460
2461                 r = journal_file_setup_data_hash_table(f);
2462                 if (r < 0)
2463                         goto fail;
2464
2465 #ifdef HAVE_GCRYPT
2466                 r = journal_file_append_first_tag(f);
2467                 if (r < 0)
2468                         goto fail;
2469 #endif
2470         }
2471
2472         r = journal_file_map_field_hash_table(f);
2473         if (r < 0)
2474                 goto fail;
2475
2476         r = journal_file_map_data_hash_table(f);
2477         if (r < 0)
2478                 goto fail;
2479
2480         *ret = f;
2481         return 0;
2482
2483 fail:
2484         journal_file_close(f);
2485
2486         return r;
2487 }
2488
2489 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2490         char *p;
2491         size_t l;
2492         JournalFile *old_file, *new_file = NULL;
2493         int r;
2494
2495         assert(f);
2496         assert(*f);
2497
2498         old_file = *f;
2499
2500         if (!old_file->writable)
2501                 return -EINVAL;
2502
2503         if (!endswith(old_file->path, ".journal"))
2504                 return -EINVAL;
2505
2506         l = strlen(old_file->path);
2507
2508         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
2509         if (!p)
2510                 return -ENOMEM;
2511
2512         memcpy(p, old_file->path, l - 8);
2513         p[l-8] = '@';
2514         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2515         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2516                  "-%016llx-%016llx.journal",
2517                  (unsigned long long) le64toh((*f)->header->head_entry_seqnum),
2518                  (unsigned long long) le64toh((*f)->header->head_entry_realtime));
2519
2520         r = rename(old_file->path, p);
2521         free(p);
2522
2523         if (r < 0)
2524                 return -errno;
2525
2526         old_file->header->state = STATE_ARCHIVED;
2527
2528         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2529         journal_file_close(old_file);
2530
2531         *f = new_file;
2532         return r;
2533 }
2534
2535 int journal_file_open_reliably(
2536                 const char *fname,
2537                 int flags,
2538                 mode_t mode,
2539                 bool compress,
2540                 bool seal,
2541                 JournalMetrics *metrics,
2542                 MMapCache *mmap_cache,
2543                 JournalFile *template,
2544                 JournalFile **ret) {
2545
2546         int r;
2547         size_t l;
2548         char *p;
2549
2550         r = journal_file_open(fname, flags, mode, compress, seal,
2551                               metrics, mmap_cache, template, ret);
2552         if (r != -EBADMSG && /* corrupted */
2553             r != -ENODATA && /* truncated */
2554             r != -EHOSTDOWN && /* other machine */
2555             r != -EPROTONOSUPPORT && /* incompatible feature */
2556             r != -EBUSY && /* unclean shutdown */
2557             r != -ESHUTDOWN /* already archived */)
2558                 return r;
2559
2560         if ((flags & O_ACCMODE) == O_RDONLY)
2561                 return r;
2562
2563         if (!(flags & O_CREAT))
2564                 return r;
2565
2566         if (!endswith(fname, ".journal"))
2567                 return r;
2568
2569         /* The file is corrupted. Rotate it away and try it again (but only once) */
2570
2571         l = strlen(fname);
2572         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2573                      (int) (l-8), fname,
2574                      (unsigned long long) now(CLOCK_REALTIME),
2575                      random_ull()) < 0)
2576                 return -ENOMEM;
2577
2578         r = rename(fname, p);
2579         free(p);
2580         if (r < 0)
2581                 return -errno;
2582
2583         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2584
2585         return journal_file_open(fname, flags, mode, compress, seal,
2586                                  metrics, mmap_cache, template, ret);
2587 }
2588
2589
2590 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2591         uint64_t i, n;
2592         uint64_t q, xor_hash = 0;
2593         int r;
2594         EntryItem *items;
2595         dual_timestamp ts;
2596
2597         assert(from);
2598         assert(to);
2599         assert(o);
2600         assert(p);
2601
2602         if (!to->writable)
2603                 return -EPERM;
2604
2605         ts.monotonic = le64toh(o->entry.monotonic);
2606         ts.realtime = le64toh(o->entry.realtime);
2607
2608         if (to->tail_entry_monotonic_valid &&
2609             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2610                 return -EINVAL;
2611
2612         n = journal_file_entry_n_items(o);
2613         items = alloca(sizeof(EntryItem) * n);
2614
2615         for (i = 0; i < n; i++) {
2616                 uint64_t l, h;
2617                 le64_t le_hash;
2618                 size_t t;
2619                 void *data;
2620                 Object *u;
2621
2622                 q = le64toh(o->entry.items[i].object_offset);
2623                 le_hash = o->entry.items[i].hash;
2624
2625                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2626                 if (r < 0)
2627                         return r;
2628
2629                 if (le_hash != o->data.hash)
2630                         return -EBADMSG;
2631
2632                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2633                 t = (size_t) l;
2634
2635                 /* We hit the limit on 32bit machines */
2636                 if ((uint64_t) t != l)
2637                         return -E2BIG;
2638
2639                 if (o->object.flags & OBJECT_COMPRESSED) {
2640 #ifdef HAVE_XZ
2641                         uint64_t rsize;
2642
2643                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2644                                 return -EBADMSG;
2645
2646                         data = from->compress_buffer;
2647                         l = rsize;
2648 #else
2649                         return -EPROTONOSUPPORT;
2650 #endif
2651                 } else
2652                         data = o->data.payload;
2653
2654                 r = journal_file_append_data(to, data, l, &u, &h);
2655                 if (r < 0)
2656                         return r;
2657
2658                 xor_hash ^= le64toh(u->data.hash);
2659                 items[i].object_offset = htole64(h);
2660                 items[i].hash = u->data.hash;
2661
2662                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2663                 if (r < 0)
2664                         return r;
2665         }
2666
2667         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2668 }
2669
2670 void journal_default_metrics(JournalMetrics *m, int fd) {
2671         uint64_t fs_size = 0;
2672         struct statvfs ss;
2673         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2674
2675         assert(m);
2676         assert(fd >= 0);
2677
2678         if (fstatvfs(fd, &ss) >= 0)
2679                 fs_size = ss.f_frsize * ss.f_blocks;
2680
2681         if (m->max_use == (uint64_t) -1) {
2682
2683                 if (fs_size > 0) {
2684                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2685
2686                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2687                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2688
2689                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2690                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2691                 } else
2692                         m->max_use = DEFAULT_MAX_USE_LOWER;
2693         } else {
2694                 m->max_use = PAGE_ALIGN(m->max_use);
2695
2696                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2697                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2698         }
2699
2700         if (m->max_size == (uint64_t) -1) {
2701                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2702
2703                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2704                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2705         } else
2706                 m->max_size = PAGE_ALIGN(m->max_size);
2707
2708         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2709                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2710
2711         if (m->max_size*2 > m->max_use)
2712                 m->max_use = m->max_size*2;
2713
2714         if (m->min_size == (uint64_t) -1)
2715                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2716         else {
2717                 m->min_size = PAGE_ALIGN(m->min_size);
2718
2719                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2720                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2721
2722                 if (m->min_size > m->max_size)
2723                         m->max_size = m->min_size;
2724         }
2725
2726         if (m->keep_free == (uint64_t) -1) {
2727
2728                 if (fs_size > 0) {
2729                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2730
2731                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2732                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2733
2734                 } else
2735                         m->keep_free = DEFAULT_KEEP_FREE;
2736         }
2737
2738         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2739                   format_bytes(a, sizeof(a), m->max_use),
2740                   format_bytes(b, sizeof(b), m->max_size),
2741                   format_bytes(c, sizeof(c), m->min_size),
2742                   format_bytes(d, sizeof(d), m->keep_free));
2743 }
2744
2745 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2746         assert(f);
2747         assert(from || to);
2748
2749         if (from) {
2750                 if (f->header->head_entry_realtime == 0)
2751                         return -ENOENT;
2752
2753                 *from = le64toh(f->header->head_entry_realtime);
2754         }
2755
2756         if (to) {
2757                 if (f->header->tail_entry_realtime == 0)
2758                         return -ENOENT;
2759
2760                 *to = le64toh(f->header->tail_entry_realtime);
2761         }
2762
2763         return 1;
2764 }
2765
2766 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2767         char t[9+32+1] = "_BOOT_ID=";
2768         Object *o;
2769         uint64_t p;
2770         int r;
2771
2772         assert(f);
2773         assert(from || to);
2774
2775         sd_id128_to_string(boot_id, t + 9);
2776
2777         r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
2778         if (r <= 0)
2779                 return r;
2780
2781         if (le64toh(o->data.n_entries) <= 0)
2782                 return 0;
2783
2784         if (from) {
2785                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2786                 if (r < 0)
2787                         return r;
2788
2789                 *from = le64toh(o->entry.monotonic);
2790         }
2791
2792         if (to) {
2793                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2794                 if (r < 0)
2795                         return r;
2796
2797                 r = generic_array_get_plus_one(f,
2798                                                le64toh(o->data.entry_offset),
2799                                                le64toh(o->data.entry_array_offset),
2800                                                le64toh(o->data.n_entries)-1,
2801                                                &o, NULL);
2802                 if (r <= 0)
2803                         return r;
2804
2805                 *to = le64toh(o->entry.monotonic);
2806         }
2807
2808         return 1;
2809 }
2810
2811 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2812         assert(f);
2813
2814         /* If we gained new header fields we gained new features,
2815          * hence suggest a rotation */
2816         if (le64toh(f->header->header_size) < sizeof(Header)) {
2817                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2818                 return true;
2819         }
2820
2821         /* Let's check if the hash tables grew over a certain fill
2822          * level (75%, borrowing this value from Java's hash table
2823          * implementation), and if so suggest a rotation. To calculate
2824          * the fill level we need the n_data field, which only exists
2825          * in newer versions. */
2826
2827         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2828                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2829                         log_debug("Data hash table of %s has a fill level at %.1f (%llu of %llu items, %llu file size, %llu bytes per hash table item), suggesting rotation.",
2830                                   f->path,
2831                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2832                                   (unsigned long long) le64toh(f->header->n_data),
2833                                   (unsigned long long) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)),
2834                                   (unsigned long long) (f->last_stat.st_size),
2835                                   (unsigned long long) (f->last_stat.st_size / le64toh(f->header->n_data)));
2836                         return true;
2837                 }
2838
2839         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2840                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2841                         log_debug("Field hash table of %s has a fill level at %.1f (%llu of %llu items), suggesting rotation.",
2842                                   f->path,
2843                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2844                                   (unsigned long long) le64toh(f->header->n_fields),
2845                                   (unsigned long long) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)));
2846                         return true;
2847                 }
2848
2849         /* Are the data objects properly indexed by field objects? */
2850         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2851             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2852             le64toh(f->header->n_data) > 0 &&
2853             le64toh(f->header->n_fields) == 0)
2854                 return true;
2855
2856         if (max_file_usec > 0) {
2857                 usec_t t, h;
2858
2859                 h = le64toh(f->header->head_entry_realtime);
2860                 t = now(CLOCK_REALTIME);
2861
2862                 if (h > 0 && t > h + max_file_usec)
2863                         return true;
2864         }
2865
2866         return false;
2867 }