chiark / gitweb /
journal: special case the trivial cache chain cache entry
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #ifdef HAVE_XATTR
31 #include <attr/xattr.h>
32 #endif
33
34 #include "journal-def.h"
35 #include "journal-file.h"
36 #include "journal-authenticate.h"
37 #include "lookup3.h"
38 #include "compress.h"
39 #include "fsprg.h"
40
41 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))   /* in bytes: room for 2047 hash items */
42 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))   /* in bytes: room for 333 hash items */
43
44 #define COMPRESSION_SIZE_THRESHOLD (512ULL)                    /* payloads of at least this many bytes may be compressed */
45
46 /* This is the minimum journal file size */
47 #define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL)                  /* 64 KiB */
48
49 /* These are the lower and upper bounds if we deduce the max_use value
50  * from the file system size */
51 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
52 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
53
54 /* This is the upper bound if we deduce max_size from max_use */
55 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
56
57 /* This is the upper bound if we deduce the keep_free value from the
58  * file system size */
59 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
60
61 /* This is the keep_free value when we can't determine the file system
62  * size */
63 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */
64
65 /* n_data was the first entry we added after the initial file format design */
66 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
67
68 /* How many entries to keep in the entry array chain cache at max */
69 #define CHAIN_CACHE_MAX 20
70
71 void journal_file_close(JournalFile *f) {
72         assert(f);
73
74 #ifdef HAVE_GCRYPT
75         /* Write the final tag */
76         if (f->seal && f->writable)
77                 journal_file_append_tag(f);
78 #endif
79
80         /* Sync everything to disk, before we mark the file offline */
81         if (f->mmap && f->fd >= 0)
82                 mmap_cache_close_fd(f->mmap, f->fd);
83
84         if (f->writable && f->fd >= 0)
85                 fdatasync(f->fd);
86
87         if (f->header) {
88                 /* Mark the file offline. Don't override the archived state if it already is set */
89                 if (f->writable && f->header->state == STATE_ONLINE)
90                         f->header->state = STATE_OFFLINE;
91
92                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
93         }
94
95         if (f->fd >= 0)
96                 close_nointr_nofail(f->fd);
97
98         free(f->path);
99
100         if (f->mmap)
101                 mmap_cache_unref(f->mmap);
102
103         hashmap_free_free(f->chain_cache);
104
105 #ifdef HAVE_XZ
106         free(f->compress_buffer);
107 #endif
108
109 #ifdef HAVE_GCRYPT
110         if (f->fss_file)
111                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
112         else if (f->fsprg_state)
113                 free(f->fsprg_state);
114
115         free(f->fsprg_seed);
116
117         if (f->hmac)
118                 gcry_md_close(f->hmac);
119 #endif
120
121         free(f);
122 }
123
124 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
125         Header h;
126         ssize_t k;
127         int r;
128
129         assert(f);
130
131         zero(h);
132         memcpy(h.signature, HEADER_SIGNATURE, 8);
133         h.header_size = htole64(ALIGN64(sizeof(h)));
134
135         h.incompatible_flags =
136                 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
137
138         h.compatible_flags =
139                 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
140
141         r = sd_id128_randomize(&h.file_id);
142         if (r < 0)
143                 return r;
144
145         if (template) {
146                 h.seqnum_id = template->header->seqnum_id;
147                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
148         } else
149                 h.seqnum_id = h.file_id;
150
151         k = pwrite(f->fd, &h, sizeof(h), 0);
152         if (k < 0)
153                 return -errno;
154
155         if (k != sizeof(h))
156                 return -EIO;
157
158         return 0;
159 }
160
161 static int journal_file_refresh_header(JournalFile *f) {
162         int r;
163         sd_id128_t boot_id;
164
165         assert(f);
166
167         r = sd_id128_get_machine(&f->header->machine_id);
168         if (r < 0)
169                 return r;
170
171         r = sd_id128_get_boot(&boot_id);
172         if (r < 0)
173                 return r;
174
175         if (sd_id128_equal(boot_id, f->header->boot_id))
176                 f->tail_entry_monotonic_valid = true;
177
178         f->header->boot_id = boot_id;
179
180         f->header->state = STATE_ONLINE;
181
182         /* Sync the online state to disk */
183         msync(f->header, PAGE_ALIGN(sizeof(Header)), MS_SYNC);
184         fdatasync(f->fd);
185
186         return 0;
187 }
188
189 static int journal_file_verify_header(JournalFile *f) {
190         assert(f);
191
192         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
193                 return -EBADMSG;
194
195         /* In both read and write mode we refuse to open files with
196          * incompatible flags we don't know */
197 #ifdef HAVE_XZ
198         if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
199                 return -EPROTONOSUPPORT;
200 #else
201         if (f->header->incompatible_flags != 0)
202                 return -EPROTONOSUPPORT;
203 #endif
204
205         /* When open for writing we refuse to open files with
206          * compatible flags, too */
207         if (f->writable) {
208 #ifdef HAVE_GCRYPT
209                 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
210                         return -EPROTONOSUPPORT;
211 #else
212                 if (f->header->compatible_flags != 0)
213                         return -EPROTONOSUPPORT;
214 #endif
215         }
216
217         if (f->header->state >= _STATE_MAX)
218                 return -EBADMSG;
219
220         /* The first addition was n_data, so check that we are at least this large */
221         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
222                 return -EBADMSG;
223
224         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
225                 return -EBADMSG;
226
227         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
228                 return -ENODATA;
229
230         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
231                 return -ENODATA;
232
233         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
234             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
235             !VALID64(le64toh(f->header->tail_object_offset)) ||
236             !VALID64(le64toh(f->header->entry_array_offset)))
237                 return -ENODATA;
238
239         if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
240             le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
241             le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
242             le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
243                 return -ENODATA;
244
245         if (f->writable) {
246                 uint8_t state;
247                 sd_id128_t machine_id;
248                 int r;
249
250                 r = sd_id128_get_machine(&machine_id);
251                 if (r < 0)
252                         return r;
253
254                 if (!sd_id128_equal(machine_id, f->header->machine_id))
255                         return -EHOSTDOWN;
256
257                 state = f->header->state;
258
259                 if (state == STATE_ONLINE) {
260                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
261                         return -EBUSY;
262                 } else if (state == STATE_ARCHIVED)
263                         return -ESHUTDOWN;
264                 else if (state != STATE_OFFLINE) {
265                         log_debug("Journal file %s has unknown state %u.", f->path, state);
266                         return -EBUSY;
267                 }
268         }
269
270         f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
271
272         f->seal = JOURNAL_HEADER_SEALED(f->header);
273
274         return 0;
275 }
276
277 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
278         uint64_t old_size, new_size;
279         int r;
280
281         assert(f);
282
283         /* We assume that this file is not sparse, and we know that
284          * for sure, since we always call posix_fallocate()
285          * ourselves */
286
287         old_size =
288                 le64toh(f->header->header_size) +
289                 le64toh(f->header->arena_size);
290
291         new_size = PAGE_ALIGN(offset + size);
292         if (new_size < le64toh(f->header->header_size))
293                 new_size = le64toh(f->header->header_size);
294
295         if (new_size <= old_size)
296                 return 0;
297
298         if (f->metrics.max_size > 0 &&
299             new_size > f->metrics.max_size)
300                 return -E2BIG;
301
302         if (new_size > f->metrics.min_size &&
303             f->metrics.keep_free > 0) {
304                 struct statvfs svfs;
305
306                 if (fstatvfs(f->fd, &svfs) >= 0) {
307                         uint64_t available;
308
309                         available = svfs.f_bfree * svfs.f_bsize;
310
311                         if (available >= f->metrics.keep_free)
312                                 available -= f->metrics.keep_free;
313                         else
314                                 available = 0;
315
316                         if (new_size - old_size > available)
317                                 return -E2BIG;
318                 }
319         }
320
321         /* Note that the glibc fallocate() fallback is very
322            inefficient, hence we try to minimize the allocation area
323            as we can. */
324         r = posix_fallocate(f->fd, old_size, new_size - old_size);
325         if (r != 0)
326                 return -r;
327
328         if (fstat(f->fd, &f->last_stat) < 0)
329                 return -errno;
330
331         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
332
333         return 0;
334 }
335
336 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
337         assert(f);
338         assert(ret);
339
340         if (size <= 0)
341                 return -EINVAL;
342
343         /* Avoid SIGBUS on invalid accesses */
344         if (offset + size > (uint64_t) f->last_stat.st_size) {
345                 /* Hmm, out of range? Let's refresh the fstat() data
346                  * first, before we trust that check. */
347
348                 if (fstat(f->fd, &f->last_stat) < 0 ||
349                     offset + size > (uint64_t) f->last_stat.st_size)
350                         return -EADDRNOTAVAIL;
351         }
352
353         return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
354 }
355
356 static uint64_t minimum_header_size(Object *o) {
357
358         static uint64_t table[] = {
359                 [OBJECT_DATA] = sizeof(DataObject),
360                 [OBJECT_FIELD] = sizeof(FieldObject),
361                 [OBJECT_ENTRY] = sizeof(EntryObject),
362                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
363                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
364                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
365                 [OBJECT_TAG] = sizeof(TagObject),
366         };
367
368         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
369                 return sizeof(ObjectHeader);
370
371         return table[o->object.type];
372 }
373
374 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
375         int r;
376         void *t;
377         Object *o;
378         uint64_t s;
379         unsigned context;
380
381         assert(f);
382         assert(ret);
383
384         /* Objects may only be located at multiple of 64 bit */
385         if (!VALID64(offset))
386                 return -EFAULT;
387
388         /* One context for each type, plus one catch-all for the rest */
389         context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
390
391         r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
392         if (r < 0)
393                 return r;
394
395         o = (Object*) t;
396         s = le64toh(o->object.size);
397
398         if (s < sizeof(ObjectHeader))
399                 return -EBADMSG;
400
401         if (o->object.type <= OBJECT_UNUSED)
402                 return -EBADMSG;
403
404         if (s < minimum_header_size(o))
405                 return -EBADMSG;
406
407         if (type > 0 && o->object.type != type)
408                 return -EBADMSG;
409
410         if (s > sizeof(ObjectHeader)) {
411                 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
412                 if (r < 0)
413                         return r;
414
415                 o = (Object*) t;
416         }
417
418         *ret = o;
419         return 0;
420 }
421
422 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
423         uint64_t r;
424
425         assert(f);
426
427         r = le64toh(f->header->tail_entry_seqnum) + 1;
428
429         if (seqnum) {
430                 /* If an external seqnum counter was passed, we update
431                  * both the local and the external one, and set it to
432                  * the maximum of both */
433
434                 if (*seqnum + 1 > r)
435                         r = *seqnum + 1;
436
437                 *seqnum = r;
438         }
439
440         f->header->tail_entry_seqnum = htole64(r);
441
442         if (f->header->head_entry_seqnum == 0)
443                 f->header->head_entry_seqnum = htole64(r);
444
445         return r;
446 }
447
448 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
449         int r;
450         uint64_t p;
451         Object *tail, *o;
452         void *t;
453
454         assert(f);
455         assert(type > 0 && type < _OBJECT_TYPE_MAX);
456         assert(size >= sizeof(ObjectHeader));
457         assert(offset);
458         assert(ret);
459
460         p = le64toh(f->header->tail_object_offset);
461         if (p == 0)
462                 p = le64toh(f->header->header_size);
463         else {
464                 r = journal_file_move_to_object(f, -1, p, &tail);
465                 if (r < 0)
466                         return r;
467
468                 p += ALIGN64(le64toh(tail->object.size));
469         }
470
471         r = journal_file_allocate(f, p, size);
472         if (r < 0)
473                 return r;
474
475         r = journal_file_move_to(f, type, false, p, size, &t);
476         if (r < 0)
477                 return r;
478
479         o = (Object*) t;
480
481         zero(o->object);
482         o->object.type = type;
483         o->object.size = htole64(size);
484
485         f->header->tail_object_offset = htole64(p);
486         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
487
488         *ret = o;
489         *offset = p;
490
491         return 0;
492 }
493
494 static int journal_file_setup_data_hash_table(JournalFile *f) {
495         uint64_t s, p;
496         Object *o;
497         int r;
498
499         assert(f);
500
501         /* We estimate that we need 1 hash table entry per 768 of
502            journal file and we want to make sure we never get beyond
503            75% fill level. Calculate the hash table size for the
504            maximum file size based on these metrics. */
505
506         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
507         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
508                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
509
510         log_debug("Reserving %llu entries in hash table.", (unsigned long long) (s / sizeof(HashItem)));
511
512         r = journal_file_append_object(f,
513                                        OBJECT_DATA_HASH_TABLE,
514                                        offsetof(Object, hash_table.items) + s,
515                                        &o, &p);
516         if (r < 0)
517                 return r;
518
519         memset(o->hash_table.items, 0, s);
520
521         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
522         f->header->data_hash_table_size = htole64(s);
523
524         return 0;
525 }
526
527 static int journal_file_setup_field_hash_table(JournalFile *f) {
528         uint64_t s, p;
529         Object *o;
530         int r;
531
532         assert(f);
533
534         /* We use a fixed size hash table for the fields as this
535          * number should grow very slowly only */
536
537         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
538         r = journal_file_append_object(f,
539                                        OBJECT_FIELD_HASH_TABLE,
540                                        offsetof(Object, hash_table.items) + s,
541                                        &o, &p);
542         if (r < 0)
543                 return r;
544
545         memset(o->hash_table.items, 0, s);
546
547         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
548         f->header->field_hash_table_size = htole64(s);
549
550         return 0;
551 }
552
553 static int journal_file_map_data_hash_table(JournalFile *f) {
554         uint64_t s, p;
555         void *t;
556         int r;
557
558         assert(f);
559
560         p = le64toh(f->header->data_hash_table_offset);
561         s = le64toh(f->header->data_hash_table_size);
562
563         r = journal_file_move_to(f,
564                                  OBJECT_DATA_HASH_TABLE,
565                                  true,
566                                  p, s,
567                                  &t);
568         if (r < 0)
569                 return r;
570
571         f->data_hash_table = t;
572         return 0;
573 }
574
575 static int journal_file_map_field_hash_table(JournalFile *f) {
576         uint64_t s, p;
577         void *t;
578         int r;
579
580         assert(f);
581
582         p = le64toh(f->header->field_hash_table_offset);
583         s = le64toh(f->header->field_hash_table_size);
584
585         r = journal_file_move_to(f,
586                                  OBJECT_FIELD_HASH_TABLE,
587                                  true,
588                                  p, s,
589                                  &t);
590         if (r < 0)
591                 return r;
592
593         f->field_hash_table = t;
594         return 0;
595 }
596
597 static int journal_file_link_field(
598                 JournalFile *f,
599                 Object *o,
600                 uint64_t offset,
601                 uint64_t hash) {
602
603         uint64_t p, h;
604         int r;
605
606         assert(f);
607         assert(o);
608         assert(offset > 0);
609
610         if (o->object.type != OBJECT_FIELD)
611                 return -EINVAL;
612
613         /* This might alter the window we are looking at */
614
615         o->field.next_hash_offset = o->field.head_data_offset = 0;
616
617         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
618         p = le64toh(f->field_hash_table[h].tail_hash_offset);
619         if (p == 0)
620                 f->field_hash_table[h].head_hash_offset = htole64(offset);
621         else {
622                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
623                 if (r < 0)
624                         return r;
625
626                 o->field.next_hash_offset = htole64(offset);
627         }
628
629         f->field_hash_table[h].tail_hash_offset = htole64(offset);
630
631         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
632                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
633
634         return 0;
635 }
636
637 static int journal_file_link_data(
638                 JournalFile *f,
639                 Object *o,
640                 uint64_t offset,
641                 uint64_t hash) {
642
643         uint64_t p, h;
644         int r;
645
646         assert(f);
647         assert(o);
648         assert(offset > 0);
649
650         if (o->object.type != OBJECT_DATA)
651                 return -EINVAL;
652
653         /* This might alter the window we are looking at */
654
655         o->data.next_hash_offset = o->data.next_field_offset = 0;
656         o->data.entry_offset = o->data.entry_array_offset = 0;
657         o->data.n_entries = 0;
658
659         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
660         p = le64toh(f->data_hash_table[h].tail_hash_offset);
661         if (p == 0)
662                 /* Only entry in the hash table is easy */
663                 f->data_hash_table[h].head_hash_offset = htole64(offset);
664         else {
665                 /* Move back to the previous data object, to patch in
666                  * pointer */
667
668                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
669                 if (r < 0)
670                         return r;
671
672                 o->data.next_hash_offset = htole64(offset);
673         }
674
675         f->data_hash_table[h].tail_hash_offset = htole64(offset);
676
677         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
678                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
679
680         return 0;
681 }
682
683 int journal_file_find_field_object_with_hash(
684                 JournalFile *f,
685                 const void *field, uint64_t size, uint64_t hash,
686                 Object **ret, uint64_t *offset) {
687
688         uint64_t p, osize, h;
689         int r;
690
691         assert(f);
692         assert(field && size > 0);
693
694         osize = offsetof(Object, field.payload) + size;
695
696         if (f->header->field_hash_table_size == 0)
697                 return -EBADMSG;
698
699         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
700         p = le64toh(f->field_hash_table[h].head_hash_offset);
701
702         while (p > 0) {
703                 Object *o;
704
705                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
706                 if (r < 0)
707                         return r;
708
709                 if (le64toh(o->field.hash) == hash &&
710                     le64toh(o->object.size) == osize &&
711                     memcmp(o->field.payload, field, size) == 0) {
712
713                         if (ret)
714                                 *ret = o;
715                         if (offset)
716                                 *offset = p;
717
718                         return 1;
719                 }
720
721                 p = le64toh(o->field.next_hash_offset);
722         }
723
724         return 0;
725 }
726
727 int journal_file_find_field_object(
728                 JournalFile *f,
729                 const void *field, uint64_t size,
730                 Object **ret, uint64_t *offset) {
731
732         uint64_t hash;
733
734         assert(f);
735         assert(field && size > 0);
736
737         hash = hash64(field, size);
738
739         return journal_file_find_field_object_with_hash(f,
740                                                         field, size, hash,
741                                                         ret, offset);
742 }
743
744 int journal_file_find_data_object_with_hash(
745                 JournalFile *f,
746                 const void *data, uint64_t size, uint64_t hash,
747                 Object **ret, uint64_t *offset) {
748
749         uint64_t p, osize, h;
750         int r;
751
752         assert(f);
753         assert(data || size == 0);
754
755         osize = offsetof(Object, data.payload) + size;
756
757         if (f->header->data_hash_table_size == 0)
758                 return -EBADMSG;
759
760         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
761         p = le64toh(f->data_hash_table[h].head_hash_offset);
762
763         while (p > 0) {
764                 Object *o;
765
766                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
767                 if (r < 0)
768                         return r;
769
770                 if (le64toh(o->data.hash) != hash)
771                         goto next;
772
773                 if (o->object.flags & OBJECT_COMPRESSED) {
774 #ifdef HAVE_XZ
775                         uint64_t l, rsize;
776
777                         l = le64toh(o->object.size);
778                         if (l <= offsetof(Object, data.payload))
779                                 return -EBADMSG;
780
781                         l -= offsetof(Object, data.payload);
782
783                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
784                                 return -EBADMSG;
785
786                         if (rsize == size &&
787                             memcmp(f->compress_buffer, data, size) == 0) {
788
789                                 if (ret)
790                                         *ret = o;
791
792                                 if (offset)
793                                         *offset = p;
794
795                                 return 1;
796                         }
797 #else
798                         return -EPROTONOSUPPORT;
799 #endif
800
801                 } else if (le64toh(o->object.size) == osize &&
802                            memcmp(o->data.payload, data, size) == 0) {
803
804                         if (ret)
805                                 *ret = o;
806
807                         if (offset)
808                                 *offset = p;
809
810                         return 1;
811                 }
812
813         next:
814                 p = le64toh(o->data.next_hash_offset);
815         }
816
817         return 0;
818 }
819
820 int journal_file_find_data_object(
821                 JournalFile *f,
822                 const void *data, uint64_t size,
823                 Object **ret, uint64_t *offset) {
824
825         uint64_t hash;
826
827         assert(f);
828         assert(data || size == 0);
829
830         hash = hash64(data, size);
831
832         return journal_file_find_data_object_with_hash(f,
833                                                        data, size, hash,
834                                                        ret, offset);
835 }
836
837 static int journal_file_append_field(
838                 JournalFile *f,
839                 const void *field, uint64_t size,
840                 Object **ret, uint64_t *offset) {
841
842         uint64_t hash, p;
843         uint64_t osize;
844         Object *o;
845         int r;
846
847         assert(f);
848         assert(field && size > 0);
849
850         hash = hash64(field, size);
851
852         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
853         if (r < 0)
854                 return r;
855         else if (r > 0) {
856
857                 if (ret)
858                         *ret = o;
859
860                 if (offset)
861                         *offset = p;
862
863                 return 0;
864         }
865
866         osize = offsetof(Object, field.payload) + size;
867         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
868
869         o->field.hash = htole64(hash);
870         memcpy(o->field.payload, field, size);
871
872         r = journal_file_link_field(f, o, p, hash);
873         if (r < 0)
874                 return r;
875
876         /* The linking might have altered the window, so let's
877          * refresh our pointer */
878         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
879         if (r < 0)
880                 return r;
881
882 #ifdef HAVE_GCRYPT
883         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
884         if (r < 0)
885                 return r;
886 #endif
887
888         if (ret)
889                 *ret = o;
890
891         if (offset)
892                 *offset = p;
893
894         return 0;
895 }
896
897 static int journal_file_append_data(
898                 JournalFile *f,
899                 const void *data, uint64_t size,
900                 Object **ret, uint64_t *offset) {
901
902         uint64_t hash, p;
903         uint64_t osize;
904         Object *o;
905         int r;
906         bool compressed = false;
907         const void *eq;
908
909         assert(f);
910         assert(data || size == 0);
911
912         hash = hash64(data, size);
913
914         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
915         if (r < 0)
916                 return r;
917         else if (r > 0) {
918
919                 if (ret)
920                         *ret = o;
921
922                 if (offset)
923                         *offset = p;
924
925                 return 0;
926         }
927
928         osize = offsetof(Object, data.payload) + size;
929         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
930         if (r < 0)
931                 return r;
932
933         o->data.hash = htole64(hash);
934
935 #ifdef HAVE_XZ
936         if (f->compress &&
937             size >= COMPRESSION_SIZE_THRESHOLD) {
938                 uint64_t rsize;
939
940                 compressed = compress_blob(data, size, o->data.payload, &rsize);
941
942                 if (compressed) {
943                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
944                         o->object.flags |= OBJECT_COMPRESSED;
945
946                         log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
947                 }
948         }
949 #endif
950
951         if (!compressed && size > 0)
952                 memcpy(o->data.payload, data, size);
953
954         r = journal_file_link_data(f, o, p, hash);
955         if (r < 0)
956                 return r;
957
958         /* The linking might have altered the window, so let's
959          * refresh our pointer */
960         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
961         if (r < 0)
962                 return r;
963
964         eq = memchr(data, '=', size);
965         if (eq && eq > data) {
966                 uint64_t fp;
967                 Object *fo;
968
969                 /* Create field object ... */
970                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
971                 if (r < 0)
972                         return r;
973
974                 /* ... and link it in. */
975                 o->data.next_field_offset = fo->field.head_data_offset;
976                 fo->field.head_data_offset = le64toh(p);
977         }
978
979 #ifdef HAVE_GCRYPT
980         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
981         if (r < 0)
982                 return r;
983 #endif
984
985         if (ret)
986                 *ret = o;
987
988         if (offset)
989                 *offset = p;
990
991         return 0;
992 }
993
994 uint64_t journal_file_entry_n_items(Object *o) {
995         assert(o);
996
997         if (o->object.type != OBJECT_ENTRY)
998                 return 0;
999
1000         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1001 }
1002
1003 uint64_t journal_file_entry_array_n_items(Object *o) {
1004         assert(o);
1005
1006         if (o->object.type != OBJECT_ENTRY_ARRAY)
1007                 return 0;
1008
1009         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1010 }
1011
1012 uint64_t journal_file_hash_table_n_items(Object *o) {
1013         assert(o);
1014
1015         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1016             o->object.type != OBJECT_FIELD_HASH_TABLE)
1017                 return 0;
1018
1019         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1020 }
1021
1022 static int link_entry_into_array(JournalFile *f,
1023                                  le64_t *first,
1024                                  le64_t *idx,
1025                                  uint64_t p) {
1026         int r;
1027         uint64_t n = 0, ap = 0, q, i, a, hidx;
1028         Object *o;
1029
1030         assert(f);
1031         assert(first);
1032         assert(idx);
1033         assert(p > 0);
1034
1035         a = le64toh(*first);
1036         i = hidx = le64toh(*idx);
1037         while (a > 0) {
1038
1039                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1040                 if (r < 0)
1041                         return r;
1042
1043                 n = journal_file_entry_array_n_items(o);
1044                 if (i < n) {
1045                         o->entry_array.items[i] = htole64(p);
1046                         *idx = htole64(hidx + 1);
1047                         return 0;
1048                 }
1049
1050                 i -= n;
1051                 ap = a;
1052                 a = le64toh(o->entry_array.next_entry_array_offset);
1053         }
1054
1055         if (hidx > n)
1056                 n = (hidx+1) * 2;
1057         else
1058                 n = n * 2;
1059
1060         if (n < 4)
1061                 n = 4;
1062
1063         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1064                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1065                                        &o, &q);
1066         if (r < 0)
1067                 return r;
1068
1069 #ifdef HAVE_GCRYPT
1070         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1071         if (r < 0)
1072                 return r;
1073 #endif
1074
1075         o->entry_array.items[i] = htole64(p);
1076
1077         if (ap == 0)
1078                 *first = htole64(q);
1079         else {
1080                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1081                 if (r < 0)
1082                         return r;
1083
1084                 o->entry_array.next_entry_array_offset = htole64(q);
1085         }
1086
1087         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1088                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1089
1090         *idx = htole64(hidx + 1);
1091
1092         return 0;
1093 }
1094
1095 static int link_entry_into_array_plus_one(JournalFile *f,
1096                                           le64_t *extra,
1097                                           le64_t *first,
1098                                           le64_t *idx,
1099                                           uint64_t p) {
1100
1101         int r;
1102
1103         assert(f);
1104         assert(extra);
1105         assert(first);
1106         assert(idx);
1107         assert(p > 0);
1108
1109         if (*idx == 0)
1110                 *extra = htole64(p);
1111         else {
1112                 le64_t i;
1113
1114                 i = htole64(le64toh(*idx) - 1);
1115                 r = link_entry_into_array(f, first, &i, p);
1116                 if (r < 0)
1117                         return r;
1118         }
1119
1120         *idx = htole64(le64toh(*idx) + 1);
1121         return 0;
1122 }
1123
1124 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1125         uint64_t p;
1126         int r;
1127         assert(f);
1128         assert(o);
1129         assert(offset > 0);
1130
1131         p = le64toh(o->entry.items[i].object_offset);
1132         if (p == 0)
1133                 return -EINVAL;
1134
1135         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1136         if (r < 0)
1137                 return r;
1138
1139         return link_entry_into_array_plus_one(f,
1140                                               &o->data.entry_offset,
1141                                               &o->data.entry_array_offset,
1142                                               &o->data.n_entries,
1143                                               offset);
1144 }
1145
1146 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1147         uint64_t n, i;
1148         int r;
1149
1150         assert(f);
1151         assert(o);
1152         assert(offset > 0);
1153
1154         if (o->object.type != OBJECT_ENTRY)
1155                 return -EINVAL;
1156
1157         __sync_synchronize();
1158
1159         /* Link up the entry itself */
1160         r = link_entry_into_array(f,
1161                                   &f->header->entry_array_offset,
1162                                   &f->header->n_entries,
1163                                   offset);
1164         if (r < 0)
1165                 return r;
1166
1167         /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
1168
1169         if (f->header->head_entry_realtime == 0)
1170                 f->header->head_entry_realtime = o->entry.realtime;
1171
1172         f->header->tail_entry_realtime = o->entry.realtime;
1173         f->header->tail_entry_monotonic = o->entry.monotonic;
1174
1175         f->tail_entry_monotonic_valid = true;
1176
1177         /* Link up the items */
1178         n = journal_file_entry_n_items(o);
1179         for (i = 0; i < n; i++) {
1180                 r = journal_file_link_entry_item(f, o, offset, i);
1181                 if (r < 0)
1182                         return r;
1183         }
1184
1185         return 0;
1186 }
1187
1188 static int journal_file_append_entry_internal(
1189                 JournalFile *f,
1190                 const dual_timestamp *ts,
1191                 uint64_t xor_hash,
1192                 const EntryItem items[], unsigned n_items,
1193                 uint64_t *seqnum,
1194                 Object **ret, uint64_t *offset) {
1195         uint64_t np;
1196         uint64_t osize;
1197         Object *o;
1198         int r;
1199
1200         assert(f);
1201         assert(items || n_items == 0);
1202         assert(ts);
1203
1204         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1205
1206         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1207         if (r < 0)
1208                 return r;
1209
1210         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1211         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1212         o->entry.realtime = htole64(ts->realtime);
1213         o->entry.monotonic = htole64(ts->monotonic);
1214         o->entry.xor_hash = htole64(xor_hash);
1215         o->entry.boot_id = f->header->boot_id;
1216
1217 #ifdef HAVE_GCRYPT
1218         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1219         if (r < 0)
1220                 return r;
1221 #endif
1222
1223         r = journal_file_link_entry(f, o, np);
1224         if (r < 0)
1225                 return r;
1226
1227         if (ret)
1228                 *ret = o;
1229
1230         if (offset)
1231                 *offset = np;
1232
1233         return 0;
1234 }
1235
1236 void journal_file_post_change(JournalFile *f) {
1237         assert(f);
1238
1239         /* inotify() does not receive IN_MODIFY events from file
1240          * accesses done via mmap(). After each access we hence
1241          * trigger IN_MODIFY by truncating the journal file to its
1242          * current size which triggers IN_MODIFY. */
1243
1244         __sync_synchronize();
1245
1246         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1247                 log_error("Failed to truncate file to its own size: %m");
1248 }
1249
1250 static int entry_item_cmp(const void *_a, const void *_b) {
1251         const EntryItem *a = _a, *b = _b;
1252
1253         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1254                 return -1;
1255         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1256                 return 1;
1257         return 0;
1258 }
1259
1260 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1261         unsigned i;
1262         EntryItem *items;
1263         int r;
1264         uint64_t xor_hash = 0;
1265         struct dual_timestamp _ts;
1266
1267         assert(f);
1268         assert(iovec || n_iovec == 0);
1269
1270         if (!f->writable)
1271                 return -EPERM;
1272
1273         if (!ts) {
1274                 dual_timestamp_get(&_ts);
1275                 ts = &_ts;
1276         }
1277
1278         if (f->tail_entry_monotonic_valid &&
1279             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1280                 return -EINVAL;
1281
1282 #ifdef HAVE_GCRYPT
1283         r = journal_file_maybe_append_tag(f, ts->realtime);
1284         if (r < 0)
1285                 return r;
1286 #endif
1287
1288         /* alloca() can't take 0, hence let's allocate at least one */
1289         items = alloca(sizeof(EntryItem) * MAX(1, n_iovec));
1290
1291         for (i = 0; i < n_iovec; i++) {
1292                 uint64_t p;
1293                 Object *o;
1294
1295                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1296                 if (r < 0)
1297                         return r;
1298
1299                 xor_hash ^= le64toh(o->data.hash);
1300                 items[i].object_offset = htole64(p);
1301                 items[i].hash = o->data.hash;
1302         }
1303
1304         /* Order by the position on disk, in order to improve seek
1305          * times for rotating media. */
1306         qsort(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1307
1308         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1309
1310         journal_file_post_change(f);
1311
1312         return r;
1313 }
1314
/* One cached position inside a chain of entry arrays, keyed by the
 * offset of the first array of the chain. */
typedef struct ChainCacheItem {
        /* Offset of the array at the beginning of the chain */
        uint64_t first;

        /* Offset of the cached array */
        uint64_t array;

        /* The first item stored in the cached array */
        uint64_t begin;

        /* Total number of items in all arrays that precede the
         * cached one in the chain */
        uint64_t total;
} ChainCacheItem;
1321
1322 static void chain_cache_put(
1323                 Hashmap *h,
1324                 ChainCacheItem *ci,
1325                 uint64_t first,
1326                 uint64_t array,
1327                 uint64_t begin,
1328                 uint64_t total) {
1329
1330         if (!ci) {
1331                 /* If the chain item to cache for this chain is the
1332                  * first one it's not worth caching anything */
1333                 if (array == first)
1334                         return;
1335
1336                 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1337                         ci = hashmap_steal_first(h);
1338                 else {
1339                         ci = new(ChainCacheItem, 1);
1340                         if (!ci)
1341                                 return;
1342                 }
1343
1344                 ci->first = first;
1345
1346                 if (hashmap_put(h, &ci->first, ci) < 0) {
1347                         free(ci);
1348                         return;
1349                 }
1350         } else
1351                 assert(ci->first == first);
1352
1353         ci->array = array;
1354         ci->begin = begin;
1355         ci->total = total;
1356 }
1357
1358 static int generic_array_get(JournalFile *f,
1359                              uint64_t first,
1360                              uint64_t i,
1361                              Object **ret, uint64_t *offset) {
1362
1363         Object *o;
1364         uint64_t p = 0, a, t = 0;
1365         int r;
1366         ChainCacheItem *ci;
1367
1368         assert(f);
1369
1370         a = first;
1371
1372         /* Try the chain cache first */
1373         ci = hashmap_get(f->chain_cache, &first);
1374         if (ci && i > ci->total) {
1375                 a = ci->array;
1376                 i -= ci->total;
1377                 t = ci->total;
1378         }
1379
1380         while (a > 0) {
1381                 uint64_t k;
1382
1383                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1384                 if (r < 0)
1385                         return r;
1386
1387                 k = journal_file_entry_array_n_items(o);
1388                 if (i < k) {
1389                         p = le64toh(o->entry_array.items[i]);
1390                         goto found;
1391                 }
1392
1393                 i -= k;
1394                 t += k;
1395                 a = le64toh(o->entry_array.next_entry_array_offset);
1396         }
1397
1398         return 0;
1399
1400 found:
1401         /* Let's cache this item for the next invocation */
1402         chain_cache_put(f->chain_cache, ci, first, a, o->entry_array.items[0], t);
1403
1404         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1405         if (r < 0)
1406                 return r;
1407
1408         if (ret)
1409                 *ret = o;
1410
1411         if (offset)
1412                 *offset = p;
1413
1414         return 1;
1415 }
1416
1417 static int generic_array_get_plus_one(JournalFile *f,
1418                                       uint64_t extra,
1419                                       uint64_t first,
1420                                       uint64_t i,
1421                                       Object **ret, uint64_t *offset) {
1422
1423         Object *o;
1424
1425         assert(f);
1426
1427         if (i == 0) {
1428                 int r;
1429
1430                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1431                 if (r < 0)
1432                         return r;
1433
1434                 if (ret)
1435                         *ret = o;
1436
1437                 if (offset)
1438                         *offset = extra;
1439
1440                 return 1;
1441         }
1442
1443         return generic_array_get(f, first, i-1, ret, offset);
1444 }
1445
/* Results of the test_object() callbacks used for bisection; they
 * tell how the tested entry relates to the needle. */
enum {
        /* The tested entry matches the needle */
        TEST_FOUND,

        /* The tested entry sorts before the needle */
        TEST_LEFT,

        /* The tested entry sorts after the needle */
        TEST_RIGHT
};
1451
1452 static int generic_array_bisect(JournalFile *f,
1453                                 uint64_t first,
1454                                 uint64_t n,
1455                                 uint64_t needle,
1456                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1457                                 direction_t direction,
1458                                 Object **ret,
1459                                 uint64_t *offset,
1460                                 uint64_t *idx) {
1461
1462         uint64_t a, p, t = 0, i = 0, last_p = 0;
1463         bool subtract_one = false;
1464         Object *o, *array = NULL;
1465         int r;
1466         ChainCacheItem *ci;
1467
1468         assert(f);
1469         assert(test_object);
1470
1471         /* Start with the first array in the chain */
1472         a = first;
1473
1474         ci = hashmap_get(f->chain_cache, &first);
1475         if (ci && n > ci->total) {
1476                 /* Ah, we have iterated this bisection array chain
1477                  * previously! Let's see if we can skip ahead in the
1478                  * chain, as far as the last time. But we can't jump
1479                  * backwards in the chain, so let's check that
1480                  * first. */
1481
1482                 r = test_object(f, ci->begin, needle);
1483                 if (r < 0)
1484                         return r;
1485
1486                 if (r == TEST_LEFT) {
1487                         /* OK, what we are looking for is right of th
1488                          * begin of this EntryArray, so let's jump
1489                          * straight to previously cached array in the
1490                          * chain */
1491
1492                         a = ci->array;
1493                         n -= ci->total;
1494                         t = ci->total;
1495                 }
1496         }
1497
1498         while (a > 0) {
1499                 uint64_t left, right, k, lp;
1500
1501                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1502                 if (r < 0)
1503                         return r;
1504
1505                 k = journal_file_entry_array_n_items(array);
1506                 right = MIN(k, n);
1507                 if (right <= 0)
1508                         return 0;
1509
1510                 i = right - 1;
1511                 lp = p = le64toh(array->entry_array.items[i]);
1512                 if (p <= 0)
1513                         return -EBADMSG;
1514
1515                 r = test_object(f, p, needle);
1516                 if (r < 0)
1517                         return r;
1518
1519                 if (r == TEST_FOUND)
1520                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1521
1522                 if (r == TEST_RIGHT) {
1523                         left = 0;
1524                         right -= 1;
1525                         for (;;) {
1526                                 if (left == right) {
1527                                         if (direction == DIRECTION_UP)
1528                                                 subtract_one = true;
1529
1530                                         i = left;
1531                                         goto found;
1532                                 }
1533
1534                                 assert(left < right);
1535
1536                                 i = (left + right) / 2;
1537                                 p = le64toh(array->entry_array.items[i]);
1538                                 if (p <= 0)
1539                                         return -EBADMSG;
1540
1541                                 r = test_object(f, p, needle);
1542                                 if (r < 0)
1543                                         return r;
1544
1545                                 if (r == TEST_FOUND)
1546                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1547
1548                                 if (r == TEST_RIGHT)
1549                                         right = i;
1550                                 else
1551                                         left = i + 1;
1552                         }
1553                 }
1554
1555                 if (k > n) {
1556                         if (direction == DIRECTION_UP) {
1557                                 i = n;
1558                                 subtract_one = true;
1559                                 goto found;
1560                         }
1561
1562                         return 0;
1563                 }
1564
1565                 last_p = lp;
1566
1567                 n -= k;
1568                 t += k;
1569                 a = le64toh(array->entry_array.next_entry_array_offset);
1570         }
1571
1572         return 0;
1573
1574 found:
1575         if (subtract_one && t == 0 && i == 0)
1576                 return 0;
1577
1578         /* Let's cache this item for the next invocation */
1579         chain_cache_put(f->chain_cache, ci, first, a, array->entry_array.items[0], t);
1580
1581         if (subtract_one && i == 0)
1582                 p = last_p;
1583         else if (subtract_one)
1584                 p = le64toh(array->entry_array.items[i-1]);
1585         else
1586                 p = le64toh(array->entry_array.items[i]);
1587
1588         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1589         if (r < 0)
1590                 return r;
1591
1592         if (ret)
1593                 *ret = o;
1594
1595         if (offset)
1596                 *offset = p;
1597
1598         if (idx)
1599                 *idx = t + i + (subtract_one ? -1 : 0);
1600
1601         return 1;
1602 }
1603
1604 static int generic_array_bisect_plus_one(JournalFile *f,
1605                                          uint64_t extra,
1606                                          uint64_t first,
1607                                          uint64_t n,
1608                                          uint64_t needle,
1609                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1610                                          direction_t direction,
1611                                          Object **ret,
1612                                          uint64_t *offset,
1613                                          uint64_t *idx) {
1614
1615         int r;
1616         bool step_back = false;
1617         Object *o;
1618
1619         assert(f);
1620         assert(test_object);
1621
1622         if (n <= 0)
1623                 return 0;
1624
1625         /* This bisects the array in object 'first', but first checks
1626          * an extra  */
1627         r = test_object(f, extra, needle);
1628         if (r < 0)
1629                 return r;
1630
1631         if (r == TEST_FOUND)
1632                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1633
1634         /* if we are looking with DIRECTION_UP then we need to first
1635            see if in the actual array there is a matching entry, and
1636            return the last one of that. But if there isn't any we need
1637            to return this one. Hence remember this, and return it
1638            below. */
1639         if (r == TEST_LEFT)
1640                 step_back = direction == DIRECTION_UP;
1641
1642         if (r == TEST_RIGHT) {
1643                 if (direction == DIRECTION_DOWN)
1644                         goto found;
1645                 else
1646                         return 0;
1647         }
1648
1649         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1650
1651         if (r == 0 && step_back)
1652                 goto found;
1653
1654         if (r > 0 && idx)
1655                 (*idx) ++;
1656
1657         return r;
1658
1659 found:
1660         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1661         if (r < 0)
1662                 return r;
1663
1664         if (ret)
1665                 *ret = o;
1666
1667         if (offset)
1668                 *offset = extra;
1669
1670         if (idx)
1671                 *idx = 0;
1672
1673         return 1;
1674 }
1675
1676 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1677         assert(f);
1678         assert(p > 0);
1679
1680         if (p == needle)
1681                 return TEST_FOUND;
1682         else if (p < needle)
1683                 return TEST_LEFT;
1684         else
1685                 return TEST_RIGHT;
1686 }
1687
1688 int journal_file_move_to_entry_by_offset(
1689                 JournalFile *f,
1690                 uint64_t p,
1691                 direction_t direction,
1692                 Object **ret,
1693                 uint64_t *offset) {
1694
1695         return generic_array_bisect(f,
1696                                     le64toh(f->header->entry_array_offset),
1697                                     le64toh(f->header->n_entries),
1698                                     p,
1699                                     test_object_offset,
1700                                     direction,
1701                                     ret, offset, NULL);
1702 }
1703
1704
1705 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1706         Object *o;
1707         int r;
1708
1709         assert(f);
1710         assert(p > 0);
1711
1712         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1713         if (r < 0)
1714                 return r;
1715
1716         if (le64toh(o->entry.seqnum) == needle)
1717                 return TEST_FOUND;
1718         else if (le64toh(o->entry.seqnum) < needle)
1719                 return TEST_LEFT;
1720         else
1721                 return TEST_RIGHT;
1722 }
1723
1724 int journal_file_move_to_entry_by_seqnum(
1725                 JournalFile *f,
1726                 uint64_t seqnum,
1727                 direction_t direction,
1728                 Object **ret,
1729                 uint64_t *offset) {
1730
1731         return generic_array_bisect(f,
1732                                     le64toh(f->header->entry_array_offset),
1733                                     le64toh(f->header->n_entries),
1734                                     seqnum,
1735                                     test_object_seqnum,
1736                                     direction,
1737                                     ret, offset, NULL);
1738 }
1739
1740 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1741         Object *o;
1742         int r;
1743
1744         assert(f);
1745         assert(p > 0);
1746
1747         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1748         if (r < 0)
1749                 return r;
1750
1751         if (le64toh(o->entry.realtime) == needle)
1752                 return TEST_FOUND;
1753         else if (le64toh(o->entry.realtime) < needle)
1754                 return TEST_LEFT;
1755         else
1756                 return TEST_RIGHT;
1757 }
1758
1759 int journal_file_move_to_entry_by_realtime(
1760                 JournalFile *f,
1761                 uint64_t realtime,
1762                 direction_t direction,
1763                 Object **ret,
1764                 uint64_t *offset) {
1765
1766         return generic_array_bisect(f,
1767                                     le64toh(f->header->entry_array_offset),
1768                                     le64toh(f->header->n_entries),
1769                                     realtime,
1770                                     test_object_realtime,
1771                                     direction,
1772                                     ret, offset, NULL);
1773 }
1774
1775 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1776         Object *o;
1777         int r;
1778
1779         assert(f);
1780         assert(p > 0);
1781
1782         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1783         if (r < 0)
1784                 return r;
1785
1786         if (le64toh(o->entry.monotonic) == needle)
1787                 return TEST_FOUND;
1788         else if (le64toh(o->entry.monotonic) < needle)
1789                 return TEST_LEFT;
1790         else
1791                 return TEST_RIGHT;
1792 }
1793
1794 int journal_file_move_to_entry_by_monotonic(
1795                 JournalFile *f,
1796                 sd_id128_t boot_id,
1797                 uint64_t monotonic,
1798                 direction_t direction,
1799                 Object **ret,
1800                 uint64_t *offset) {
1801
1802         char t[9+32+1] = "_BOOT_ID=";
1803         Object *o;
1804         int r;
1805
1806         assert(f);
1807
1808         sd_id128_to_string(boot_id, t + 9);
1809         r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1810         if (r < 0)
1811                 return r;
1812         if (r == 0)
1813                 return -ENOENT;
1814
1815         return generic_array_bisect_plus_one(f,
1816                                              le64toh(o->data.entry_offset),
1817                                              le64toh(o->data.entry_array_offset),
1818                                              le64toh(o->data.n_entries),
1819                                              monotonic,
1820                                              test_object_monotonic,
1821                                              direction,
1822                                              ret, offset, NULL);
1823 }
1824
1825 int journal_file_next_entry(
1826                 JournalFile *f,
1827                 Object *o, uint64_t p,
1828                 direction_t direction,
1829                 Object **ret, uint64_t *offset) {
1830
1831         uint64_t i, n;
1832         int r;
1833
1834         assert(f);
1835         assert(p > 0 || !o);
1836
1837         n = le64toh(f->header->n_entries);
1838         if (n <= 0)
1839                 return 0;
1840
1841         if (!o)
1842                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1843         else {
1844                 if (o->object.type != OBJECT_ENTRY)
1845                         return -EINVAL;
1846
1847                 r = generic_array_bisect(f,
1848                                          le64toh(f->header->entry_array_offset),
1849                                          le64toh(f->header->n_entries),
1850                                          p,
1851                                          test_object_offset,
1852                                          DIRECTION_DOWN,
1853                                          NULL, NULL,
1854                                          &i);
1855                 if (r <= 0)
1856                         return r;
1857
1858                 if (direction == DIRECTION_DOWN) {
1859                         if (i >= n - 1)
1860                                 return 0;
1861
1862                         i++;
1863                 } else {
1864                         if (i <= 0)
1865                                 return 0;
1866
1867                         i--;
1868                 }
1869         }
1870
1871         /* And jump to it */
1872         return generic_array_get(f,
1873                                  le64toh(f->header->entry_array_offset),
1874                                  i,
1875                                  ret, offset);
1876 }
1877
1878 int journal_file_skip_entry(
1879                 JournalFile *f,
1880                 Object *o, uint64_t p,
1881                 int64_t skip,
1882                 Object **ret, uint64_t *offset) {
1883
1884         uint64_t i, n;
1885         int r;
1886
1887         assert(f);
1888         assert(o);
1889         assert(p > 0);
1890
1891         if (o->object.type != OBJECT_ENTRY)
1892                 return -EINVAL;
1893
1894         r = generic_array_bisect(f,
1895                                  le64toh(f->header->entry_array_offset),
1896                                  le64toh(f->header->n_entries),
1897                                  p,
1898                                  test_object_offset,
1899                                  DIRECTION_DOWN,
1900                                  NULL, NULL,
1901                                  &i);
1902         if (r <= 0)
1903                 return r;
1904
1905         /* Calculate new index */
1906         if (skip < 0) {
1907                 if ((uint64_t) -skip >= i)
1908                         i = 0;
1909                 else
1910                         i = i - (uint64_t) -skip;
1911         } else
1912                 i  += (uint64_t) skip;
1913
1914         n = le64toh(f->header->n_entries);
1915         if (n <= 0)
1916                 return -EBADMSG;
1917
1918         if (i >= n)
1919                 i = n-1;
1920
1921         return generic_array_get(f,
1922                                  le64toh(f->header->entry_array_offset),
1923                                  i,
1924                                  ret, offset);
1925 }
1926
1927 int journal_file_next_entry_for_data(
1928                 JournalFile *f,
1929                 Object *o, uint64_t p,
1930                 uint64_t data_offset,
1931                 direction_t direction,
1932                 Object **ret, uint64_t *offset) {
1933
1934         uint64_t n, i;
1935         int r;
1936         Object *d;
1937
1938         assert(f);
1939         assert(p > 0 || !o);
1940
1941         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1942         if (r < 0)
1943                 return r;
1944
1945         n = le64toh(d->data.n_entries);
1946         if (n <= 0)
1947                 return n;
1948
1949         if (!o)
1950                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1951         else {
1952                 if (o->object.type != OBJECT_ENTRY)
1953                         return -EINVAL;
1954
1955                 r = generic_array_bisect_plus_one(f,
1956                                                   le64toh(d->data.entry_offset),
1957                                                   le64toh(d->data.entry_array_offset),
1958                                                   le64toh(d->data.n_entries),
1959                                                   p,
1960                                                   test_object_offset,
1961                                                   DIRECTION_DOWN,
1962                                                   NULL, NULL,
1963                                                   &i);
1964
1965                 if (r <= 0)
1966                         return r;
1967
1968                 if (direction == DIRECTION_DOWN) {
1969                         if (i >= n - 1)
1970                                 return 0;
1971
1972                         i++;
1973                 } else {
1974                         if (i <= 0)
1975                                 return 0;
1976
1977                         i--;
1978                 }
1979
1980         }
1981
1982         return generic_array_get_plus_one(f,
1983                                           le64toh(d->data.entry_offset),
1984                                           le64toh(d->data.entry_array_offset),
1985                                           i,
1986                                           ret, offset);
1987 }
1988
1989 int journal_file_move_to_entry_by_offset_for_data(
1990                 JournalFile *f,
1991                 uint64_t data_offset,
1992                 uint64_t p,
1993                 direction_t direction,
1994                 Object **ret, uint64_t *offset) {
1995
1996         int r;
1997         Object *d;
1998
1999         assert(f);
2000
2001         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2002         if (r < 0)
2003                 return r;
2004
2005         return generic_array_bisect_plus_one(f,
2006                                              le64toh(d->data.entry_offset),
2007                                              le64toh(d->data.entry_array_offset),
2008                                              le64toh(d->data.n_entries),
2009                                              p,
2010                                              test_object_offset,
2011                                              direction,
2012                                              ret, offset, NULL);
2013 }
2014
2015 int journal_file_move_to_entry_by_monotonic_for_data(
2016                 JournalFile *f,
2017                 uint64_t data_offset,
2018                 sd_id128_t boot_id,
2019                 uint64_t monotonic,
2020                 direction_t direction,
2021                 Object **ret, uint64_t *offset) {
2022
2023         char t[9+32+1] = "_BOOT_ID=";
2024         Object *o, *d;
2025         int r;
2026         uint64_t b, z;
2027
2028         assert(f);
2029
2030         /* First, seek by time */
2031         sd_id128_to_string(boot_id, t + 9);
2032         r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
2033         if (r < 0)
2034                 return r;
2035         if (r == 0)
2036                 return -ENOENT;
2037
2038         r = generic_array_bisect_plus_one(f,
2039                                           le64toh(o->data.entry_offset),
2040                                           le64toh(o->data.entry_array_offset),
2041                                           le64toh(o->data.n_entries),
2042                                           monotonic,
2043                                           test_object_monotonic,
2044                                           direction,
2045                                           NULL, &z, NULL);
2046         if (r <= 0)
2047                 return r;
2048
2049         /* And now, continue seeking until we find an entry that
2050          * exists in both bisection arrays */
2051
2052         for (;;) {
2053                 Object *qo;
2054                 uint64_t p, q;
2055
2056                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2057                 if (r < 0)
2058                         return r;
2059
2060                 r = generic_array_bisect_plus_one(f,
2061                                                   le64toh(d->data.entry_offset),
2062                                                   le64toh(d->data.entry_array_offset),
2063                                                   le64toh(d->data.n_entries),
2064                                                   z,
2065                                                   test_object_offset,
2066                                                   direction,
2067                                                   NULL, &p, NULL);
2068                 if (r <= 0)
2069                         return r;
2070
2071                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2072                 if (r < 0)
2073                         return r;
2074
2075                 r = generic_array_bisect_plus_one(f,
2076                                                   le64toh(o->data.entry_offset),
2077                                                   le64toh(o->data.entry_array_offset),
2078                                                   le64toh(o->data.n_entries),
2079                                                   p,
2080                                                   test_object_offset,
2081                                                   direction,
2082                                                   &qo, &q, NULL);
2083
2084                 if (r <= 0)
2085                         return r;
2086
2087                 if (p == q) {
2088                         if (ret)
2089                                 *ret = qo;
2090                         if (offset)
2091                                 *offset = q;
2092
2093                         return 1;
2094                 }
2095
2096                 z = q;
2097         }
2098
2099         return 0;
2100 }
2101
2102 int journal_file_move_to_entry_by_seqnum_for_data(
2103                 JournalFile *f,
2104                 uint64_t data_offset,
2105                 uint64_t seqnum,
2106                 direction_t direction,
2107                 Object **ret, uint64_t *offset) {
2108
2109         Object *d;
2110         int r;
2111
2112         assert(f);
2113
2114         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2115         if (r < 0)
2116                 return r;
2117
2118         return generic_array_bisect_plus_one(f,
2119                                              le64toh(d->data.entry_offset),
2120                                              le64toh(d->data.entry_array_offset),
2121                                              le64toh(d->data.n_entries),
2122                                              seqnum,
2123                                              test_object_seqnum,
2124                                              direction,
2125                                              ret, offset, NULL);
2126 }
2127
2128 int journal_file_move_to_entry_by_realtime_for_data(
2129                 JournalFile *f,
2130                 uint64_t data_offset,
2131                 uint64_t realtime,
2132                 direction_t direction,
2133                 Object **ret, uint64_t *offset) {
2134
2135         Object *d;
2136         int r;
2137
2138         assert(f);
2139
2140         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2141         if (r < 0)
2142                 return r;
2143
2144         return generic_array_bisect_plus_one(f,
2145                                              le64toh(d->data.entry_offset),
2146                                              le64toh(d->data.entry_array_offset),
2147                                              le64toh(d->data.n_entries),
2148                                              realtime,
2149                                              test_object_realtime,
2150                                              direction,
2151                                              ret, offset, NULL);
2152 }
2153
2154 void journal_file_dump(JournalFile *f) {
2155         Object *o;
2156         int r;
2157         uint64_t p;
2158
2159         assert(f);
2160
2161         journal_file_print_header(f);
2162
2163         p = le64toh(f->header->header_size);
2164         while (p != 0) {
2165                 r = journal_file_move_to_object(f, -1, p, &o);
2166                 if (r < 0)
2167                         goto fail;
2168
2169                 switch (o->object.type) {
2170
2171                 case OBJECT_UNUSED:
2172                         printf("Type: OBJECT_UNUSED\n");
2173                         break;
2174
2175                 case OBJECT_DATA:
2176                         printf("Type: OBJECT_DATA\n");
2177                         break;
2178
2179                 case OBJECT_FIELD:
2180                         printf("Type: OBJECT_FIELD\n");
2181                         break;
2182
2183                 case OBJECT_ENTRY:
2184                         printf("Type: OBJECT_ENTRY seqnum=%llu monotonic=%llu realtime=%llu\n",
2185                                (unsigned long long) le64toh(o->entry.seqnum),
2186                                (unsigned long long) le64toh(o->entry.monotonic),
2187                                (unsigned long long) le64toh(o->entry.realtime));
2188                         break;
2189
2190                 case OBJECT_FIELD_HASH_TABLE:
2191                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2192                         break;
2193
2194                 case OBJECT_DATA_HASH_TABLE:
2195                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2196                         break;
2197
2198                 case OBJECT_ENTRY_ARRAY:
2199                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2200                         break;
2201
2202                 case OBJECT_TAG:
2203                         printf("Type: OBJECT_TAG seqnum=%llu epoch=%llu\n",
2204                                (unsigned long long) le64toh(o->tag.seqnum),
2205                                (unsigned long long) le64toh(o->tag.epoch));
2206                         break;
2207
2208                 default:
2209                         printf("Type: unknown (%u)\n", o->object.type);
2210                         break;
2211                 }
2212
2213                 if (o->object.flags & OBJECT_COMPRESSED)
2214                         printf("Flags: COMPRESSED\n");
2215
2216                 if (p == le64toh(f->header->tail_object_offset))
2217                         p = 0;
2218                 else
2219                         p = p + ALIGN64(le64toh(o->object.size));
2220         }
2221
2222         return;
2223 fail:
2224         log_error("File corrupt");
2225 }
2226
2227 void journal_file_print_header(JournalFile *f) {
2228         char a[33], b[33], c[33];
2229         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
2230         struct stat st;
2231         char bytes[FORMAT_BYTES_MAX];
2232
2233         assert(f);
2234
2235         printf("File Path: %s\n"
2236                "File ID: %s\n"
2237                "Machine ID: %s\n"
2238                "Boot ID: %s\n"
2239                "Sequential Number ID: %s\n"
2240                "State: %s\n"
2241                "Compatible Flags:%s%s\n"
2242                "Incompatible Flags:%s%s\n"
2243                "Header size: %llu\n"
2244                "Arena size: %llu\n"
2245                "Data Hash Table Size: %llu\n"
2246                "Field Hash Table Size: %llu\n"
2247                "Rotate Suggested: %s\n"
2248                "Head Sequential Number: %llu\n"
2249                "Tail Sequential Number: %llu\n"
2250                "Head Realtime Timestamp: %s\n"
2251                "Tail Realtime Timestamp: %s\n"
2252                "Objects: %llu\n"
2253                "Entry Objects: %llu\n",
2254                f->path,
2255                sd_id128_to_string(f->header->file_id, a),
2256                sd_id128_to_string(f->header->machine_id, b),
2257                sd_id128_to_string(f->header->boot_id, c),
2258                sd_id128_to_string(f->header->seqnum_id, c),
2259                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2260                f->header->state == STATE_ONLINE ? "ONLINE" :
2261                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2262                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2263                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2264                JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2265                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2266                (unsigned long long) le64toh(f->header->header_size),
2267                (unsigned long long) le64toh(f->header->arena_size),
2268                (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2269                (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2270                yes_no(journal_file_rotate_suggested(f, 0)),
2271                (unsigned long long) le64toh(f->header->head_entry_seqnum),
2272                (unsigned long long) le64toh(f->header->tail_entry_seqnum),
2273                format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2274                format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2275                (unsigned long long) le64toh(f->header->n_objects),
2276                (unsigned long long) le64toh(f->header->n_entries));
2277
2278         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2279                 printf("Data Objects: %llu\n"
2280                        "Data Hash Table Fill: %.1f%%\n",
2281                        (unsigned long long) le64toh(f->header->n_data),
2282                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2283
2284         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2285                 printf("Field Objects: %llu\n"
2286                        "Field Hash Table Fill: %.1f%%\n",
2287                        (unsigned long long) le64toh(f->header->n_fields),
2288                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2289
2290         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2291                 printf("Tag Objects: %llu\n",
2292                        (unsigned long long) le64toh(f->header->n_tags));
2293         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2294                 printf("Entry Array Objects: %llu\n",
2295                        (unsigned long long) le64toh(f->header->n_entry_arrays));
2296
2297         if (fstat(f->fd, &st) >= 0)
2298                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2299 }
2300
2301 int journal_file_open(
2302                 const char *fname,
2303                 int flags,
2304                 mode_t mode,
2305                 bool compress,
2306                 bool seal,
2307                 JournalMetrics *metrics,
2308                 MMapCache *mmap_cache,
2309                 JournalFile *template,
2310                 JournalFile **ret) {
2311
2312         JournalFile *f;
2313         int r;
2314         bool newly_created = false;
2315
2316         assert(fname);
2317         assert(ret);
2318
2319         if ((flags & O_ACCMODE) != O_RDONLY &&
2320             (flags & O_ACCMODE) != O_RDWR)
2321                 return -EINVAL;
2322
2323         if (!endswith(fname, ".journal") &&
2324             !endswith(fname, ".journal~"))
2325                 return -EINVAL;
2326
2327         f = new0(JournalFile, 1);
2328         if (!f)
2329                 return -ENOMEM;
2330
2331         f->fd = -1;
2332         f->mode = mode;
2333
2334         f->flags = flags;
2335         f->prot = prot_from_flags(flags);
2336         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2337 #ifdef HAVE_XZ
2338         f->compress = compress;
2339 #endif
2340 #ifdef HAVE_GCRYPT
2341         f->seal = seal;
2342 #endif
2343
2344         if (mmap_cache)
2345                 f->mmap = mmap_cache_ref(mmap_cache);
2346         else {
2347                 f->mmap = mmap_cache_new();
2348                 if (!f->mmap) {
2349                         r = -ENOMEM;
2350                         goto fail;
2351                 }
2352         }
2353
2354         f->path = strdup(fname);
2355         if (!f->path) {
2356                 r = -ENOMEM;
2357                 goto fail;
2358         }
2359
2360         f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2361         if (!f->chain_cache) {
2362                 r = -ENOMEM;
2363                 goto fail;
2364         }
2365
2366         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2367         if (f->fd < 0) {
2368                 r = -errno;
2369                 goto fail;
2370         }
2371
2372         if (fstat(f->fd, &f->last_stat) < 0) {
2373                 r = -errno;
2374                 goto fail;
2375         }
2376
2377         if (f->last_stat.st_size == 0 && f->writable) {
2378 #ifdef HAVE_XATTR
2379                 uint64_t crtime;
2380
2381                 /* Let's attach the creation time to the journal file,
2382                  * so that the vacuuming code knows the age of this
2383                  * file even if the file might end up corrupted one
2384                  * day... Ideally we'd just use the creation time many
2385                  * file systems maintain for each file, but there is
2386                  * currently no usable API to query this, hence let's
2387                  * emulate this via extended attributes. If extended
2388                  * attributes are not supported we'll just skip this,
2389                  * and rely solely on mtime/atime/ctime of the file.*/
2390
2391                 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2392                 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2393 #endif
2394
2395 #ifdef HAVE_GCRYPT
2396                 /* Try to load the FSPRG state, and if we can't, then
2397                  * just don't do sealing */
2398                 if (f->seal) {
2399                         r = journal_file_fss_load(f);
2400                         if (r < 0)
2401                                 f->seal = false;
2402                 }
2403 #endif
2404
2405                 r = journal_file_init_header(f, template);
2406                 if (r < 0)
2407                         goto fail;
2408
2409                 if (fstat(f->fd, &f->last_stat) < 0) {
2410                         r = -errno;
2411                         goto fail;
2412                 }
2413
2414                 newly_created = true;
2415         }
2416
2417         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2418                 r = -EIO;
2419                 goto fail;
2420         }
2421
2422         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2423         if (f->header == MAP_FAILED) {
2424                 f->header = NULL;
2425                 r = -errno;
2426                 goto fail;
2427         }
2428
2429         if (!newly_created) {
2430                 r = journal_file_verify_header(f);
2431                 if (r < 0)
2432                         goto fail;
2433         }
2434
2435 #ifdef HAVE_GCRYPT
2436         if (!newly_created && f->writable) {
2437                 r = journal_file_fss_load(f);
2438                 if (r < 0)
2439                         goto fail;
2440         }
2441 #endif
2442
2443         if (f->writable) {
2444                 if (metrics) {
2445                         journal_default_metrics(metrics, f->fd);
2446                         f->metrics = *metrics;
2447                 } else if (template)
2448                         f->metrics = template->metrics;
2449
2450                 r = journal_file_refresh_header(f);
2451                 if (r < 0)
2452                         goto fail;
2453         }
2454
2455 #ifdef HAVE_GCRYPT
2456         r = journal_file_hmac_setup(f);
2457         if (r < 0)
2458                 goto fail;
2459 #endif
2460
2461         if (newly_created) {
2462                 r = journal_file_setup_field_hash_table(f);
2463                 if (r < 0)
2464                         goto fail;
2465
2466                 r = journal_file_setup_data_hash_table(f);
2467                 if (r < 0)
2468                         goto fail;
2469
2470 #ifdef HAVE_GCRYPT
2471                 r = journal_file_append_first_tag(f);
2472                 if (r < 0)
2473                         goto fail;
2474 #endif
2475         }
2476
2477         r = journal_file_map_field_hash_table(f);
2478         if (r < 0)
2479                 goto fail;
2480
2481         r = journal_file_map_data_hash_table(f);
2482         if (r < 0)
2483                 goto fail;
2484
2485         *ret = f;
2486         return 0;
2487
2488 fail:
2489         journal_file_close(f);
2490
2491         return r;
2492 }
2493
2494 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2495         char *p;
2496         size_t l;
2497         JournalFile *old_file, *new_file = NULL;
2498         int r;
2499
2500         assert(f);
2501         assert(*f);
2502
2503         old_file = *f;
2504
2505         if (!old_file->writable)
2506                 return -EINVAL;
2507
2508         if (!endswith(old_file->path, ".journal"))
2509                 return -EINVAL;
2510
2511         l = strlen(old_file->path);
2512
2513         p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
2514         if (!p)
2515                 return -ENOMEM;
2516
2517         memcpy(p, old_file->path, l - 8);
2518         p[l-8] = '@';
2519         sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2520         snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2521                  "-%016llx-%016llx.journal",
2522                  (unsigned long long) le64toh((*f)->header->head_entry_seqnum),
2523                  (unsigned long long) le64toh((*f)->header->head_entry_realtime));
2524
2525         r = rename(old_file->path, p);
2526         free(p);
2527
2528         if (r < 0)
2529                 return -errno;
2530
2531         old_file->header->state = STATE_ARCHIVED;
2532
2533         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2534         journal_file_close(old_file);
2535
2536         *f = new_file;
2537         return r;
2538 }
2539
2540 int journal_file_open_reliably(
2541                 const char *fname,
2542                 int flags,
2543                 mode_t mode,
2544                 bool compress,
2545                 bool seal,
2546                 JournalMetrics *metrics,
2547                 MMapCache *mmap_cache,
2548                 JournalFile *template,
2549                 JournalFile **ret) {
2550
2551         int r;
2552         size_t l;
2553         char *p;
2554
2555         r = journal_file_open(fname, flags, mode, compress, seal,
2556                               metrics, mmap_cache, template, ret);
2557         if (r != -EBADMSG && /* corrupted */
2558             r != -ENODATA && /* truncated */
2559             r != -EHOSTDOWN && /* other machine */
2560             r != -EPROTONOSUPPORT && /* incompatible feature */
2561             r != -EBUSY && /* unclean shutdown */
2562             r != -ESHUTDOWN /* already archived */)
2563                 return r;
2564
2565         if ((flags & O_ACCMODE) == O_RDONLY)
2566                 return r;
2567
2568         if (!(flags & O_CREAT))
2569                 return r;
2570
2571         if (!endswith(fname, ".journal"))
2572                 return r;
2573
2574         /* The file is corrupted. Rotate it away and try it again (but only once) */
2575
2576         l = strlen(fname);
2577         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2578                      (int) (l-8), fname,
2579                      (unsigned long long) now(CLOCK_REALTIME),
2580                      random_ull()) < 0)
2581                 return -ENOMEM;
2582
2583         r = rename(fname, p);
2584         free(p);
2585         if (r < 0)
2586                 return -errno;
2587
2588         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2589
2590         return journal_file_open(fname, flags, mode, compress, seal,
2591                                  metrics, mmap_cache, template, ret);
2592 }
2593
2594
2595 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2596         uint64_t i, n;
2597         uint64_t q, xor_hash = 0;
2598         int r;
2599         EntryItem *items;
2600         dual_timestamp ts;
2601
2602         assert(from);
2603         assert(to);
2604         assert(o);
2605         assert(p);
2606
2607         if (!to->writable)
2608                 return -EPERM;
2609
2610         ts.monotonic = le64toh(o->entry.monotonic);
2611         ts.realtime = le64toh(o->entry.realtime);
2612
2613         if (to->tail_entry_monotonic_valid &&
2614             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2615                 return -EINVAL;
2616
2617         n = journal_file_entry_n_items(o);
2618         items = alloca(sizeof(EntryItem) * n);
2619
2620         for (i = 0; i < n; i++) {
2621                 uint64_t l, h;
2622                 le64_t le_hash;
2623                 size_t t;
2624                 void *data;
2625                 Object *u;
2626
2627                 q = le64toh(o->entry.items[i].object_offset);
2628                 le_hash = o->entry.items[i].hash;
2629
2630                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2631                 if (r < 0)
2632                         return r;
2633
2634                 if (le_hash != o->data.hash)
2635                         return -EBADMSG;
2636
2637                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2638                 t = (size_t) l;
2639
2640                 /* We hit the limit on 32bit machines */
2641                 if ((uint64_t) t != l)
2642                         return -E2BIG;
2643
2644                 if (o->object.flags & OBJECT_COMPRESSED) {
2645 #ifdef HAVE_XZ
2646                         uint64_t rsize;
2647
2648                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2649                                 return -EBADMSG;
2650
2651                         data = from->compress_buffer;
2652                         l = rsize;
2653 #else
2654                         return -EPROTONOSUPPORT;
2655 #endif
2656                 } else
2657                         data = o->data.payload;
2658
2659                 r = journal_file_append_data(to, data, l, &u, &h);
2660                 if (r < 0)
2661                         return r;
2662
2663                 xor_hash ^= le64toh(u->data.hash);
2664                 items[i].object_offset = htole64(h);
2665                 items[i].hash = u->data.hash;
2666
2667                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2668                 if (r < 0)
2669                         return r;
2670         }
2671
2672         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2673 }
2674
2675 void journal_default_metrics(JournalMetrics *m, int fd) {
2676         uint64_t fs_size = 0;
2677         struct statvfs ss;
2678         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2679
2680         assert(m);
2681         assert(fd >= 0);
2682
2683         if (fstatvfs(fd, &ss) >= 0)
2684                 fs_size = ss.f_frsize * ss.f_blocks;
2685
2686         if (m->max_use == (uint64_t) -1) {
2687
2688                 if (fs_size > 0) {
2689                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2690
2691                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2692                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2693
2694                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2695                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2696                 } else
2697                         m->max_use = DEFAULT_MAX_USE_LOWER;
2698         } else {
2699                 m->max_use = PAGE_ALIGN(m->max_use);
2700
2701                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2702                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2703         }
2704
2705         if (m->max_size == (uint64_t) -1) {
2706                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2707
2708                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2709                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2710         } else
2711                 m->max_size = PAGE_ALIGN(m->max_size);
2712
2713         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2714                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2715
2716         if (m->max_size*2 > m->max_use)
2717                 m->max_use = m->max_size*2;
2718
2719         if (m->min_size == (uint64_t) -1)
2720                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2721         else {
2722                 m->min_size = PAGE_ALIGN(m->min_size);
2723
2724                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2725                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2726
2727                 if (m->min_size > m->max_size)
2728                         m->max_size = m->min_size;
2729         }
2730
2731         if (m->keep_free == (uint64_t) -1) {
2732
2733                 if (fs_size > 0) {
2734                         m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2735
2736                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2737                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2738
2739                 } else
2740                         m->keep_free = DEFAULT_KEEP_FREE;
2741         }
2742
2743         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2744                   format_bytes(a, sizeof(a), m->max_use),
2745                   format_bytes(b, sizeof(b), m->max_size),
2746                   format_bytes(c, sizeof(c), m->min_size),
2747                   format_bytes(d, sizeof(d), m->keep_free));
2748 }
2749
2750 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2751         assert(f);
2752         assert(from || to);
2753
2754         if (from) {
2755                 if (f->header->head_entry_realtime == 0)
2756                         return -ENOENT;
2757
2758                 *from = le64toh(f->header->head_entry_realtime);
2759         }
2760
2761         if (to) {
2762                 if (f->header->tail_entry_realtime == 0)
2763                         return -ENOENT;
2764
2765                 *to = le64toh(f->header->tail_entry_realtime);
2766         }
2767
2768         return 1;
2769 }
2770
2771 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2772         char t[9+32+1] = "_BOOT_ID=";
2773         Object *o;
2774         uint64_t p;
2775         int r;
2776
2777         assert(f);
2778         assert(from || to);
2779
2780         sd_id128_to_string(boot_id, t + 9);
2781
2782         r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
2783         if (r <= 0)
2784                 return r;
2785
2786         if (le64toh(o->data.n_entries) <= 0)
2787                 return 0;
2788
2789         if (from) {
2790                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2791                 if (r < 0)
2792                         return r;
2793
2794                 *from = le64toh(o->entry.monotonic);
2795         }
2796
2797         if (to) {
2798                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2799                 if (r < 0)
2800                         return r;
2801
2802                 r = generic_array_get_plus_one(f,
2803                                                le64toh(o->data.entry_offset),
2804                                                le64toh(o->data.entry_array_offset),
2805                                                le64toh(o->data.n_entries)-1,
2806                                                &o, NULL);
2807                 if (r <= 0)
2808                         return r;
2809
2810                 *to = le64toh(o->entry.monotonic);
2811         }
2812
2813         return 1;
2814 }
2815
2816 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2817         assert(f);
2818
2819         /* If we gained new header fields we gained new features,
2820          * hence suggest a rotation */
2821         if (le64toh(f->header->header_size) < sizeof(Header)) {
2822                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2823                 return true;
2824         }
2825
2826         /* Let's check if the hash tables grew over a certain fill
2827          * level (75%, borrowing this value from Java's hash table
2828          * implementation), and if so suggest a rotation. To calculate
2829          * the fill level we need the n_data field, which only exists
2830          * in newer versions. */
2831
2832         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2833                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2834                         log_debug("Data hash table of %s has a fill level at %.1f (%llu of %llu items, %llu file size, %llu bytes per hash table item), suggesting rotation.",
2835                                   f->path,
2836                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2837                                   (unsigned long long) le64toh(f->header->n_data),
2838                                   (unsigned long long) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)),
2839                                   (unsigned long long) (f->last_stat.st_size),
2840                                   (unsigned long long) (f->last_stat.st_size / le64toh(f->header->n_data)));
2841                         return true;
2842                 }
2843
2844         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2845                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2846                         log_debug("Field hash table of %s has a fill level at %.1f (%llu of %llu items), suggesting rotation.",
2847                                   f->path,
2848                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2849                                   (unsigned long long) le64toh(f->header->n_fields),
2850                                   (unsigned long long) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)));
2851                         return true;
2852                 }
2853
2854         /* Are the data objects properly indexed by field objects? */
2855         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2856             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2857             le64toh(f->header->n_data) > 0 &&
2858             le64toh(f->header->n_fields) == 0)
2859                 return true;
2860
2861         if (max_file_usec > 0) {
2862                 usec_t t, h;
2863
2864                 h = le64toh(f->header->head_entry_realtime);
2865                 t = now(CLOCK_REALTIME);
2866
2867                 if (h > 0 && t > h + max_file_usec)
2868                         return true;
2869         }
2870
2871         return false;
2872 }