chiark / gitweb /
481c2423ccce31bfa00df55d742ae2feb7f43a02
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #ifdef HAVE_XATTR
31 #include <attr/xattr.h>
32 #endif
33
34 #include "journal-def.h"
35 #include "journal-file.h"
36 #include "journal-authenticate.h"
37 #include "lookup3.h"
38 #include "compress.h"
39 #include "fsprg.h"
40
41 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
42 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
43
44 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
45
46 /* This is the minimum journal file size */
47 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL)           /* 4 MiB */
48
49 /* These are the lower and upper bounds if we deduce the max_use value
50  * from the file system size */
51 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
52 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
53
54 /* This is the upper bound if we deduce max_size from max_use */
55 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
56
57 /* This is the upper bound if we deduce the keep_free value from the
58  * file system size */
59 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
60
61 /* This is the keep_free value when we can't determine the file
62  * system size */
63 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MB */
64
65 /* n_data was the first entry we added after the initial file format design */
66 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
67
68 /* How many entries to keep in the entry array chain cache at max */
69 #define CHAIN_CACHE_MAX 20
70
71 /* How much to increase the journal file size at once each time we allocate something new. */
72 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8MB */
73
74 static int journal_file_set_online(JournalFile *f) {
75         assert(f);
76
77         if (!f->writable)
78                 return -EPERM;
79
80         if (!(f->fd >= 0 && f->header))
81                 return -EINVAL;
82
83         switch(f->header->state) {
84                 case STATE_ONLINE:
85                         return 0;
86
87                 case STATE_OFFLINE:
88                         f->header->state = STATE_ONLINE;
89                         fsync(f->fd);
90                         return 0;
91
92                 default:
93                         return -EINVAL;
94         }
95 }
96
97 int journal_file_set_offline(JournalFile *f) {
98         assert(f);
99
100         if (!f->writable)
101                 return -EPERM;
102
103         if (!(f->fd >= 0 && f->header))
104                 return -EINVAL;
105
106         if (f->header->state != STATE_ONLINE)
107                 return 0;
108
109         fsync(f->fd);
110
111         f->header->state = STATE_OFFLINE;
112
113         fsync(f->fd);
114
115         return 0;
116 }
117
118 void journal_file_close(JournalFile *f) {
119         assert(f);
120
121 #ifdef HAVE_GCRYPT
122         /* Write the final tag */
123         if (f->seal && f->writable)
124                 journal_file_append_tag(f);
125 #endif
126
127         /* Sync everything to disk, before we mark the file offline */
128         if (f->mmap && f->fd >= 0)
129                 mmap_cache_close_fd(f->mmap, f->fd);
130
131         journal_file_set_offline(f);
132
133         if (f->header)
134                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
135
136         if (f->fd >= 0)
137                 close_nointr_nofail(f->fd);
138
139         free(f->path);
140
141         if (f->mmap)
142                 mmap_cache_unref(f->mmap);
143
144         hashmap_free_free(f->chain_cache);
145
146 #ifdef HAVE_XZ
147         free(f->compress_buffer);
148 #endif
149
150 #ifdef HAVE_GCRYPT
151         if (f->fss_file)
152                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
153         else if (f->fsprg_state)
154                 free(f->fsprg_state);
155
156         free(f->fsprg_seed);
157
158         if (f->hmac)
159                 gcry_md_close(f->hmac);
160 #endif
161
162         free(f);
163 }
164
165 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
166         Header h;
167         ssize_t k;
168         int r;
169
170         assert(f);
171
172         zero(h);
173         memcpy(h.signature, HEADER_SIGNATURE, 8);
174         h.header_size = htole64(ALIGN64(sizeof(h)));
175
176         h.incompatible_flags =
177                 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
178
179         h.compatible_flags =
180                 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
181
182         r = sd_id128_randomize(&h.file_id);
183         if (r < 0)
184                 return r;
185
186         if (template) {
187                 h.seqnum_id = template->header->seqnum_id;
188                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
189         } else
190                 h.seqnum_id = h.file_id;
191
192         k = pwrite(f->fd, &h, sizeof(h), 0);
193         if (k < 0)
194                 return -errno;
195
196         if (k != sizeof(h))
197                 return -EIO;
198
199         return 0;
200 }
201
202 static int journal_file_refresh_header(JournalFile *f) {
203         int r;
204         sd_id128_t boot_id;
205
206         assert(f);
207
208         r = sd_id128_get_machine(&f->header->machine_id);
209         if (r < 0)
210                 return r;
211
212         r = sd_id128_get_boot(&boot_id);
213         if (r < 0)
214                 return r;
215
216         if (sd_id128_equal(boot_id, f->header->boot_id))
217                 f->tail_entry_monotonic_valid = true;
218
219         f->header->boot_id = boot_id;
220
221         journal_file_set_online(f);
222
223         /* Sync the online state to disk */
224         fsync(f->fd);
225
226         return 0;
227 }
228
229 static int journal_file_verify_header(JournalFile *f) {
230         assert(f);
231
232         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
233                 return -EBADMSG;
234
235         /* In both read and write mode we refuse to open files with
236          * incompatible flags we don't know */
237 #ifdef HAVE_XZ
238         if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
239                 return -EPROTONOSUPPORT;
240 #else
241         if (f->header->incompatible_flags != 0)
242                 return -EPROTONOSUPPORT;
243 #endif
244
245         /* When open for writing we refuse to open files with
246          * compatible flags, too */
247         if (f->writable) {
248 #ifdef HAVE_GCRYPT
249                 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
250                         return -EPROTONOSUPPORT;
251 #else
252                 if (f->header->compatible_flags != 0)
253                         return -EPROTONOSUPPORT;
254 #endif
255         }
256
257         if (f->header->state >= _STATE_MAX)
258                 return -EBADMSG;
259
260         /* The first addition was n_data, so check that we are at least this large */
261         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
262                 return -EBADMSG;
263
264         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
265                 return -EBADMSG;
266
267         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
268                 return -ENODATA;
269
270         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
271                 return -ENODATA;
272
273         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
274             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
275             !VALID64(le64toh(f->header->tail_object_offset)) ||
276             !VALID64(le64toh(f->header->entry_array_offset)))
277                 return -ENODATA;
278
279         if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
280             le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
281             le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
282             le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
283                 return -ENODATA;
284
285         if (f->writable) {
286                 uint8_t state;
287                 sd_id128_t machine_id;
288                 int r;
289
290                 r = sd_id128_get_machine(&machine_id);
291                 if (r < 0)
292                         return r;
293
294                 if (!sd_id128_equal(machine_id, f->header->machine_id))
295                         return -EHOSTDOWN;
296
297                 state = f->header->state;
298
299                 if (state == STATE_ONLINE) {
300                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
301                         return -EBUSY;
302                 } else if (state == STATE_ARCHIVED)
303                         return -ESHUTDOWN;
304                 else if (state != STATE_OFFLINE) {
305                         log_debug("Journal file %s has unknown state %u.", f->path, state);
306                         return -EBUSY;
307                 }
308         }
309
310         f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
311
312         f->seal = JOURNAL_HEADER_SEALED(f->header);
313
314         return 0;
315 }
316
317 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
318         uint64_t old_size, new_size, file_size;
319         int r;
320
321         assert(f);
322
323         /* We assume that this file is not sparse, and we know that
324          * for sure, since we always call posix_fallocate()
325          * ourselves */
326
327         old_size =
328                 le64toh(f->header->header_size) +
329                 le64toh(f->header->arena_size);
330
331         new_size = PAGE_ALIGN(offset + size);
332         if (new_size < le64toh(f->header->header_size))
333                 new_size = le64toh(f->header->header_size);
334
335         if (new_size <= old_size)
336                 return 0;
337
338         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
339                 return -E2BIG;
340
341         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
342                 struct statvfs svfs;
343
344                 if (fstatvfs(f->fd, &svfs) >= 0) {
345                         uint64_t available;
346
347                         available = svfs.f_bfree * svfs.f_bsize;
348
349                         if (available >= f->metrics.keep_free)
350                                 available -= f->metrics.keep_free;
351                         else
352                                 available = 0;
353
354                         if (new_size - old_size > available)
355                                 return -E2BIG;
356                 }
357         }
358
359         /* Note that the glibc fallocate() fallback is very
360            inefficient, hence we try to minimize the allocation area
361            as we can. */
362         r = posix_fallocate(f->fd, old_size, new_size - old_size);
363         if (r != 0)
364                 return -r;
365
366         /* Increase the file size a bit further than this, so that we
367          * we can create larger memory maps to cache */
368         file_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
369         if (file_size > (uint64_t) f->last_stat.st_size) {
370                 if (file_size > new_size)
371                         ftruncate(f->fd, file_size);
372
373                 if (fstat(f->fd, &f->last_stat) < 0)
374                         return -errno;
375         }
376
377         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
378
379         return 0;
380 }
381
382 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
383         assert(f);
384         assert(ret);
385
386         if (size <= 0)
387                 return -EINVAL;
388
389         /* Avoid SIGBUS on invalid accesses */
390         if (offset + size > (uint64_t) f->last_stat.st_size) {
391                 /* Hmm, out of range? Let's refresh the fstat() data
392                  * first, before we trust that check. */
393
394                 if (fstat(f->fd, &f->last_stat) < 0 ||
395                     offset + size > (uint64_t) f->last_stat.st_size)
396                         return -EADDRNOTAVAIL;
397         }
398
399         return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
400 }
401
402 static uint64_t minimum_header_size(Object *o) {
403
404         static const uint64_t table[] = {
405                 [OBJECT_DATA] = sizeof(DataObject),
406                 [OBJECT_FIELD] = sizeof(FieldObject),
407                 [OBJECT_ENTRY] = sizeof(EntryObject),
408                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
409                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
410                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
411                 [OBJECT_TAG] = sizeof(TagObject),
412         };
413
414         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
415                 return sizeof(ObjectHeader);
416
417         return table[o->object.type];
418 }
419
420 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
421         int r;
422         void *t;
423         Object *o;
424         uint64_t s;
425         unsigned context;
426
427         assert(f);
428         assert(ret);
429
430         /* Objects may only be located at multiple of 64 bit */
431         if (!VALID64(offset))
432                 return -EFAULT;
433
434         /* One context for each type, plus one catch-all for the rest */
435         context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
436
437         r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
438         if (r < 0)
439                 return r;
440
441         o = (Object*) t;
442         s = le64toh(o->object.size);
443
444         if (s < sizeof(ObjectHeader))
445                 return -EBADMSG;
446
447         if (o->object.type <= OBJECT_UNUSED)
448                 return -EBADMSG;
449
450         if (s < minimum_header_size(o))
451                 return -EBADMSG;
452
453         if (type > 0 && o->object.type != type)
454                 return -EBADMSG;
455
456         if (s > sizeof(ObjectHeader)) {
457                 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
458                 if (r < 0)
459                         return r;
460
461                 o = (Object*) t;
462         }
463
464         *ret = o;
465         return 0;
466 }
467
468 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
469         uint64_t r;
470
471         assert(f);
472
473         r = le64toh(f->header->tail_entry_seqnum) + 1;
474
475         if (seqnum) {
476                 /* If an external seqnum counter was passed, we update
477                  * both the local and the external one, and set it to
478                  * the maximum of both */
479
480                 if (*seqnum + 1 > r)
481                         r = *seqnum + 1;
482
483                 *seqnum = r;
484         }
485
486         f->header->tail_entry_seqnum = htole64(r);
487
488         if (f->header->head_entry_seqnum == 0)
489                 f->header->head_entry_seqnum = htole64(r);
490
491         return r;
492 }
493
494 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
495         int r;
496         uint64_t p;
497         Object *tail, *o;
498         void *t;
499
500         assert(f);
501         assert(type > 0 && type < _OBJECT_TYPE_MAX);
502         assert(size >= sizeof(ObjectHeader));
503         assert(offset);
504         assert(ret);
505
506         r = journal_file_set_online(f);
507         if (r < 0)
508                 return r;
509
510         p = le64toh(f->header->tail_object_offset);
511         if (p == 0)
512                 p = le64toh(f->header->header_size);
513         else {
514                 r = journal_file_move_to_object(f, -1, p, &tail);
515                 if (r < 0)
516                         return r;
517
518                 p += ALIGN64(le64toh(tail->object.size));
519         }
520
521         r = journal_file_allocate(f, p, size);
522         if (r < 0)
523                 return r;
524
525         r = journal_file_move_to(f, type, false, p, size, &t);
526         if (r < 0)
527                 return r;
528
529         o = (Object*) t;
530
531         zero(o->object);
532         o->object.type = type;
533         o->object.size = htole64(size);
534
535         f->header->tail_object_offset = htole64(p);
536         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
537
538         *ret = o;
539         *offset = p;
540
541         return 0;
542 }
543
544 static int journal_file_setup_data_hash_table(JournalFile *f) {
545         uint64_t s, p;
546         Object *o;
547         int r;
548
549         assert(f);
550
551         /* We estimate that we need 1 hash table entry per 768 of
552            journal file and we want to make sure we never get beyond
553            75% fill level. Calculate the hash table size for the
554            maximum file size based on these metrics. */
555
556         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
557         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
558                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
559
560         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
561
562         r = journal_file_append_object(f,
563                                        OBJECT_DATA_HASH_TABLE,
564                                        offsetof(Object, hash_table.items) + s,
565                                        &o, &p);
566         if (r < 0)
567                 return r;
568
569         memset(o->hash_table.items, 0, s);
570
571         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
572         f->header->data_hash_table_size = htole64(s);
573
574         return 0;
575 }
576
577 static int journal_file_setup_field_hash_table(JournalFile *f) {
578         uint64_t s, p;
579         Object *o;
580         int r;
581
582         assert(f);
583
584         /* We use a fixed size hash table for the fields as this
585          * number should grow very slowly only */
586
587         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
588         r = journal_file_append_object(f,
589                                        OBJECT_FIELD_HASH_TABLE,
590                                        offsetof(Object, hash_table.items) + s,
591                                        &o, &p);
592         if (r < 0)
593                 return r;
594
595         memset(o->hash_table.items, 0, s);
596
597         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
598         f->header->field_hash_table_size = htole64(s);
599
600         return 0;
601 }
602
603 static int journal_file_map_data_hash_table(JournalFile *f) {
604         uint64_t s, p;
605         void *t;
606         int r;
607
608         assert(f);
609
610         p = le64toh(f->header->data_hash_table_offset);
611         s = le64toh(f->header->data_hash_table_size);
612
613         r = journal_file_move_to(f,
614                                  OBJECT_DATA_HASH_TABLE,
615                                  true,
616                                  p, s,
617                                  &t);
618         if (r < 0)
619                 return r;
620
621         f->data_hash_table = t;
622         return 0;
623 }
624
625 static int journal_file_map_field_hash_table(JournalFile *f) {
626         uint64_t s, p;
627         void *t;
628         int r;
629
630         assert(f);
631
632         p = le64toh(f->header->field_hash_table_offset);
633         s = le64toh(f->header->field_hash_table_size);
634
635         r = journal_file_move_to(f,
636                                  OBJECT_FIELD_HASH_TABLE,
637                                  true,
638                                  p, s,
639                                  &t);
640         if (r < 0)
641                 return r;
642
643         f->field_hash_table = t;
644         return 0;
645 }
646
647 static int journal_file_link_field(
648                 JournalFile *f,
649                 Object *o,
650                 uint64_t offset,
651                 uint64_t hash) {
652
653         uint64_t p, h;
654         int r;
655
656         assert(f);
657         assert(o);
658         assert(offset > 0);
659
660         if (o->object.type != OBJECT_FIELD)
661                 return -EINVAL;
662
663         /* This might alter the window we are looking at */
664
665         o->field.next_hash_offset = o->field.head_data_offset = 0;
666
667         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
668         p = le64toh(f->field_hash_table[h].tail_hash_offset);
669         if (p == 0)
670                 f->field_hash_table[h].head_hash_offset = htole64(offset);
671         else {
672                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
673                 if (r < 0)
674                         return r;
675
676                 o->field.next_hash_offset = htole64(offset);
677         }
678
679         f->field_hash_table[h].tail_hash_offset = htole64(offset);
680
681         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
682                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
683
684         return 0;
685 }
686
687 static int journal_file_link_data(
688                 JournalFile *f,
689                 Object *o,
690                 uint64_t offset,
691                 uint64_t hash) {
692
693         uint64_t p, h;
694         int r;
695
696         assert(f);
697         assert(o);
698         assert(offset > 0);
699
700         if (o->object.type != OBJECT_DATA)
701                 return -EINVAL;
702
703         /* This might alter the window we are looking at */
704
705         o->data.next_hash_offset = o->data.next_field_offset = 0;
706         o->data.entry_offset = o->data.entry_array_offset = 0;
707         o->data.n_entries = 0;
708
709         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
710         p = le64toh(f->data_hash_table[h].tail_hash_offset);
711         if (p == 0)
712                 /* Only entry in the hash table is easy */
713                 f->data_hash_table[h].head_hash_offset = htole64(offset);
714         else {
715                 /* Move back to the previous data object, to patch in
716                  * pointer */
717
718                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
719                 if (r < 0)
720                         return r;
721
722                 o->data.next_hash_offset = htole64(offset);
723         }
724
725         f->data_hash_table[h].tail_hash_offset = htole64(offset);
726
727         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
728                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
729
730         return 0;
731 }
732
733 int journal_file_find_field_object_with_hash(
734                 JournalFile *f,
735                 const void *field, uint64_t size, uint64_t hash,
736                 Object **ret, uint64_t *offset) {
737
738         uint64_t p, osize, h;
739         int r;
740
741         assert(f);
742         assert(field && size > 0);
743
744         osize = offsetof(Object, field.payload) + size;
745
746         if (f->header->field_hash_table_size == 0)
747                 return -EBADMSG;
748
749         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
750         p = le64toh(f->field_hash_table[h].head_hash_offset);
751
752         while (p > 0) {
753                 Object *o;
754
755                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
756                 if (r < 0)
757                         return r;
758
759                 if (le64toh(o->field.hash) == hash &&
760                     le64toh(o->object.size) == osize &&
761                     memcmp(o->field.payload, field, size) == 0) {
762
763                         if (ret)
764                                 *ret = o;
765                         if (offset)
766                                 *offset = p;
767
768                         return 1;
769                 }
770
771                 p = le64toh(o->field.next_hash_offset);
772         }
773
774         return 0;
775 }
776
777 int journal_file_find_field_object(
778                 JournalFile *f,
779                 const void *field, uint64_t size,
780                 Object **ret, uint64_t *offset) {
781
782         uint64_t hash;
783
784         assert(f);
785         assert(field && size > 0);
786
787         hash = hash64(field, size);
788
789         return journal_file_find_field_object_with_hash(f,
790                                                         field, size, hash,
791                                                         ret, offset);
792 }
793
794 int journal_file_find_data_object_with_hash(
795                 JournalFile *f,
796                 const void *data, uint64_t size, uint64_t hash,
797                 Object **ret, uint64_t *offset) {
798
799         uint64_t p, osize, h;
800         int r;
801
802         assert(f);
803         assert(data || size == 0);
804
805         osize = offsetof(Object, data.payload) + size;
806
807         if (f->header->data_hash_table_size == 0)
808                 return -EBADMSG;
809
810         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
811         p = le64toh(f->data_hash_table[h].head_hash_offset);
812
813         while (p > 0) {
814                 Object *o;
815
816                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
817                 if (r < 0)
818                         return r;
819
820                 if (le64toh(o->data.hash) != hash)
821                         goto next;
822
823                 if (o->object.flags & OBJECT_COMPRESSED) {
824 #ifdef HAVE_XZ
825                         uint64_t l, rsize;
826
827                         l = le64toh(o->object.size);
828                         if (l <= offsetof(Object, data.payload))
829                                 return -EBADMSG;
830
831                         l -= offsetof(Object, data.payload);
832
833                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0))
834                                 return -EBADMSG;
835
836                         if (rsize == size &&
837                             memcmp(f->compress_buffer, data, size) == 0) {
838
839                                 if (ret)
840                                         *ret = o;
841
842                                 if (offset)
843                                         *offset = p;
844
845                                 return 1;
846                         }
847 #else
848                         return -EPROTONOSUPPORT;
849 #endif
850
851                 } else if (le64toh(o->object.size) == osize &&
852                            memcmp(o->data.payload, data, size) == 0) {
853
854                         if (ret)
855                                 *ret = o;
856
857                         if (offset)
858                                 *offset = p;
859
860                         return 1;
861                 }
862
863         next:
864                 p = le64toh(o->data.next_hash_offset);
865         }
866
867         return 0;
868 }
869
870 int journal_file_find_data_object(
871                 JournalFile *f,
872                 const void *data, uint64_t size,
873                 Object **ret, uint64_t *offset) {
874
875         uint64_t hash;
876
877         assert(f);
878         assert(data || size == 0);
879
880         hash = hash64(data, size);
881
882         return journal_file_find_data_object_with_hash(f,
883                                                        data, size, hash,
884                                                        ret, offset);
885 }
886
887 static int journal_file_append_field(
888                 JournalFile *f,
889                 const void *field, uint64_t size,
890                 Object **ret, uint64_t *offset) {
891
892         uint64_t hash, p;
893         uint64_t osize;
894         Object *o;
895         int r;
896
897         assert(f);
898         assert(field && size > 0);
899
900         hash = hash64(field, size);
901
902         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
903         if (r < 0)
904                 return r;
905         else if (r > 0) {
906
907                 if (ret)
908                         *ret = o;
909
910                 if (offset)
911                         *offset = p;
912
913                 return 0;
914         }
915
916         osize = offsetof(Object, field.payload) + size;
917         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
918         if (r < 0)
919                 return r;
920
921         o->field.hash = htole64(hash);
922         memcpy(o->field.payload, field, size);
923
924         r = journal_file_link_field(f, o, p, hash);
925         if (r < 0)
926                 return r;
927
928         /* The linking might have altered the window, so let's
929          * refresh our pointer */
930         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
931         if (r < 0)
932                 return r;
933
934 #ifdef HAVE_GCRYPT
935         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
936         if (r < 0)
937                 return r;
938 #endif
939
940         if (ret)
941                 *ret = o;
942
943         if (offset)
944                 *offset = p;
945
946         return 0;
947 }
948
949 static int journal_file_append_data(
950                 JournalFile *f,
951                 const void *data, uint64_t size,
952                 Object **ret, uint64_t *offset) {
953
954         uint64_t hash, p;
955         uint64_t osize;
956         Object *o;
957         int r;
958         bool compressed = false;
959         const void *eq;
960
961         assert(f);
962         assert(data || size == 0);
963
964         hash = hash64(data, size);
965
966         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
967         if (r < 0)
968                 return r;
969         else if (r > 0) {
970
971                 if (ret)
972                         *ret = o;
973
974                 if (offset)
975                         *offset = p;
976
977                 return 0;
978         }
979
980         osize = offsetof(Object, data.payload) + size;
981         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
982         if (r < 0)
983                 return r;
984
985         o->data.hash = htole64(hash);
986
987 #ifdef HAVE_XZ
988         if (f->compress &&
989             size >= COMPRESSION_SIZE_THRESHOLD) {
990                 uint64_t rsize;
991
992                 compressed = compress_blob(data, size, o->data.payload, &rsize);
993
994                 if (compressed) {
995                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
996                         o->object.flags |= OBJECT_COMPRESSED;
997
998                         log_debug("Compressed data object %"PRIu64" -> %"PRIu64, size, rsize);
999                 }
1000         }
1001 #endif
1002
1003         if (!compressed && size > 0)
1004                 memcpy(o->data.payload, data, size);
1005
1006         r = journal_file_link_data(f, o, p, hash);
1007         if (r < 0)
1008                 return r;
1009
1010         /* The linking might have altered the window, so let's
1011          * refresh our pointer */
1012         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1013         if (r < 0)
1014                 return r;
1015
1016         eq = memchr(data, '=', size);
1017         if (eq && eq > data) {
1018                 uint64_t fp;
1019                 Object *fo;
1020
1021                 /* Create field object ... */
1022                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1023                 if (r < 0)
1024                         return r;
1025
1026                 /* ... and link it in. */
1027                 o->data.next_field_offset = fo->field.head_data_offset;
1028                 fo->field.head_data_offset = le64toh(p);
1029         }
1030
1031 #ifdef HAVE_GCRYPT
1032         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1033         if (r < 0)
1034                 return r;
1035 #endif
1036
1037         if (ret)
1038                 *ret = o;
1039
1040         if (offset)
1041                 *offset = p;
1042
1043         return 0;
1044 }
1045
1046 uint64_t journal_file_entry_n_items(Object *o) {
1047         assert(o);
1048
1049         if (o->object.type != OBJECT_ENTRY)
1050                 return 0;
1051
1052         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1053 }
1054
1055 uint64_t journal_file_entry_array_n_items(Object *o) {
1056         assert(o);
1057
1058         if (o->object.type != OBJECT_ENTRY_ARRAY)
1059                 return 0;
1060
1061         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1062 }
1063
1064 uint64_t journal_file_hash_table_n_items(Object *o) {
1065         assert(o);
1066
1067         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1068             o->object.type != OBJECT_FIELD_HASH_TABLE)
1069                 return 0;
1070
1071         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1072 }
1073
1074 static int link_entry_into_array(JournalFile *f,
1075                                  le64_t *first,
1076                                  le64_t *idx,
1077                                  uint64_t p) {
1078         int r;
1079         uint64_t n = 0, ap = 0, q, i, a, hidx;
1080         Object *o;
1081
1082         assert(f);
1083         assert(first);
1084         assert(idx);
1085         assert(p > 0);
1086
1087         a = le64toh(*first);
1088         i = hidx = le64toh(*idx);
1089         while (a > 0) {
1090
1091                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1092                 if (r < 0)
1093                         return r;
1094
1095                 n = journal_file_entry_array_n_items(o);
1096                 if (i < n) {
1097                         o->entry_array.items[i] = htole64(p);
1098                         *idx = htole64(hidx + 1);
1099                         return 0;
1100                 }
1101
1102                 i -= n;
1103                 ap = a;
1104                 a = le64toh(o->entry_array.next_entry_array_offset);
1105         }
1106
1107         if (hidx > n)
1108                 n = (hidx+1) * 2;
1109         else
1110                 n = n * 2;
1111
1112         if (n < 4)
1113                 n = 4;
1114
1115         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1116                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1117                                        &o, &q);
1118         if (r < 0)
1119                 return r;
1120
1121 #ifdef HAVE_GCRYPT
1122         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1123         if (r < 0)
1124                 return r;
1125 #endif
1126
1127         o->entry_array.items[i] = htole64(p);
1128
1129         if (ap == 0)
1130                 *first = htole64(q);
1131         else {
1132                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1133                 if (r < 0)
1134                         return r;
1135
1136                 o->entry_array.next_entry_array_offset = htole64(q);
1137         }
1138
1139         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1140                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1141
1142         *idx = htole64(hidx + 1);
1143
1144         return 0;
1145 }
1146
1147 static int link_entry_into_array_plus_one(JournalFile *f,
1148                                           le64_t *extra,
1149                                           le64_t *first,
1150                                           le64_t *idx,
1151                                           uint64_t p) {
1152
1153         int r;
1154
1155         assert(f);
1156         assert(extra);
1157         assert(first);
1158         assert(idx);
1159         assert(p > 0);
1160
1161         if (*idx == 0)
1162                 *extra = htole64(p);
1163         else {
1164                 le64_t i;
1165
1166                 i = htole64(le64toh(*idx) - 1);
1167                 r = link_entry_into_array(f, first, &i, p);
1168                 if (r < 0)
1169                         return r;
1170         }
1171
1172         *idx = htole64(le64toh(*idx) + 1);
1173         return 0;
1174 }
1175
1176 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1177         uint64_t p;
1178         int r;
1179         assert(f);
1180         assert(o);
1181         assert(offset > 0);
1182
1183         p = le64toh(o->entry.items[i].object_offset);
1184         if (p == 0)
1185                 return -EINVAL;
1186
1187         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1188         if (r < 0)
1189                 return r;
1190
1191         return link_entry_into_array_plus_one(f,
1192                                               &o->data.entry_offset,
1193                                               &o->data.entry_array_offset,
1194                                               &o->data.n_entries,
1195                                               offset);
1196 }
1197
1198 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1199         uint64_t n, i;
1200         int r;
1201
1202         assert(f);
1203         assert(o);
1204         assert(offset > 0);
1205
1206         if (o->object.type != OBJECT_ENTRY)
1207                 return -EINVAL;
1208
1209         __sync_synchronize();
1210
1211         /* Link up the entry itself */
1212         r = link_entry_into_array(f,
1213                                   &f->header->entry_array_offset,
1214                                   &f->header->n_entries,
1215                                   offset);
1216         if (r < 0)
1217                 return r;
1218
1219         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1220
1221         if (f->header->head_entry_realtime == 0)
1222                 f->header->head_entry_realtime = o->entry.realtime;
1223
1224         f->header->tail_entry_realtime = o->entry.realtime;
1225         f->header->tail_entry_monotonic = o->entry.monotonic;
1226
1227         f->tail_entry_monotonic_valid = true;
1228
1229         /* Link up the items */
1230         n = journal_file_entry_n_items(o);
1231         for (i = 0; i < n; i++) {
1232                 r = journal_file_link_entry_item(f, o, offset, i);
1233                 if (r < 0)
1234                         return r;
1235         }
1236
1237         return 0;
1238 }
1239
1240 static int journal_file_append_entry_internal(
1241                 JournalFile *f,
1242                 const dual_timestamp *ts,
1243                 uint64_t xor_hash,
1244                 const EntryItem items[], unsigned n_items,
1245                 uint64_t *seqnum,
1246                 Object **ret, uint64_t *offset) {
1247         uint64_t np;
1248         uint64_t osize;
1249         Object *o;
1250         int r;
1251
1252         assert(f);
1253         assert(items || n_items == 0);
1254         assert(ts);
1255
1256         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1257
1258         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1259         if (r < 0)
1260                 return r;
1261
1262         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1263         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1264         o->entry.realtime = htole64(ts->realtime);
1265         o->entry.monotonic = htole64(ts->monotonic);
1266         o->entry.xor_hash = htole64(xor_hash);
1267         o->entry.boot_id = f->header->boot_id;
1268
1269 #ifdef HAVE_GCRYPT
1270         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1271         if (r < 0)
1272                 return r;
1273 #endif
1274
1275         r = journal_file_link_entry(f, o, np);
1276         if (r < 0)
1277                 return r;
1278
1279         if (ret)
1280                 *ret = o;
1281
1282         if (offset)
1283                 *offset = np;
1284
1285         return 0;
1286 }
1287
1288 void journal_file_post_change(JournalFile *f) {
1289         assert(f);
1290
1291         /* inotify() does not receive IN_MODIFY events from file
1292          * accesses done via mmap(). After each access we hence
1293          * trigger IN_MODIFY by truncating the journal file to its
1294          * current size which triggers IN_MODIFY. */
1295
1296         __sync_synchronize();
1297
1298         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1299                 log_error("Failed to truncate file to its own size: %m");
1300 }
1301
1302 static int entry_item_cmp(const void *_a, const void *_b) {
1303         const EntryItem *a = _a, *b = _b;
1304
1305         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1306                 return -1;
1307         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1308                 return 1;
1309         return 0;
1310 }
1311
1312 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1313         unsigned i;
1314         EntryItem *items;
1315         int r;
1316         uint64_t xor_hash = 0;
1317         struct dual_timestamp _ts;
1318
1319         assert(f);
1320         assert(iovec || n_iovec == 0);
1321
1322         if (!ts) {
1323                 dual_timestamp_get(&_ts);
1324                 ts = &_ts;
1325         }
1326
1327         if (f->tail_entry_monotonic_valid &&
1328             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1329                 return -EINVAL;
1330
1331 #ifdef HAVE_GCRYPT
1332         r = journal_file_maybe_append_tag(f, ts->realtime);
1333         if (r < 0)
1334                 return r;
1335 #endif
1336
1337         /* alloca() can't take 0, hence let's allocate at least one */
1338         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1339
1340         for (i = 0; i < n_iovec; i++) {
1341                 uint64_t p;
1342                 Object *o;
1343
1344                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1345                 if (r < 0)
1346                         return r;
1347
1348                 xor_hash ^= le64toh(o->data.hash);
1349                 items[i].object_offset = htole64(p);
1350                 items[i].hash = o->data.hash;
1351         }
1352
1353         /* Order by the position on disk, in order to improve seek
1354          * times for rotating media. */
1355         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1356
1357         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1358
1359         journal_file_post_change(f);
1360
1361         return r;
1362 }
1363
/* Remembers a position inside a chain of entry array objects, so that
 * later lookups on the same chain can skip ahead instead of walking it
 * from the start. Items are kept in a hashmap keyed by the 'first'
 * member (see chain_cache_put()). */
typedef struct ChainCacheItem {
        uint64_t first; /* offset of the array at the beginning of the chain; also serves as hashmap key */
        uint64_t array; /* offset of the cached array */
        uint64_t begin; /* the first item in the cached array */
        uint64_t total; /* the total number of items in all arrays before this one in the chain */
} ChainCacheItem;
1370
1371 static void chain_cache_put(
1372                 Hashmap *h,
1373                 ChainCacheItem *ci,
1374                 uint64_t first,
1375                 uint64_t array,
1376                 uint64_t begin,
1377                 uint64_t total) {
1378
1379         if (!ci) {
1380                 /* If the chain item to cache for this chain is the
1381                  * first one it's not worth caching anything */
1382                 if (array == first)
1383                         return;
1384
1385                 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1386                         ci = hashmap_steal_first(h);
1387                 else {
1388                         ci = new(ChainCacheItem, 1);
1389                         if (!ci)
1390                                 return;
1391                 }
1392
1393                 ci->first = first;
1394
1395                 if (hashmap_put(h, &ci->first, ci) < 0) {
1396                         free(ci);
1397                         return;
1398                 }
1399         } else
1400                 assert(ci->first == first);
1401
1402         ci->array = array;
1403         ci->begin = begin;
1404         ci->total = total;
1405 }
1406
1407 static int generic_array_get(JournalFile *f,
1408                              uint64_t first,
1409                              uint64_t i,
1410                              Object **ret, uint64_t *offset) {
1411
1412         Object *o;
1413         uint64_t p = 0, a, t = 0;
1414         int r;
1415         ChainCacheItem *ci;
1416
1417         assert(f);
1418
1419         a = first;
1420
1421         /* Try the chain cache first */
1422         ci = hashmap_get(f->chain_cache, &first);
1423         if (ci && i > ci->total) {
1424                 a = ci->array;
1425                 i -= ci->total;
1426                 t = ci->total;
1427         }
1428
1429         while (a > 0) {
1430                 uint64_t k;
1431
1432                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1433                 if (r < 0)
1434                         return r;
1435
1436                 k = journal_file_entry_array_n_items(o);
1437                 if (i < k) {
1438                         p = le64toh(o->entry_array.items[i]);
1439                         goto found;
1440                 }
1441
1442                 i -= k;
1443                 t += k;
1444                 a = le64toh(o->entry_array.next_entry_array_offset);
1445         }
1446
1447         return 0;
1448
1449 found:
1450         /* Let's cache this item for the next invocation */
1451         chain_cache_put(f->chain_cache, ci, first, a, o->entry_array.items[0], t);
1452
1453         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1454         if (r < 0)
1455                 return r;
1456
1457         if (ret)
1458                 *ret = o;
1459
1460         if (offset)
1461                 *offset = p;
1462
1463         return 1;
1464 }
1465
1466 static int generic_array_get_plus_one(JournalFile *f,
1467                                       uint64_t extra,
1468                                       uint64_t first,
1469                                       uint64_t i,
1470                                       Object **ret, uint64_t *offset) {
1471
1472         Object *o;
1473
1474         assert(f);
1475
1476         if (i == 0) {
1477                 int r;
1478
1479                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1480                 if (r < 0)
1481                         return r;
1482
1483                 if (ret)
1484                         *ret = o;
1485
1486                 if (offset)
1487                         *offset = extra;
1488
1489                 return 1;
1490         }
1491
1492         return generic_array_get(f, first, i-1, ret, offset);
1493 }
1494
/* Verdicts returned by the test_object callbacks used for bisection:
 * TEST_FOUND if the tested entry matches the needle, TEST_LEFT if its
 * value is smaller than the needle, TEST_RIGHT if it is larger. */
enum {
        TEST_FOUND,
        TEST_LEFT,
        TEST_RIGHT
};
1500
1501 static int generic_array_bisect(JournalFile *f,
1502                                 uint64_t first,
1503                                 uint64_t n,
1504                                 uint64_t needle,
1505                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1506                                 direction_t direction,
1507                                 Object **ret,
1508                                 uint64_t *offset,
1509                                 uint64_t *idx) {
1510
1511         uint64_t a, p, t = 0, i = 0, last_p = 0;
1512         bool subtract_one = false;
1513         Object *o, *array = NULL;
1514         int r;
1515         ChainCacheItem *ci;
1516
1517         assert(f);
1518         assert(test_object);
1519
1520         /* Start with the first array in the chain */
1521         a = first;
1522
1523         ci = hashmap_get(f->chain_cache, &first);
1524         if (ci && n > ci->total) {
1525                 /* Ah, we have iterated this bisection array chain
1526                  * previously! Let's see if we can skip ahead in the
1527                  * chain, as far as the last time. But we can't jump
1528                  * backwards in the chain, so let's check that
1529                  * first. */
1530
1531                 r = test_object(f, ci->begin, needle);
1532                 if (r < 0)
1533                         return r;
1534
1535                 if (r == TEST_LEFT) {
1536                         /* OK, what we are looking for is right of th
1537                          * begin of this EntryArray, so let's jump
1538                          * straight to previously cached array in the
1539                          * chain */
1540
1541                         a = ci->array;
1542                         n -= ci->total;
1543                         t = ci->total;
1544                 }
1545         }
1546
1547         while (a > 0) {
1548                 uint64_t left, right, k, lp;
1549
1550                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1551                 if (r < 0)
1552                         return r;
1553
1554                 k = journal_file_entry_array_n_items(array);
1555                 right = MIN(k, n);
1556                 if (right <= 0)
1557                         return 0;
1558
1559                 i = right - 1;
1560                 lp = p = le64toh(array->entry_array.items[i]);
1561                 if (p <= 0)
1562                         return -EBADMSG;
1563
1564                 r = test_object(f, p, needle);
1565                 if (r < 0)
1566                         return r;
1567
1568                 if (r == TEST_FOUND)
1569                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1570
1571                 if (r == TEST_RIGHT) {
1572                         left = 0;
1573                         right -= 1;
1574                         for (;;) {
1575                                 if (left == right) {
1576                                         if (direction == DIRECTION_UP)
1577                                                 subtract_one = true;
1578
1579                                         i = left;
1580                                         goto found;
1581                                 }
1582
1583                                 assert(left < right);
1584
1585                                 i = (left + right) / 2;
1586                                 p = le64toh(array->entry_array.items[i]);
1587                                 if (p <= 0)
1588                                         return -EBADMSG;
1589
1590                                 r = test_object(f, p, needle);
1591                                 if (r < 0)
1592                                         return r;
1593
1594                                 if (r == TEST_FOUND)
1595                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1596
1597                                 if (r == TEST_RIGHT)
1598                                         right = i;
1599                                 else
1600                                         left = i + 1;
1601                         }
1602                 }
1603
1604                 if (k > n) {
1605                         if (direction == DIRECTION_UP) {
1606                                 i = n;
1607                                 subtract_one = true;
1608                                 goto found;
1609                         }
1610
1611                         return 0;
1612                 }
1613
1614                 last_p = lp;
1615
1616                 n -= k;
1617                 t += k;
1618                 a = le64toh(array->entry_array.next_entry_array_offset);
1619         }
1620
1621         return 0;
1622
1623 found:
1624         if (subtract_one && t == 0 && i == 0)
1625                 return 0;
1626
1627         /* Let's cache this item for the next invocation */
1628         chain_cache_put(f->chain_cache, ci, first, a, array->entry_array.items[0], t);
1629
1630         if (subtract_one && i == 0)
1631                 p = last_p;
1632         else if (subtract_one)
1633                 p = le64toh(array->entry_array.items[i-1]);
1634         else
1635                 p = le64toh(array->entry_array.items[i]);
1636
1637         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1638         if (r < 0)
1639                 return r;
1640
1641         if (ret)
1642                 *ret = o;
1643
1644         if (offset)
1645                 *offset = p;
1646
1647         if (idx)
1648                 *idx = t + i + (subtract_one ? -1 : 0);
1649
1650         return 1;
1651 }
1652
1653 static int generic_array_bisect_plus_one(JournalFile *f,
1654                                          uint64_t extra,
1655                                          uint64_t first,
1656                                          uint64_t n,
1657                                          uint64_t needle,
1658                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1659                                          direction_t direction,
1660                                          Object **ret,
1661                                          uint64_t *offset,
1662                                          uint64_t *idx) {
1663
1664         int r;
1665         bool step_back = false;
1666         Object *o;
1667
1668         assert(f);
1669         assert(test_object);
1670
1671         if (n <= 0)
1672                 return 0;
1673
1674         /* This bisects the array in object 'first', but first checks
1675          * an extra  */
1676         r = test_object(f, extra, needle);
1677         if (r < 0)
1678                 return r;
1679
1680         if (r == TEST_FOUND)
1681                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1682
1683         /* if we are looking with DIRECTION_UP then we need to first
1684            see if in the actual array there is a matching entry, and
1685            return the last one of that. But if there isn't any we need
1686            to return this one. Hence remember this, and return it
1687            below. */
1688         if (r == TEST_LEFT)
1689                 step_back = direction == DIRECTION_UP;
1690
1691         if (r == TEST_RIGHT) {
1692                 if (direction == DIRECTION_DOWN)
1693                         goto found;
1694                 else
1695                         return 0;
1696         }
1697
1698         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1699
1700         if (r == 0 && step_back)
1701                 goto found;
1702
1703         if (r > 0 && idx)
1704                 (*idx) ++;
1705
1706         return r;
1707
1708 found:
1709         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1710         if (r < 0)
1711                 return r;
1712
1713         if (ret)
1714                 *ret = o;
1715
1716         if (offset)
1717                 *offset = extra;
1718
1719         if (idx)
1720                 *idx = 0;
1721
1722         return 1;
1723 }
1724
1725 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1726         assert(f);
1727         assert(p > 0);
1728
1729         if (p == needle)
1730                 return TEST_FOUND;
1731         else if (p < needle)
1732                 return TEST_LEFT;
1733         else
1734                 return TEST_RIGHT;
1735 }
1736
1737 int journal_file_move_to_entry_by_offset(
1738                 JournalFile *f,
1739                 uint64_t p,
1740                 direction_t direction,
1741                 Object **ret,
1742                 uint64_t *offset) {
1743
1744         return generic_array_bisect(f,
1745                                     le64toh(f->header->entry_array_offset),
1746                                     le64toh(f->header->n_entries),
1747                                     p,
1748                                     test_object_offset,
1749                                     direction,
1750                                     ret, offset, NULL);
1751 }
1752
1753
1754 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1755         Object *o;
1756         int r;
1757
1758         assert(f);
1759         assert(p > 0);
1760
1761         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1762         if (r < 0)
1763                 return r;
1764
1765         if (le64toh(o->entry.seqnum) == needle)
1766                 return TEST_FOUND;
1767         else if (le64toh(o->entry.seqnum) < needle)
1768                 return TEST_LEFT;
1769         else
1770                 return TEST_RIGHT;
1771 }
1772
1773 int journal_file_move_to_entry_by_seqnum(
1774                 JournalFile *f,
1775                 uint64_t seqnum,
1776                 direction_t direction,
1777                 Object **ret,
1778                 uint64_t *offset) {
1779
1780         return generic_array_bisect(f,
1781                                     le64toh(f->header->entry_array_offset),
1782                                     le64toh(f->header->n_entries),
1783                                     seqnum,
1784                                     test_object_seqnum,
1785                                     direction,
1786                                     ret, offset, NULL);
1787 }
1788
1789 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1790         Object *o;
1791         int r;
1792
1793         assert(f);
1794         assert(p > 0);
1795
1796         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1797         if (r < 0)
1798                 return r;
1799
1800         if (le64toh(o->entry.realtime) == needle)
1801                 return TEST_FOUND;
1802         else if (le64toh(o->entry.realtime) < needle)
1803                 return TEST_LEFT;
1804         else
1805                 return TEST_RIGHT;
1806 }
1807
1808 int journal_file_move_to_entry_by_realtime(
1809                 JournalFile *f,
1810                 uint64_t realtime,
1811                 direction_t direction,
1812                 Object **ret,
1813                 uint64_t *offset) {
1814
1815         return generic_array_bisect(f,
1816                                     le64toh(f->header->entry_array_offset),
1817                                     le64toh(f->header->n_entries),
1818                                     realtime,
1819                                     test_object_realtime,
1820                                     direction,
1821                                     ret, offset, NULL);
1822 }
1823
1824 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1825         Object *o;
1826         int r;
1827
1828         assert(f);
1829         assert(p > 0);
1830
1831         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1832         if (r < 0)
1833                 return r;
1834
1835         if (le64toh(o->entry.monotonic) == needle)
1836                 return TEST_FOUND;
1837         else if (le64toh(o->entry.monotonic) < needle)
1838                 return TEST_LEFT;
1839         else
1840                 return TEST_RIGHT;
1841 }
1842
1843 static inline int find_data_object_by_boot_id(
1844                 JournalFile *f,
1845                 sd_id128_t boot_id,
1846                 Object **o,
1847                 uint64_t *b) {
1848         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1849
1850         sd_id128_to_string(boot_id, t + 9);
1851         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1852 }
1853
1854 int journal_file_move_to_entry_by_monotonic(
1855                 JournalFile *f,
1856                 sd_id128_t boot_id,
1857                 uint64_t monotonic,
1858                 direction_t direction,
1859                 Object **ret,
1860                 uint64_t *offset) {
1861
1862         Object *o;
1863         int r;
1864
1865         assert(f);
1866
1867         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1868         if (r < 0)
1869                 return r;
1870         if (r == 0)
1871                 return -ENOENT;
1872
1873         return generic_array_bisect_plus_one(f,
1874                                              le64toh(o->data.entry_offset),
1875                                              le64toh(o->data.entry_array_offset),
1876                                              le64toh(o->data.n_entries),
1877                                              monotonic,
1878                                              test_object_monotonic,
1879                                              direction,
1880                                              ret, offset, NULL);
1881 }
1882
1883 int journal_file_next_entry(
1884                 JournalFile *f,
1885                 Object *o, uint64_t p,
1886                 direction_t direction,
1887                 Object **ret, uint64_t *offset) {
1888
1889         uint64_t i, n;
1890         int r;
1891
1892         assert(f);
1893         assert(p > 0 || !o);
1894
1895         n = le64toh(f->header->n_entries);
1896         if (n <= 0)
1897                 return 0;
1898
1899         if (!o)
1900                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1901         else {
1902                 if (o->object.type != OBJECT_ENTRY)
1903                         return -EINVAL;
1904
1905                 r = generic_array_bisect(f,
1906                                          le64toh(f->header->entry_array_offset),
1907                                          le64toh(f->header->n_entries),
1908                                          p,
1909                                          test_object_offset,
1910                                          DIRECTION_DOWN,
1911                                          NULL, NULL,
1912                                          &i);
1913                 if (r <= 0)
1914                         return r;
1915
1916                 if (direction == DIRECTION_DOWN) {
1917                         if (i >= n - 1)
1918                                 return 0;
1919
1920                         i++;
1921                 } else {
1922                         if (i <= 0)
1923                                 return 0;
1924
1925                         i--;
1926                 }
1927         }
1928
1929         /* And jump to it */
1930         return generic_array_get(f,
1931                                  le64toh(f->header->entry_array_offset),
1932                                  i,
1933                                  ret, offset);
1934 }
1935
1936 int journal_file_skip_entry(
1937                 JournalFile *f,
1938                 Object *o, uint64_t p,
1939                 int64_t skip,
1940                 Object **ret, uint64_t *offset) {
1941
1942         uint64_t i, n;
1943         int r;
1944
1945         assert(f);
1946         assert(o);
1947         assert(p > 0);
1948
1949         if (o->object.type != OBJECT_ENTRY)
1950                 return -EINVAL;
1951
1952         r = generic_array_bisect(f,
1953                                  le64toh(f->header->entry_array_offset),
1954                                  le64toh(f->header->n_entries),
1955                                  p,
1956                                  test_object_offset,
1957                                  DIRECTION_DOWN,
1958                                  NULL, NULL,
1959                                  &i);
1960         if (r <= 0)
1961                 return r;
1962
1963         /* Calculate new index */
1964         if (skip < 0) {
1965                 if ((uint64_t) -skip >= i)
1966                         i = 0;
1967                 else
1968                         i = i - (uint64_t) -skip;
1969         } else
1970                 i  += (uint64_t) skip;
1971
1972         n = le64toh(f->header->n_entries);
1973         if (n <= 0)
1974                 return -EBADMSG;
1975
1976         if (i >= n)
1977                 i = n-1;
1978
1979         return generic_array_get(f,
1980                                  le64toh(f->header->entry_array_offset),
1981                                  i,
1982                                  ret, offset);
1983 }
1984
1985 int journal_file_next_entry_for_data(
1986                 JournalFile *f,
1987                 Object *o, uint64_t p,
1988                 uint64_t data_offset,
1989                 direction_t direction,
1990                 Object **ret, uint64_t *offset) {
1991
1992         uint64_t n, i;
1993         int r;
1994         Object *d;
1995
1996         assert(f);
1997         assert(p > 0 || !o);
1998
1999         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2000         if (r < 0)
2001                 return r;
2002
2003         n = le64toh(d->data.n_entries);
2004         if (n <= 0)
2005                 return n;
2006
2007         if (!o)
2008                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2009         else {
2010                 if (o->object.type != OBJECT_ENTRY)
2011                         return -EINVAL;
2012
2013                 r = generic_array_bisect_plus_one(f,
2014                                                   le64toh(d->data.entry_offset),
2015                                                   le64toh(d->data.entry_array_offset),
2016                                                   le64toh(d->data.n_entries),
2017                                                   p,
2018                                                   test_object_offset,
2019                                                   DIRECTION_DOWN,
2020                                                   NULL, NULL,
2021                                                   &i);
2022
2023                 if (r <= 0)
2024                         return r;
2025
2026                 if (direction == DIRECTION_DOWN) {
2027                         if (i >= n - 1)
2028                                 return 0;
2029
2030                         i++;
2031                 } else {
2032                         if (i <= 0)
2033                                 return 0;
2034
2035                         i--;
2036                 }
2037
2038         }
2039
2040         return generic_array_get_plus_one(f,
2041                                           le64toh(d->data.entry_offset),
2042                                           le64toh(d->data.entry_array_offset),
2043                                           i,
2044                                           ret, offset);
2045 }
2046
2047 int journal_file_move_to_entry_by_offset_for_data(
2048                 JournalFile *f,
2049                 uint64_t data_offset,
2050                 uint64_t p,
2051                 direction_t direction,
2052                 Object **ret, uint64_t *offset) {
2053
2054         int r;
2055         Object *d;
2056
2057         assert(f);
2058
2059         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2060         if (r < 0)
2061                 return r;
2062
2063         return generic_array_bisect_plus_one(f,
2064                                              le64toh(d->data.entry_offset),
2065                                              le64toh(d->data.entry_array_offset),
2066                                              le64toh(d->data.n_entries),
2067                                              p,
2068                                              test_object_offset,
2069                                              direction,
2070                                              ret, offset, NULL);
2071 }
2072
2073 int journal_file_move_to_entry_by_monotonic_for_data(
2074                 JournalFile *f,
2075                 uint64_t data_offset,
2076                 sd_id128_t boot_id,
2077                 uint64_t monotonic,
2078                 direction_t direction,
2079                 Object **ret, uint64_t *offset) {
2080
2081         Object *o, *d;
2082         int r;
2083         uint64_t b, z;
2084
2085         assert(f);
2086
2087         /* First, seek by time */
2088         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2089         if (r < 0)
2090                 return r;
2091         if (r == 0)
2092                 return -ENOENT;
2093
2094         r = generic_array_bisect_plus_one(f,
2095                                           le64toh(o->data.entry_offset),
2096                                           le64toh(o->data.entry_array_offset),
2097                                           le64toh(o->data.n_entries),
2098                                           monotonic,
2099                                           test_object_monotonic,
2100                                           direction,
2101                                           NULL, &z, NULL);
2102         if (r <= 0)
2103                 return r;
2104
2105         /* And now, continue seeking until we find an entry that
2106          * exists in both bisection arrays */
2107
2108         for (;;) {
2109                 Object *qo;
2110                 uint64_t p, q;
2111
2112                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2113                 if (r < 0)
2114                         return r;
2115
2116                 r = generic_array_bisect_plus_one(f,
2117                                                   le64toh(d->data.entry_offset),
2118                                                   le64toh(d->data.entry_array_offset),
2119                                                   le64toh(d->data.n_entries),
2120                                                   z,
2121                                                   test_object_offset,
2122                                                   direction,
2123                                                   NULL, &p, NULL);
2124                 if (r <= 0)
2125                         return r;
2126
2127                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2128                 if (r < 0)
2129                         return r;
2130
2131                 r = generic_array_bisect_plus_one(f,
2132                                                   le64toh(o->data.entry_offset),
2133                                                   le64toh(o->data.entry_array_offset),
2134                                                   le64toh(o->data.n_entries),
2135                                                   p,
2136                                                   test_object_offset,
2137                                                   direction,
2138                                                   &qo, &q, NULL);
2139
2140                 if (r <= 0)
2141                         return r;
2142
2143                 if (p == q) {
2144                         if (ret)
2145                                 *ret = qo;
2146                         if (offset)
2147                                 *offset = q;
2148
2149                         return 1;
2150                 }
2151
2152                 z = q;
2153         }
2154
2155         return 0;
2156 }
2157
2158 int journal_file_move_to_entry_by_seqnum_for_data(
2159                 JournalFile *f,
2160                 uint64_t data_offset,
2161                 uint64_t seqnum,
2162                 direction_t direction,
2163                 Object **ret, uint64_t *offset) {
2164
2165         Object *d;
2166         int r;
2167
2168         assert(f);
2169
2170         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2171         if (r < 0)
2172                 return r;
2173
2174         return generic_array_bisect_plus_one(f,
2175                                              le64toh(d->data.entry_offset),
2176                                              le64toh(d->data.entry_array_offset),
2177                                              le64toh(d->data.n_entries),
2178                                              seqnum,
2179                                              test_object_seqnum,
2180                                              direction,
2181                                              ret, offset, NULL);
2182 }
2183
2184 int journal_file_move_to_entry_by_realtime_for_data(
2185                 JournalFile *f,
2186                 uint64_t data_offset,
2187                 uint64_t realtime,
2188                 direction_t direction,
2189                 Object **ret, uint64_t *offset) {
2190
2191         Object *d;
2192         int r;
2193
2194         assert(f);
2195
2196         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2197         if (r < 0)
2198                 return r;
2199
2200         return generic_array_bisect_plus_one(f,
2201                                              le64toh(d->data.entry_offset),
2202                                              le64toh(d->data.entry_array_offset),
2203                                              le64toh(d->data.n_entries),
2204                                              realtime,
2205                                              test_object_realtime,
2206                                              direction,
2207                                              ret, offset, NULL);
2208 }
2209
2210 void journal_file_dump(JournalFile *f) {
2211         Object *o;
2212         int r;
2213         uint64_t p;
2214
2215         assert(f);
2216
2217         journal_file_print_header(f);
2218
2219         p = le64toh(f->header->header_size);
2220         while (p != 0) {
2221                 r = journal_file_move_to_object(f, -1, p, &o);
2222                 if (r < 0)
2223                         goto fail;
2224
2225                 switch (o->object.type) {
2226
2227                 case OBJECT_UNUSED:
2228                         printf("Type: OBJECT_UNUSED\n");
2229                         break;
2230
2231                 case OBJECT_DATA:
2232                         printf("Type: OBJECT_DATA\n");
2233                         break;
2234
2235                 case OBJECT_FIELD:
2236                         printf("Type: OBJECT_FIELD\n");
2237                         break;
2238
2239                 case OBJECT_ENTRY:
2240                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2241                                le64toh(o->entry.seqnum),
2242                                le64toh(o->entry.monotonic),
2243                                le64toh(o->entry.realtime));
2244                         break;
2245
2246                 case OBJECT_FIELD_HASH_TABLE:
2247                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2248                         break;
2249
2250                 case OBJECT_DATA_HASH_TABLE:
2251                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2252                         break;
2253
2254                 case OBJECT_ENTRY_ARRAY:
2255                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2256                         break;
2257
2258                 case OBJECT_TAG:
2259                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2260                                le64toh(o->tag.seqnum),
2261                                le64toh(o->tag.epoch));
2262                         break;
2263
2264                 default:
2265                         printf("Type: unknown (%u)\n", o->object.type);
2266                         break;
2267                 }
2268
2269                 if (o->object.flags & OBJECT_COMPRESSED)
2270                         printf("Flags: COMPRESSED\n");
2271
2272                 if (p == le64toh(f->header->tail_object_offset))
2273                         p = 0;
2274                 else
2275                         p = p + ALIGN64(le64toh(o->object.size));
2276         }
2277
2278         return;
2279 fail:
2280         log_error("File corrupt");
2281 }
2282
2283 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2284         const char *x;
2285
2286         x = format_timestamp(buf, l, t);
2287         if (x)
2288                 return x;
2289         return " --- ";
2290 }
2291
2292 void journal_file_print_header(JournalFile *f) {
2293         char a[33], b[33], c[33], d[33];
2294         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2295         struct stat st;
2296         char bytes[FORMAT_BYTES_MAX];
2297
2298         assert(f);
2299
2300         printf("File Path: %s\n"
2301                "File ID: %s\n"
2302                "Machine ID: %s\n"
2303                "Boot ID: %s\n"
2304                "Sequential Number ID: %s\n"
2305                "State: %s\n"
2306                "Compatible Flags:%s%s\n"
2307                "Incompatible Flags:%s%s\n"
2308                "Header size: %"PRIu64"\n"
2309                "Arena size: %"PRIu64"\n"
2310                "Data Hash Table Size: %"PRIu64"\n"
2311                "Field Hash Table Size: %"PRIu64"\n"
2312                "Rotate Suggested: %s\n"
2313                "Head Sequential Number: %"PRIu64"\n"
2314                "Tail Sequential Number: %"PRIu64"\n"
2315                "Head Realtime Timestamp: %s\n"
2316                "Tail Realtime Timestamp: %s\n"
2317                "Tail Monotonic Timestamp: %s\n"
2318                "Objects: %"PRIu64"\n"
2319                "Entry Objects: %"PRIu64"\n",
2320                f->path,
2321                sd_id128_to_string(f->header->file_id, a),
2322                sd_id128_to_string(f->header->machine_id, b),
2323                sd_id128_to_string(f->header->boot_id, c),
2324                sd_id128_to_string(f->header->seqnum_id, d),
2325                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2326                f->header->state == STATE_ONLINE ? "ONLINE" :
2327                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2328                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2329                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2330                JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2331                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2332                le64toh(f->header->header_size),
2333                le64toh(f->header->arena_size),
2334                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2335                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2336                yes_no(journal_file_rotate_suggested(f, 0)),
2337                le64toh(f->header->head_entry_seqnum),
2338                le64toh(f->header->tail_entry_seqnum),
2339                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2340                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2341                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2342                le64toh(f->header->n_objects),
2343                le64toh(f->header->n_entries));
2344
2345         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2346                 printf("Data Objects: %"PRIu64"\n"
2347                        "Data Hash Table Fill: %.1f%%\n",
2348                        le64toh(f->header->n_data),
2349                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2350
2351         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2352                 printf("Field Objects: %"PRIu64"\n"
2353                        "Field Hash Table Fill: %.1f%%\n",
2354                        le64toh(f->header->n_fields),
2355                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2356
2357         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2358                 printf("Tag Objects: %"PRIu64"\n",
2359                        le64toh(f->header->n_tags));
2360         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2361                 printf("Entry Array Objects: %"PRIu64"\n",
2362                        le64toh(f->header->n_entry_arrays));
2363
2364         if (fstat(f->fd, &st) >= 0)
2365                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2366 }
2367
2368 int journal_file_open(
2369                 const char *fname,
2370                 int flags,
2371                 mode_t mode,
2372                 bool compress,
2373                 bool seal,
2374                 JournalMetrics *metrics,
2375                 MMapCache *mmap_cache,
2376                 JournalFile *template,
2377                 JournalFile **ret) {
2378
2379         JournalFile *f;
2380         int r;
2381         bool newly_created = false;
2382
2383         assert(fname);
2384         assert(ret);
2385
2386         if ((flags & O_ACCMODE) != O_RDONLY &&
2387             (flags & O_ACCMODE) != O_RDWR)
2388                 return -EINVAL;
2389
2390         if (!endswith(fname, ".journal") &&
2391             !endswith(fname, ".journal~"))
2392                 return -EINVAL;
2393
2394         f = new0(JournalFile, 1);
2395         if (!f)
2396                 return -ENOMEM;
2397
2398         f->fd = -1;
2399         f->mode = mode;
2400
2401         f->flags = flags;
2402         f->prot = prot_from_flags(flags);
2403         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2404 #ifdef HAVE_XZ
2405         f->compress = compress;
2406 #endif
2407 #ifdef HAVE_GCRYPT
2408         f->seal = seal;
2409 #endif
2410
2411         if (mmap_cache)
2412                 f->mmap = mmap_cache_ref(mmap_cache);
2413         else {
2414                 f->mmap = mmap_cache_new();
2415                 if (!f->mmap) {
2416                         r = -ENOMEM;
2417                         goto fail;
2418                 }
2419         }
2420
2421         f->path = strdup(fname);
2422         if (!f->path) {
2423                 r = -ENOMEM;
2424                 goto fail;
2425         }
2426
2427         f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2428         if (!f->chain_cache) {
2429                 r = -ENOMEM;
2430                 goto fail;
2431         }
2432
2433         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2434         if (f->fd < 0) {
2435                 r = -errno;
2436                 goto fail;
2437         }
2438
2439         if (fstat(f->fd, &f->last_stat) < 0) {
2440                 r = -errno;
2441                 goto fail;
2442         }
2443
2444         if (f->last_stat.st_size == 0 && f->writable) {
2445 #ifdef HAVE_XATTR
2446                 uint64_t crtime;
2447
2448                 /* Let's attach the creation time to the journal file,
2449                  * so that the vacuuming code knows the age of this
2450                  * file even if the file might end up corrupted one
2451                  * day... Ideally we'd just use the creation time many
2452                  * file systems maintain for each file, but there is
2453                  * currently no usable API to query this, hence let's
2454                  * emulate this via extended attributes. If extended
2455                  * attributes are not supported we'll just skip this,
2456                  * and rely solely on mtime/atime/ctime of the file.*/
2457
2458                 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2459                 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2460 #endif
2461
2462 #ifdef HAVE_GCRYPT
2463                 /* Try to load the FSPRG state, and if we can't, then
2464                  * just don't do sealing */
2465                 if (f->seal) {
2466                         r = journal_file_fss_load(f);
2467                         if (r < 0)
2468                                 f->seal = false;
2469                 }
2470 #endif
2471
2472                 r = journal_file_init_header(f, template);
2473                 if (r < 0)
2474                         goto fail;
2475
2476                 if (fstat(f->fd, &f->last_stat) < 0) {
2477                         r = -errno;
2478                         goto fail;
2479                 }
2480
2481                 newly_created = true;
2482         }
2483
2484         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2485                 r = -EIO;
2486                 goto fail;
2487         }
2488
2489         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2490         if (f->header == MAP_FAILED) {
2491                 f->header = NULL;
2492                 r = -errno;
2493                 goto fail;
2494         }
2495
2496         if (!newly_created) {
2497                 r = journal_file_verify_header(f);
2498                 if (r < 0)
2499                         goto fail;
2500         }
2501
2502 #ifdef HAVE_GCRYPT
2503         if (!newly_created && f->writable) {
2504                 r = journal_file_fss_load(f);
2505                 if (r < 0)
2506                         goto fail;
2507         }
2508 #endif
2509
2510         if (f->writable) {
2511                 if (metrics) {
2512                         journal_default_metrics(metrics, f->fd);
2513                         f->metrics = *metrics;
2514                 } else if (template)
2515                         f->metrics = template->metrics;
2516
2517                 r = journal_file_refresh_header(f);
2518                 if (r < 0)
2519                         goto fail;
2520         }
2521
2522 #ifdef HAVE_GCRYPT
2523         r = journal_file_hmac_setup(f);
2524         if (r < 0)
2525                 goto fail;
2526 #endif
2527
2528         if (newly_created) {
2529                 r = journal_file_setup_field_hash_table(f);
2530                 if (r < 0)
2531                         goto fail;
2532
2533                 r = journal_file_setup_data_hash_table(f);
2534                 if (r < 0)
2535                         goto fail;
2536
2537 #ifdef HAVE_GCRYPT
2538                 r = journal_file_append_first_tag(f);
2539                 if (r < 0)
2540                         goto fail;
2541 #endif
2542         }
2543
2544         r = journal_file_map_field_hash_table(f);
2545         if (r < 0)
2546                 goto fail;
2547
2548         r = journal_file_map_data_hash_table(f);
2549         if (r < 0)
2550                 goto fail;
2551
2552         *ret = f;
2553         return 0;
2554
2555 fail:
2556         journal_file_close(f);
2557
2558         return r;
2559 }
2560
2561 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2562         _cleanup_free_ char *p = NULL;
2563         size_t l;
2564         JournalFile *old_file, *new_file = NULL;
2565         int r;
2566
2567         assert(f);
2568         assert(*f);
2569
2570         old_file = *f;
2571
2572         if (!old_file->writable)
2573                 return -EINVAL;
2574
2575         if (!endswith(old_file->path, ".journal"))
2576                 return -EINVAL;
2577
2578         l = strlen(old_file->path);
2579         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2580                      (int) l - 8, old_file->path,
2581                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2582                      le64toh((*f)->header->head_entry_seqnum),
2583                      le64toh((*f)->header->head_entry_realtime));
2584         if (r < 0)
2585                 return -ENOMEM;
2586
2587         r = rename(old_file->path, p);
2588         if (r < 0)
2589                 return -errno;
2590
2591         old_file->header->state = STATE_ARCHIVED;
2592
2593         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2594         journal_file_close(old_file);
2595
2596         *f = new_file;
2597         return r;
2598 }
2599
2600 int journal_file_open_reliably(
2601                 const char *fname,
2602                 int flags,
2603                 mode_t mode,
2604                 bool compress,
2605                 bool seal,
2606                 JournalMetrics *metrics,
2607                 MMapCache *mmap_cache,
2608                 JournalFile *template,
2609                 JournalFile **ret) {
2610
2611         int r;
2612         size_t l;
2613         _cleanup_free_ char *p = NULL;
2614
2615         r = journal_file_open(fname, flags, mode, compress, seal,
2616                               metrics, mmap_cache, template, ret);
2617         if (r != -EBADMSG && /* corrupted */
2618             r != -ENODATA && /* truncated */
2619             r != -EHOSTDOWN && /* other machine */
2620             r != -EPROTONOSUPPORT && /* incompatible feature */
2621             r != -EBUSY && /* unclean shutdown */
2622             r != -ESHUTDOWN /* already archived */)
2623                 return r;
2624
2625         if ((flags & O_ACCMODE) == O_RDONLY)
2626                 return r;
2627
2628         if (!(flags & O_CREAT))
2629                 return r;
2630
2631         if (!endswith(fname, ".journal"))
2632                 return r;
2633
2634         /* The file is corrupted. Rotate it away and try it again (but only once) */
2635
2636         l = strlen(fname);
2637         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2638                      (int) l - 8, fname,
2639                      (unsigned long long) now(CLOCK_REALTIME),
2640                      random_ull()) < 0)
2641                 return -ENOMEM;
2642
2643         r = rename(fname, p);
2644         if (r < 0)
2645                 return -errno;
2646
2647         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2648
2649         return journal_file_open(fname, flags, mode, compress, seal,
2650                                  metrics, mmap_cache, template, ret);
2651 }
2652
2653 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2654         uint64_t i, n;
2655         uint64_t q, xor_hash = 0;
2656         int r;
2657         EntryItem *items;
2658         dual_timestamp ts;
2659
2660         assert(from);
2661         assert(to);
2662         assert(o);
2663         assert(p);
2664
2665         if (!to->writable)
2666                 return -EPERM;
2667
2668         ts.monotonic = le64toh(o->entry.monotonic);
2669         ts.realtime = le64toh(o->entry.realtime);
2670
2671         if (to->tail_entry_monotonic_valid &&
2672             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2673                 return -EINVAL;
2674
2675         n = journal_file_entry_n_items(o);
2676         items = alloca(sizeof(EntryItem) * n);
2677
2678         for (i = 0; i < n; i++) {
2679                 uint64_t l, h;
2680                 le64_t le_hash;
2681                 size_t t;
2682                 void *data;
2683                 Object *u;
2684
2685                 q = le64toh(o->entry.items[i].object_offset);
2686                 le_hash = o->entry.items[i].hash;
2687
2688                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2689                 if (r < 0)
2690                         return r;
2691
2692                 if (le_hash != o->data.hash)
2693                         return -EBADMSG;
2694
2695                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2696                 t = (size_t) l;
2697
2698                 /* We hit the limit on 32bit machines */
2699                 if ((uint64_t) t != l)
2700                         return -E2BIG;
2701
2702                 if (o->object.flags & OBJECT_COMPRESSED) {
2703 #ifdef HAVE_XZ
2704                         uint64_t rsize;
2705
2706                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0))
2707                                 return -EBADMSG;
2708
2709                         data = from->compress_buffer;
2710                         l = rsize;
2711 #else
2712                         return -EPROTONOSUPPORT;
2713 #endif
2714                 } else
2715                         data = o->data.payload;
2716
2717                 r = journal_file_append_data(to, data, l, &u, &h);
2718                 if (r < 0)
2719                         return r;
2720
2721                 xor_hash ^= le64toh(u->data.hash);
2722                 items[i].object_offset = htole64(h);
2723                 items[i].hash = u->data.hash;
2724
2725                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2726                 if (r < 0)
2727                         return r;
2728         }
2729
2730         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2731 }
2732
2733 void journal_default_metrics(JournalMetrics *m, int fd) {
2734         uint64_t fs_size = 0;
2735         struct statvfs ss;
2736         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2737
2738         assert(m);
2739         assert(fd >= 0);
2740
2741         if (fstatvfs(fd, &ss) >= 0)
2742                 fs_size = ss.f_frsize * ss.f_blocks;
2743
2744         if (m->max_use == (uint64_t) -1) {
2745
2746                 if (fs_size > 0) {
2747                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2748
2749                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2750                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2751
2752                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2753                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2754                 } else
2755                         m->max_use = DEFAULT_MAX_USE_LOWER;
2756         } else {
2757                 m->max_use = PAGE_ALIGN(m->max_use);
2758
2759                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2760                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2761         }
2762
2763         if (m->max_size == (uint64_t) -1) {
2764                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2765
2766                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2767                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2768         } else
2769                 m->max_size = PAGE_ALIGN(m->max_size);
2770
2771         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2772                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2773
2774         if (m->max_size*2 > m->max_use)
2775                 m->max_use = m->max_size*2;
2776
2777         if (m->min_size == (uint64_t) -1)
2778                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2779         else {
2780                 m->min_size = PAGE_ALIGN(m->min_size);
2781
2782                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2783                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2784
2785                 if (m->min_size > m->max_size)
2786                         m->max_size = m->min_size;
2787         }
2788
2789         if (m->keep_free == (uint64_t) -1) {
2790
2791                 if (fs_size > 0) {
2792                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2793
2794                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2795                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2796
2797                 } else
2798                         m->keep_free = DEFAULT_KEEP_FREE;
2799         }
2800
2801         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2802                   format_bytes(a, sizeof(a), m->max_use),
2803                   format_bytes(b, sizeof(b), m->max_size),
2804                   format_bytes(c, sizeof(c), m->min_size),
2805                   format_bytes(d, sizeof(d), m->keep_free));
2806 }
2807
2808 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2809         assert(f);
2810         assert(from || to);
2811
2812         if (from) {
2813                 if (f->header->head_entry_realtime == 0)
2814                         return -ENOENT;
2815
2816                 *from = le64toh(f->header->head_entry_realtime);
2817         }
2818
2819         if (to) {
2820                 if (f->header->tail_entry_realtime == 0)
2821                         return -ENOENT;
2822
2823                 *to = le64toh(f->header->tail_entry_realtime);
2824         }
2825
2826         return 1;
2827 }
2828
2829 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2830         Object *o;
2831         uint64_t p;
2832         int r;
2833
2834         assert(f);
2835         assert(from || to);
2836
2837         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2838         if (r <= 0)
2839                 return r;
2840
2841         if (le64toh(o->data.n_entries) <= 0)
2842                 return 0;
2843
2844         if (from) {
2845                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2846                 if (r < 0)
2847                         return r;
2848
2849                 *from = le64toh(o->entry.monotonic);
2850         }
2851
2852         if (to) {
2853                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2854                 if (r < 0)
2855                         return r;
2856
2857                 r = generic_array_get_plus_one(f,
2858                                                le64toh(o->data.entry_offset),
2859                                                le64toh(o->data.entry_array_offset),
2860                                                le64toh(o->data.n_entries)-1,
2861                                                &o, NULL);
2862                 if (r <= 0)
2863                         return r;
2864
2865                 *to = le64toh(o->entry.monotonic);
2866         }
2867
2868         return 1;
2869 }
2870
2871 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2872         assert(f);
2873
2874         /* If we gained new header fields we gained new features,
2875          * hence suggest a rotation */
2876         if (le64toh(f->header->header_size) < sizeof(Header)) {
2877                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2878                 return true;
2879         }
2880
2881         /* Let's check if the hash tables grew over a certain fill
2882          * level (75%, borrowing this value from Java's hash table
2883          * implementation), and if so suggest a rotation. To calculate
2884          * the fill level we need the n_data field, which only exists
2885          * in newer versions. */
2886
2887         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2888                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2889                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2890                                   f->path,
2891                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2892                                   le64toh(f->header->n_data),
2893                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2894                                   (unsigned long long) f->last_stat.st_size,
2895                                   f->last_stat.st_size / le64toh(f->header->n_data));
2896                         return true;
2897                 }
2898
2899         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2900                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2901                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2902                                   f->path,
2903                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2904                                   le64toh(f->header->n_fields),
2905                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2906                         return true;
2907                 }
2908
2909         /* Are the data objects properly indexed by field objects? */
2910         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2911             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2912             le64toh(f->header->n_data) > 0 &&
2913             le64toh(f->header->n_fields) == 0)
2914                 return true;
2915
2916         if (max_file_usec > 0) {
2917                 usec_t t, h;
2918
2919                 h = le64toh(f->header->head_entry_realtime);
2920                 t = now(CLOCK_REALTIME);
2921
2922                 if (h > 0 && t > h + max_file_usec)
2923                         return true;
2924         }
2925
2926         return false;
2927 }