chiark / gitweb /
journal/compress: return early in uncompress_startswith
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29 #include <sys/xattr.h>
30
31 #include "journal-def.h"
32 #include "journal-file.h"
33 #include "journal-authenticate.h"
34 #include "lookup3.h"
35 #include "compress.h"
36 #include "fsprg.h"
37
38 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
39 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
40
41 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
42
43 /* This is the minimum journal file size */
44 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL)           /* 4 MiB */
45
46 /* These are the lower and upper bounds if we deduce the max_use value
47  * from the file system size */
48 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
49 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
50
51 /* This is the upper bound if we deduce max_size from max_use */
52 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
53
54 /* This is the upper bound if we deduce the keep_free value from the
55  * file system size */
56 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
57
58 /* This is the keep_free value when we can't determine the system
59  * size */
60 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MB */
61
62 /* n_data was the first entry we added after the initial file format design */
63 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
64
65 /* How many entries to keep in the entry array chain cache at max */
66 #define CHAIN_CACHE_MAX 20
67
68 /* How much to increase the journal file size at once each time we allocate something new. */
69 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8MB */
70
71 static int journal_file_set_online(JournalFile *f) {
72         assert(f);
73
74         if (!f->writable)
75                 return -EPERM;
76
77         if (!(f->fd >= 0 && f->header))
78                 return -EINVAL;
79
80         switch(f->header->state) {
81                 case STATE_ONLINE:
82                         return 0;
83
84                 case STATE_OFFLINE:
85                         f->header->state = STATE_ONLINE;
86                         fsync(f->fd);
87                         return 0;
88
89                 default:
90                         return -EINVAL;
91         }
92 }
93
94 int journal_file_set_offline(JournalFile *f) {
95         assert(f);
96
97         if (!f->writable)
98                 return -EPERM;
99
100         if (!(f->fd >= 0 && f->header))
101                 return -EINVAL;
102
103         if (f->header->state != STATE_ONLINE)
104                 return 0;
105
106         fsync(f->fd);
107
108         f->header->state = STATE_OFFLINE;
109
110         fsync(f->fd);
111
112         return 0;
113 }
114
115 void journal_file_close(JournalFile *f) {
116         assert(f);
117
118 #ifdef HAVE_GCRYPT
119         /* Write the final tag */
120         if (f->seal && f->writable)
121                 journal_file_append_tag(f);
122 #endif
123
124         /* Sync everything to disk, before we mark the file offline */
125         if (f->mmap && f->fd >= 0)
126                 mmap_cache_close_fd(f->mmap, f->fd);
127
128         journal_file_set_offline(f);
129
130         if (f->header)
131                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
132
133         safe_close(f->fd);
134         free(f->path);
135
136         if (f->mmap)
137                 mmap_cache_unref(f->mmap);
138
139         hashmap_free_free(f->chain_cache);
140
141 #ifdef HAVE_XZ
142         free(f->compress_buffer);
143 #endif
144
145 #ifdef HAVE_GCRYPT
146         if (f->fss_file)
147                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
148         else if (f->fsprg_state)
149                 free(f->fsprg_state);
150
151         free(f->fsprg_seed);
152
153         if (f->hmac)
154                 gcry_md_close(f->hmac);
155 #endif
156
157         free(f);
158 }
159
160 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
161         Header h;
162         ssize_t k;
163         int r;
164
165         assert(f);
166
167         zero(h);
168         memcpy(h.signature, HEADER_SIGNATURE, 8);
169         h.header_size = htole64(ALIGN64(sizeof(h)));
170
171         h.incompatible_flags =
172                 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
173
174         h.compatible_flags =
175                 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
176
177         r = sd_id128_randomize(&h.file_id);
178         if (r < 0)
179                 return r;
180
181         if (template) {
182                 h.seqnum_id = template->header->seqnum_id;
183                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
184         } else
185                 h.seqnum_id = h.file_id;
186
187         k = pwrite(f->fd, &h, sizeof(h), 0);
188         if (k < 0)
189                 return -errno;
190
191         if (k != sizeof(h))
192                 return -EIO;
193
194         return 0;
195 }
196
197 static int journal_file_refresh_header(JournalFile *f) {
198         int r;
199         sd_id128_t boot_id;
200
201         assert(f);
202
203         r = sd_id128_get_machine(&f->header->machine_id);
204         if (r < 0)
205                 return r;
206
207         r = sd_id128_get_boot(&boot_id);
208         if (r < 0)
209                 return r;
210
211         if (sd_id128_equal(boot_id, f->header->boot_id))
212                 f->tail_entry_monotonic_valid = true;
213
214         f->header->boot_id = boot_id;
215
216         journal_file_set_online(f);
217
218         /* Sync the online state to disk */
219         fsync(f->fd);
220
221         return 0;
222 }
223
224 static int journal_file_verify_header(JournalFile *f) {
225         assert(f);
226
227         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
228                 return -EBADMSG;
229
230         /* In both read and write mode we refuse to open files with
231          * incompatible flags we don't know */
232 #ifdef HAVE_XZ
233         if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
234                 return -EPROTONOSUPPORT;
235 #else
236         if (f->header->incompatible_flags != 0)
237                 return -EPROTONOSUPPORT;
238 #endif
239
240         /* When open for writing we refuse to open files with
241          * compatible flags, too */
242         if (f->writable) {
243 #ifdef HAVE_GCRYPT
244                 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
245                         return -EPROTONOSUPPORT;
246 #else
247                 if (f->header->compatible_flags != 0)
248                         return -EPROTONOSUPPORT;
249 #endif
250         }
251
252         if (f->header->state >= _STATE_MAX)
253                 return -EBADMSG;
254
255         /* The first addition was n_data, so check that we are at least this large */
256         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
257                 return -EBADMSG;
258
259         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
260                 return -EBADMSG;
261
262         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
263                 return -ENODATA;
264
265         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
266                 return -ENODATA;
267
268         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
269             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
270             !VALID64(le64toh(f->header->tail_object_offset)) ||
271             !VALID64(le64toh(f->header->entry_array_offset)))
272                 return -ENODATA;
273
274         if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
275             le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
276             le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
277             le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
278                 return -ENODATA;
279
280         if (f->writable) {
281                 uint8_t state;
282                 sd_id128_t machine_id;
283                 int r;
284
285                 r = sd_id128_get_machine(&machine_id);
286                 if (r < 0)
287                         return r;
288
289                 if (!sd_id128_equal(machine_id, f->header->machine_id))
290                         return -EHOSTDOWN;
291
292                 state = f->header->state;
293
294                 if (state == STATE_ONLINE) {
295                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
296                         return -EBUSY;
297                 } else if (state == STATE_ARCHIVED)
298                         return -ESHUTDOWN;
299                 else if (state != STATE_OFFLINE) {
300                         log_debug("Journal file %s has unknown state %u.", f->path, state);
301                         return -EBUSY;
302                 }
303         }
304
305         f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
306
307         f->seal = JOURNAL_HEADER_SEALED(f->header);
308
309         return 0;
310 }
311
312 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
313         uint64_t old_size, new_size;
314         int r;
315
316         assert(f);
317
318         /* We assume that this file is not sparse, and we know that
319          * for sure, since we always call posix_fallocate()
320          * ourselves */
321
322         old_size =
323                 le64toh(f->header->header_size) +
324                 le64toh(f->header->arena_size);
325
326         new_size = PAGE_ALIGN(offset + size);
327         if (new_size < le64toh(f->header->header_size))
328                 new_size = le64toh(f->header->header_size);
329
330         if (new_size <= old_size)
331                 return 0;
332
333         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
334                 return -E2BIG;
335
336         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
337                 struct statvfs svfs;
338
339                 if (fstatvfs(f->fd, &svfs) >= 0) {
340                         uint64_t available;
341
342                         available = svfs.f_bfree * svfs.f_bsize;
343
344                         if (available >= f->metrics.keep_free)
345                                 available -= f->metrics.keep_free;
346                         else
347                                 available = 0;
348
349                         if (new_size - old_size > available)
350                                 return -E2BIG;
351                 }
352         }
353
354         /* Increase by larger blocks at once */
355         new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
356         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
357                 new_size = f->metrics.max_size;
358
359         /* Note that the glibc fallocate() fallback is very
360            inefficient, hence we try to minimize the allocation area
361            as we can. */
362         r = posix_fallocate(f->fd, old_size, new_size - old_size);
363         if (r != 0)
364                 return -r;
365
366         if (fstat(f->fd, &f->last_stat) < 0)
367                 return -errno;
368
369         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
370
371         return 0;
372 }
373
374 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
375         assert(f);
376         assert(ret);
377
378         if (size <= 0)
379                 return -EINVAL;
380
381         /* Avoid SIGBUS on invalid accesses */
382         if (offset + size > (uint64_t) f->last_stat.st_size) {
383                 /* Hmm, out of range? Let's refresh the fstat() data
384                  * first, before we trust that check. */
385
386                 if (fstat(f->fd, &f->last_stat) < 0 ||
387                     offset + size > (uint64_t) f->last_stat.st_size)
388                         return -EADDRNOTAVAIL;
389         }
390
391         return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
392 }
393
394 static uint64_t minimum_header_size(Object *o) {
395
396         static const uint64_t table[] = {
397                 [OBJECT_DATA] = sizeof(DataObject),
398                 [OBJECT_FIELD] = sizeof(FieldObject),
399                 [OBJECT_ENTRY] = sizeof(EntryObject),
400                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
401                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
402                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
403                 [OBJECT_TAG] = sizeof(TagObject),
404         };
405
406         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
407                 return sizeof(ObjectHeader);
408
409         return table[o->object.type];
410 }
411
412 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
413         int r;
414         void *t;
415         Object *o;
416         uint64_t s;
417
418         assert(f);
419         assert(ret);
420
421         /* Objects may only be located at multiple of 64 bit */
422         if (!VALID64(offset))
423                 return -EFAULT;
424
425
426         r = journal_file_move_to(f, type_to_context(type), false, offset, sizeof(ObjectHeader), &t);
427         if (r < 0)
428                 return r;
429
430         o = (Object*) t;
431         s = le64toh(o->object.size);
432
433         if (s < sizeof(ObjectHeader))
434                 return -EBADMSG;
435
436         if (o->object.type <= OBJECT_UNUSED)
437                 return -EBADMSG;
438
439         if (s < minimum_header_size(o))
440                 return -EBADMSG;
441
442         if (type > 0 && o->object.type != type)
443                 return -EBADMSG;
444
445         if (s > sizeof(ObjectHeader)) {
446                 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
447                 if (r < 0)
448                         return r;
449
450                 o = (Object*) t;
451         }
452
453         *ret = o;
454         return 0;
455 }
456
457 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
458         uint64_t r;
459
460         assert(f);
461
462         r = le64toh(f->header->tail_entry_seqnum) + 1;
463
464         if (seqnum) {
465                 /* If an external seqnum counter was passed, we update
466                  * both the local and the external one, and set it to
467                  * the maximum of both */
468
469                 if (*seqnum + 1 > r)
470                         r = *seqnum + 1;
471
472                 *seqnum = r;
473         }
474
475         f->header->tail_entry_seqnum = htole64(r);
476
477         if (f->header->head_entry_seqnum == 0)
478                 f->header->head_entry_seqnum = htole64(r);
479
480         return r;
481 }
482
483 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
484         int r;
485         uint64_t p;
486         Object *tail, *o;
487         void *t;
488
489         assert(f);
490         assert(type > 0 && type < _OBJECT_TYPE_MAX);
491         assert(size >= sizeof(ObjectHeader));
492         assert(offset);
493         assert(ret);
494
495         r = journal_file_set_online(f);
496         if (r < 0)
497                 return r;
498
499         p = le64toh(f->header->tail_object_offset);
500         if (p == 0)
501                 p = le64toh(f->header->header_size);
502         else {
503                 r = journal_file_move_to_object(f, -1, p, &tail);
504                 if (r < 0)
505                         return r;
506
507                 p += ALIGN64(le64toh(tail->object.size));
508         }
509
510         r = journal_file_allocate(f, p, size);
511         if (r < 0)
512                 return r;
513
514         r = journal_file_move_to(f, type, false, p, size, &t);
515         if (r < 0)
516                 return r;
517
518         o = (Object*) t;
519
520         zero(o->object);
521         o->object.type = type;
522         o->object.size = htole64(size);
523
524         f->header->tail_object_offset = htole64(p);
525         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
526
527         *ret = o;
528         *offset = p;
529
530         return 0;
531 }
532
533 static int journal_file_setup_data_hash_table(JournalFile *f) {
534         uint64_t s, p;
535         Object *o;
536         int r;
537
538         assert(f);
539
540         /* We estimate that we need 1 hash table entry per 768 of
541            journal file and we want to make sure we never get beyond
542            75% fill level. Calculate the hash table size for the
543            maximum file size based on these metrics. */
544
545         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
546         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
547                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
548
549         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
550
551         r = journal_file_append_object(f,
552                                        OBJECT_DATA_HASH_TABLE,
553                                        offsetof(Object, hash_table.items) + s,
554                                        &o, &p);
555         if (r < 0)
556                 return r;
557
558         memzero(o->hash_table.items, s);
559
560         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
561         f->header->data_hash_table_size = htole64(s);
562
563         return 0;
564 }
565
566 static int journal_file_setup_field_hash_table(JournalFile *f) {
567         uint64_t s, p;
568         Object *o;
569         int r;
570
571         assert(f);
572
573         /* We use a fixed size hash table for the fields as this
574          * number should grow very slowly only */
575
576         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
577         r = journal_file_append_object(f,
578                                        OBJECT_FIELD_HASH_TABLE,
579                                        offsetof(Object, hash_table.items) + s,
580                                        &o, &p);
581         if (r < 0)
582                 return r;
583
584         memzero(o->hash_table.items, s);
585
586         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
587         f->header->field_hash_table_size = htole64(s);
588
589         return 0;
590 }
591
592 static int journal_file_map_data_hash_table(JournalFile *f) {
593         uint64_t s, p;
594         void *t;
595         int r;
596
597         assert(f);
598
599         p = le64toh(f->header->data_hash_table_offset);
600         s = le64toh(f->header->data_hash_table_size);
601
602         r = journal_file_move_to(f,
603                                  OBJECT_DATA_HASH_TABLE,
604                                  true,
605                                  p, s,
606                                  &t);
607         if (r < 0)
608                 return r;
609
610         f->data_hash_table = t;
611         return 0;
612 }
613
614 static int journal_file_map_field_hash_table(JournalFile *f) {
615         uint64_t s, p;
616         void *t;
617         int r;
618
619         assert(f);
620
621         p = le64toh(f->header->field_hash_table_offset);
622         s = le64toh(f->header->field_hash_table_size);
623
624         r = journal_file_move_to(f,
625                                  OBJECT_FIELD_HASH_TABLE,
626                                  true,
627                                  p, s,
628                                  &t);
629         if (r < 0)
630                 return r;
631
632         f->field_hash_table = t;
633         return 0;
634 }
635
636 static int journal_file_link_field(
637                 JournalFile *f,
638                 Object *o,
639                 uint64_t offset,
640                 uint64_t hash) {
641
642         uint64_t p, h;
643         int r;
644
645         assert(f);
646         assert(o);
647         assert(offset > 0);
648
649         if (o->object.type != OBJECT_FIELD)
650                 return -EINVAL;
651
652         /* This might alter the window we are looking at */
653
654         o->field.next_hash_offset = o->field.head_data_offset = 0;
655
656         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
657         p = le64toh(f->field_hash_table[h].tail_hash_offset);
658         if (p == 0)
659                 f->field_hash_table[h].head_hash_offset = htole64(offset);
660         else {
661                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
662                 if (r < 0)
663                         return r;
664
665                 o->field.next_hash_offset = htole64(offset);
666         }
667
668         f->field_hash_table[h].tail_hash_offset = htole64(offset);
669
670         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
671                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
672
673         return 0;
674 }
675
676 static int journal_file_link_data(
677                 JournalFile *f,
678                 Object *o,
679                 uint64_t offset,
680                 uint64_t hash) {
681
682         uint64_t p, h;
683         int r;
684
685         assert(f);
686         assert(o);
687         assert(offset > 0);
688
689         if (o->object.type != OBJECT_DATA)
690                 return -EINVAL;
691
692         /* This might alter the window we are looking at */
693
694         o->data.next_hash_offset = o->data.next_field_offset = 0;
695         o->data.entry_offset = o->data.entry_array_offset = 0;
696         o->data.n_entries = 0;
697
698         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
699         p = le64toh(f->data_hash_table[h].tail_hash_offset);
700         if (p == 0)
701                 /* Only entry in the hash table is easy */
702                 f->data_hash_table[h].head_hash_offset = htole64(offset);
703         else {
704                 /* Move back to the previous data object, to patch in
705                  * pointer */
706
707                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
708                 if (r < 0)
709                         return r;
710
711                 o->data.next_hash_offset = htole64(offset);
712         }
713
714         f->data_hash_table[h].tail_hash_offset = htole64(offset);
715
716         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
717                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
718
719         return 0;
720 }
721
722 int journal_file_find_field_object_with_hash(
723                 JournalFile *f,
724                 const void *field, uint64_t size, uint64_t hash,
725                 Object **ret, uint64_t *offset) {
726
727         uint64_t p, osize, h;
728         int r;
729
730         assert(f);
731         assert(field && size > 0);
732
733         osize = offsetof(Object, field.payload) + size;
734
735         if (f->header->field_hash_table_size == 0)
736                 return -EBADMSG;
737
738         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
739         p = le64toh(f->field_hash_table[h].head_hash_offset);
740
741         while (p > 0) {
742                 Object *o;
743
744                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
745                 if (r < 0)
746                         return r;
747
748                 if (le64toh(o->field.hash) == hash &&
749                     le64toh(o->object.size) == osize &&
750                     memcmp(o->field.payload, field, size) == 0) {
751
752                         if (ret)
753                                 *ret = o;
754                         if (offset)
755                                 *offset = p;
756
757                         return 1;
758                 }
759
760                 p = le64toh(o->field.next_hash_offset);
761         }
762
763         return 0;
764 }
765
766 int journal_file_find_field_object(
767                 JournalFile *f,
768                 const void *field, uint64_t size,
769                 Object **ret, uint64_t *offset) {
770
771         uint64_t hash;
772
773         assert(f);
774         assert(field && size > 0);
775
776         hash = hash64(field, size);
777
778         return journal_file_find_field_object_with_hash(f,
779                                                         field, size, hash,
780                                                         ret, offset);
781 }
782
783 int journal_file_find_data_object_with_hash(
784                 JournalFile *f,
785                 const void *data, uint64_t size, uint64_t hash,
786                 Object **ret, uint64_t *offset) {
787
788         uint64_t p, osize, h;
789         int r;
790
791         assert(f);
792         assert(data || size == 0);
793
794         osize = offsetof(Object, data.payload) + size;
795
796         if (f->header->data_hash_table_size == 0)
797                 return -EBADMSG;
798
799         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
800         p = le64toh(f->data_hash_table[h].head_hash_offset);
801
802         while (p > 0) {
803                 Object *o;
804
805                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
806                 if (r < 0)
807                         return r;
808
809                 if (le64toh(o->data.hash) != hash)
810                         goto next;
811
812                 if (o->object.flags & OBJECT_COMPRESSED) {
813 #ifdef HAVE_XZ
814                         uint64_t l, rsize;
815
816                         l = le64toh(o->object.size);
817                         if (l <= offsetof(Object, data.payload))
818                                 return -EBADMSG;
819
820                         l -= offsetof(Object, data.payload);
821
822                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0))
823                                 return -EBADMSG;
824
825                         if (rsize == size &&
826                             memcmp(f->compress_buffer, data, size) == 0) {
827
828                                 if (ret)
829                                         *ret = o;
830
831                                 if (offset)
832                                         *offset = p;
833
834                                 return 1;
835                         }
836 #else
837                         return -EPROTONOSUPPORT;
838 #endif
839
840                 } else if (le64toh(o->object.size) == osize &&
841                            memcmp(o->data.payload, data, size) == 0) {
842
843                         if (ret)
844                                 *ret = o;
845
846                         if (offset)
847                                 *offset = p;
848
849                         return 1;
850                 }
851
852         next:
853                 p = le64toh(o->data.next_hash_offset);
854         }
855
856         return 0;
857 }
858
859 int journal_file_find_data_object(
860                 JournalFile *f,
861                 const void *data, uint64_t size,
862                 Object **ret, uint64_t *offset) {
863
864         uint64_t hash;
865
866         assert(f);
867         assert(data || size == 0);
868
869         hash = hash64(data, size);
870
871         return journal_file_find_data_object_with_hash(f,
872                                                        data, size, hash,
873                                                        ret, offset);
874 }
875
876 static int journal_file_append_field(
877                 JournalFile *f,
878                 const void *field, uint64_t size,
879                 Object **ret, uint64_t *offset) {
880
881         uint64_t hash, p;
882         uint64_t osize;
883         Object *o;
884         int r;
885
886         assert(f);
887         assert(field && size > 0);
888
889         hash = hash64(field, size);
890
891         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
892         if (r < 0)
893                 return r;
894         else if (r > 0) {
895
896                 if (ret)
897                         *ret = o;
898
899                 if (offset)
900                         *offset = p;
901
902                 return 0;
903         }
904
905         osize = offsetof(Object, field.payload) + size;
906         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
907         if (r < 0)
908                 return r;
909
910         o->field.hash = htole64(hash);
911         memcpy(o->field.payload, field, size);
912
913         r = journal_file_link_field(f, o, p, hash);
914         if (r < 0)
915                 return r;
916
917         /* The linking might have altered the window, so let's
918          * refresh our pointer */
919         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
920         if (r < 0)
921                 return r;
922
923 #ifdef HAVE_GCRYPT
924         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
925         if (r < 0)
926                 return r;
927 #endif
928
929         if (ret)
930                 *ret = o;
931
932         if (offset)
933                 *offset = p;
934
935         return 0;
936 }
937
938 static int journal_file_append_data(
939                 JournalFile *f,
940                 const void *data, uint64_t size,
941                 Object **ret, uint64_t *offset) {
942
943         uint64_t hash, p;
944         uint64_t osize;
945         Object *o;
946         int r;
947         bool compressed = false;
948         const void *eq;
949
950         assert(f);
951         assert(data || size == 0);
952
953         hash = hash64(data, size);
954
955         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
956         if (r < 0)
957                 return r;
958         else if (r > 0) {
959
960                 if (ret)
961                         *ret = o;
962
963                 if (offset)
964                         *offset = p;
965
966                 return 0;
967         }
968
969         osize = offsetof(Object, data.payload) + size;
970         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
971         if (r < 0)
972                 return r;
973
974         o->data.hash = htole64(hash);
975
976 #ifdef HAVE_XZ
977         if (f->compress &&
978             size >= COMPRESSION_SIZE_THRESHOLD) {
979                 uint64_t rsize;
980
981                 compressed = compress_blob(data, size, o->data.payload, &rsize);
982
983                 if (compressed) {
984                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
985                         o->object.flags |= OBJECT_COMPRESSED;
986
987                         log_debug("Compressed data object %"PRIu64" -> %"PRIu64, size, rsize);
988                 }
989         }
990 #endif
991
992         if (!compressed && size > 0)
993                 memcpy(o->data.payload, data, size);
994
995         r = journal_file_link_data(f, o, p, hash);
996         if (r < 0)
997                 return r;
998
999         /* The linking might have altered the window, so let's
1000          * refresh our pointer */
1001         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1002         if (r < 0)
1003                 return r;
1004
1005         if (!data)
1006                 eq = NULL;
1007         else
1008                 eq = memchr(data, '=', size);
1009         if (eq && eq > data) {
1010                 Object *fo = NULL;
1011                 uint64_t fp;
1012
1013                 /* Create field object ... */
1014                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1015                 if (r < 0)
1016                         return r;
1017
1018                 /* ... and link it in. */
1019                 o->data.next_field_offset = fo->field.head_data_offset;
1020                 fo->field.head_data_offset = le64toh(p);
1021         }
1022
1023 #ifdef HAVE_GCRYPT
1024         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1025         if (r < 0)
1026                 return r;
1027 #endif
1028
1029         if (ret)
1030                 *ret = o;
1031
1032         if (offset)
1033                 *offset = p;
1034
1035         return 0;
1036 }
1037
1038 uint64_t journal_file_entry_n_items(Object *o) {
1039         assert(o);
1040
1041         if (o->object.type != OBJECT_ENTRY)
1042                 return 0;
1043
1044         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1045 }
1046
1047 uint64_t journal_file_entry_array_n_items(Object *o) {
1048         assert(o);
1049
1050         if (o->object.type != OBJECT_ENTRY_ARRAY)
1051                 return 0;
1052
1053         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1054 }
1055
1056 uint64_t journal_file_hash_table_n_items(Object *o) {
1057         assert(o);
1058
1059         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1060             o->object.type != OBJECT_FIELD_HASH_TABLE)
1061                 return 0;
1062
1063         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1064 }
1065
1066 static int link_entry_into_array(JournalFile *f,
1067                                  le64_t *first,
1068                                  le64_t *idx,
1069                                  uint64_t p) {
1070         int r;
1071         uint64_t n = 0, ap = 0, q, i, a, hidx;
1072         Object *o;
1073
1074         assert(f);
1075         assert(first);
1076         assert(idx);
1077         assert(p > 0);
1078
1079         a = le64toh(*first);
1080         i = hidx = le64toh(*idx);
1081         while (a > 0) {
1082
1083                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1084                 if (r < 0)
1085                         return r;
1086
1087                 n = journal_file_entry_array_n_items(o);
1088                 if (i < n) {
1089                         o->entry_array.items[i] = htole64(p);
1090                         *idx = htole64(hidx + 1);
1091                         return 0;
1092                 }
1093
1094                 i -= n;
1095                 ap = a;
1096                 a = le64toh(o->entry_array.next_entry_array_offset);
1097         }
1098
1099         if (hidx > n)
1100                 n = (hidx+1) * 2;
1101         else
1102                 n = n * 2;
1103
1104         if (n < 4)
1105                 n = 4;
1106
1107         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1108                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1109                                        &o, &q);
1110         if (r < 0)
1111                 return r;
1112
1113 #ifdef HAVE_GCRYPT
1114         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1115         if (r < 0)
1116                 return r;
1117 #endif
1118
1119         o->entry_array.items[i] = htole64(p);
1120
1121         if (ap == 0)
1122                 *first = htole64(q);
1123         else {
1124                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1125                 if (r < 0)
1126                         return r;
1127
1128                 o->entry_array.next_entry_array_offset = htole64(q);
1129         }
1130
1131         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1132                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1133
1134         *idx = htole64(hidx + 1);
1135
1136         return 0;
1137 }
1138
1139 static int link_entry_into_array_plus_one(JournalFile *f,
1140                                           le64_t *extra,
1141                                           le64_t *first,
1142                                           le64_t *idx,
1143                                           uint64_t p) {
1144
1145         int r;
1146
1147         assert(f);
1148         assert(extra);
1149         assert(first);
1150         assert(idx);
1151         assert(p > 0);
1152
1153         if (*idx == 0)
1154                 *extra = htole64(p);
1155         else {
1156                 le64_t i;
1157
1158                 i = htole64(le64toh(*idx) - 1);
1159                 r = link_entry_into_array(f, first, &i, p);
1160                 if (r < 0)
1161                         return r;
1162         }
1163
1164         *idx = htole64(le64toh(*idx) + 1);
1165         return 0;
1166 }
1167
1168 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1169         uint64_t p;
1170         int r;
1171         assert(f);
1172         assert(o);
1173         assert(offset > 0);
1174
1175         p = le64toh(o->entry.items[i].object_offset);
1176         if (p == 0)
1177                 return -EINVAL;
1178
1179         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1180         if (r < 0)
1181                 return r;
1182
1183         return link_entry_into_array_plus_one(f,
1184                                               &o->data.entry_offset,
1185                                               &o->data.entry_array_offset,
1186                                               &o->data.n_entries,
1187                                               offset);
1188 }
1189
1190 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1191         uint64_t n, i;
1192         int r;
1193
1194         assert(f);
1195         assert(o);
1196         assert(offset > 0);
1197
1198         if (o->object.type != OBJECT_ENTRY)
1199                 return -EINVAL;
1200
1201         __sync_synchronize();
1202
1203         /* Link up the entry itself */
1204         r = link_entry_into_array(f,
1205                                   &f->header->entry_array_offset,
1206                                   &f->header->n_entries,
1207                                   offset);
1208         if (r < 0)
1209                 return r;
1210
1211         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1212
1213         if (f->header->head_entry_realtime == 0)
1214                 f->header->head_entry_realtime = o->entry.realtime;
1215
1216         f->header->tail_entry_realtime = o->entry.realtime;
1217         f->header->tail_entry_monotonic = o->entry.monotonic;
1218
1219         f->tail_entry_monotonic_valid = true;
1220
1221         /* Link up the items */
1222         n = journal_file_entry_n_items(o);
1223         for (i = 0; i < n; i++) {
1224                 r = journal_file_link_entry_item(f, o, offset, i);
1225                 if (r < 0)
1226                         return r;
1227         }
1228
1229         return 0;
1230 }
1231
1232 static int journal_file_append_entry_internal(
1233                 JournalFile *f,
1234                 const dual_timestamp *ts,
1235                 uint64_t xor_hash,
1236                 const EntryItem items[], unsigned n_items,
1237                 uint64_t *seqnum,
1238                 Object **ret, uint64_t *offset) {
1239         uint64_t np;
1240         uint64_t osize;
1241         Object *o;
1242         int r;
1243
1244         assert(f);
1245         assert(items || n_items == 0);
1246         assert(ts);
1247
1248         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1249
1250         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1251         if (r < 0)
1252                 return r;
1253
1254         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1255         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1256         o->entry.realtime = htole64(ts->realtime);
1257         o->entry.monotonic = htole64(ts->monotonic);
1258         o->entry.xor_hash = htole64(xor_hash);
1259         o->entry.boot_id = f->header->boot_id;
1260
1261 #ifdef HAVE_GCRYPT
1262         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1263         if (r < 0)
1264                 return r;
1265 #endif
1266
1267         r = journal_file_link_entry(f, o, np);
1268         if (r < 0)
1269                 return r;
1270
1271         if (ret)
1272                 *ret = o;
1273
1274         if (offset)
1275                 *offset = np;
1276
1277         return 0;
1278 }
1279
1280 void journal_file_post_change(JournalFile *f) {
1281         assert(f);
1282
1283         /* inotify() does not receive IN_MODIFY events from file
1284          * accesses done via mmap(). After each access we hence
1285          * trigger IN_MODIFY by truncating the journal file to its
1286          * current size which triggers IN_MODIFY. */
1287
1288         __sync_synchronize();
1289
1290         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1291                 log_error("Failed to truncate file to its own size: %m");
1292 }
1293
1294 static int entry_item_cmp(const void *_a, const void *_b) {
1295         const EntryItem *a = _a, *b = _b;
1296
1297         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1298                 return -1;
1299         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1300                 return 1;
1301         return 0;
1302 }
1303
1304 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1305         unsigned i;
1306         EntryItem *items;
1307         int r;
1308         uint64_t xor_hash = 0;
1309         struct dual_timestamp _ts;
1310
1311         assert(f);
1312         assert(iovec || n_iovec == 0);
1313
1314         if (!ts) {
1315                 dual_timestamp_get(&_ts);
1316                 ts = &_ts;
1317         }
1318
1319         if (f->tail_entry_monotonic_valid &&
1320             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1321                 return -EINVAL;
1322
1323 #ifdef HAVE_GCRYPT
1324         r = journal_file_maybe_append_tag(f, ts->realtime);
1325         if (r < 0)
1326                 return r;
1327 #endif
1328
1329         /* alloca() can't take 0, hence let's allocate at least one */
1330         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1331
1332         for (i = 0; i < n_iovec; i++) {
1333                 uint64_t p;
1334                 Object *o;
1335
1336                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1337                 if (r < 0)
1338                         return r;
1339
1340                 xor_hash ^= le64toh(o->data.hash);
1341                 items[i].object_offset = htole64(p);
1342                 items[i].hash = o->data.hash;
1343         }
1344
1345         /* Order by the position on disk, in order to improve seek
1346          * times for rotating media. */
1347         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1348
1349         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1350
1351         journal_file_post_change(f);
1352
1353         return r;
1354 }
1355
1356 typedef struct ChainCacheItem {
1357         uint64_t first; /* the array at the beginning of the chain */
1358         uint64_t array; /* the cached array */
1359         uint64_t begin; /* the first item in the cached array */
1360         uint64_t total; /* the total number of items in all arrays before this one in the chain */
1361         uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
1362 } ChainCacheItem;
1363
1364 static void chain_cache_put(
1365                 Hashmap *h,
1366                 ChainCacheItem *ci,
1367                 uint64_t first,
1368                 uint64_t array,
1369                 uint64_t begin,
1370                 uint64_t total,
1371                 uint64_t last_index) {
1372
1373         if (!ci) {
1374                 /* If the chain item to cache for this chain is the
1375                  * first one it's not worth caching anything */
1376                 if (array == first)
1377                         return;
1378
1379                 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1380                         ci = hashmap_steal_first(h);
1381                 else {
1382                         ci = new(ChainCacheItem, 1);
1383                         if (!ci)
1384                                 return;
1385                 }
1386
1387                 ci->first = first;
1388
1389                 if (hashmap_put(h, &ci->first, ci) < 0) {
1390                         free(ci);
1391                         return;
1392                 }
1393         } else
1394                 assert(ci->first == first);
1395
1396         ci->array = array;
1397         ci->begin = begin;
1398         ci->total = total;
1399         ci->last_index = last_index;
1400 }
1401
1402 static int generic_array_get(
1403                 JournalFile *f,
1404                 uint64_t first,
1405                 uint64_t i,
1406                 Object **ret, uint64_t *offset) {
1407
1408         Object *o;
1409         uint64_t p = 0, a, t = 0;
1410         int r;
1411         ChainCacheItem *ci;
1412
1413         assert(f);
1414
1415         a = first;
1416
1417         /* Try the chain cache first */
1418         ci = hashmap_get(f->chain_cache, &first);
1419         if (ci && i > ci->total) {
1420                 a = ci->array;
1421                 i -= ci->total;
1422                 t = ci->total;
1423         }
1424
1425         while (a > 0) {
1426                 uint64_t k;
1427
1428                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1429                 if (r < 0)
1430                         return r;
1431
1432                 k = journal_file_entry_array_n_items(o);
1433                 if (i < k) {
1434                         p = le64toh(o->entry_array.items[i]);
1435                         goto found;
1436                 }
1437
1438                 i -= k;
1439                 t += k;
1440                 a = le64toh(o->entry_array.next_entry_array_offset);
1441         }
1442
1443         return 0;
1444
1445 found:
1446         /* Let's cache this item for the next invocation */
1447         chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1448
1449         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1450         if (r < 0)
1451                 return r;
1452
1453         if (ret)
1454                 *ret = o;
1455
1456         if (offset)
1457                 *offset = p;
1458
1459         return 1;
1460 }
1461
1462 static int generic_array_get_plus_one(
1463                 JournalFile *f,
1464                 uint64_t extra,
1465                 uint64_t first,
1466                 uint64_t i,
1467                 Object **ret, uint64_t *offset) {
1468
1469         Object *o;
1470
1471         assert(f);
1472
1473         if (i == 0) {
1474                 int r;
1475
1476                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1477                 if (r < 0)
1478                         return r;
1479
1480                 if (ret)
1481                         *ret = o;
1482
1483                 if (offset)
1484                         *offset = extra;
1485
1486                 return 1;
1487         }
1488
1489         return generic_array_get(f, first, i-1, ret, offset);
1490 }
1491
1492 enum {
1493         TEST_FOUND,
1494         TEST_LEFT,
1495         TEST_RIGHT
1496 };
1497
1498 static int generic_array_bisect(
1499                 JournalFile *f,
1500                 uint64_t first,
1501                 uint64_t n,
1502                 uint64_t needle,
1503                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1504                 direction_t direction,
1505                 Object **ret,
1506                 uint64_t *offset,
1507                 uint64_t *idx) {
1508
1509         uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1510         bool subtract_one = false;
1511         Object *o, *array = NULL;
1512         int r;
1513         ChainCacheItem *ci;
1514
1515         assert(f);
1516         assert(test_object);
1517
1518         /* Start with the first array in the chain */
1519         a = first;
1520
1521         ci = hashmap_get(f->chain_cache, &first);
1522         if (ci && n > ci->total) {
1523                 /* Ah, we have iterated this bisection array chain
1524                  * previously! Let's see if we can skip ahead in the
1525                  * chain, as far as the last time. But we can't jump
1526                  * backwards in the chain, so let's check that
1527                  * first. */
1528
1529                 r = test_object(f, ci->begin, needle);
1530                 if (r < 0)
1531                         return r;
1532
1533                 if (r == TEST_LEFT) {
1534                         /* OK, what we are looking for is right of the
1535                          * begin of this EntryArray, so let's jump
1536                          * straight to previously cached array in the
1537                          * chain */
1538
1539                         a = ci->array;
1540                         n -= ci->total;
1541                         t = ci->total;
1542                         last_index = ci->last_index;
1543                 }
1544         }
1545
1546         while (a > 0) {
1547                 uint64_t left, right, k, lp;
1548
1549                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1550                 if (r < 0)
1551                         return r;
1552
1553                 k = journal_file_entry_array_n_items(array);
1554                 right = MIN(k, n);
1555                 if (right <= 0)
1556                         return 0;
1557
1558                 i = right - 1;
1559                 lp = p = le64toh(array->entry_array.items[i]);
1560                 if (p <= 0)
1561                         return -EBADMSG;
1562
1563                 r = test_object(f, p, needle);
1564                 if (r < 0)
1565                         return r;
1566
1567                 if (r == TEST_FOUND)
1568                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1569
1570                 if (r == TEST_RIGHT) {
1571                         left = 0;
1572                         right -= 1;
1573
1574                         if (last_index != (uint64_t) -1) {
1575                                 assert(last_index <= right);
1576
1577                                 /* If we cached the last index we
1578                                  * looked at, let's try to not to jump
1579                                  * too wildly around and see if we can
1580                                  * limit the range to look at early to
1581                                  * the immediate neighbors of the last
1582                                  * index we looked at. */
1583
1584                                 if (last_index > 0) {
1585                                         uint64_t x = last_index - 1;
1586
1587                                         p = le64toh(array->entry_array.items[x]);
1588                                         if (p <= 0)
1589                                                 return -EBADMSG;
1590
1591                                         r = test_object(f, p, needle);
1592                                         if (r < 0)
1593                                                 return r;
1594
1595                                         if (r == TEST_FOUND)
1596                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1597
1598                                         if (r == TEST_RIGHT)
1599                                                 right = x;
1600                                         else
1601                                                 left = x + 1;
1602                                 }
1603
1604                                 if (last_index < right) {
1605                                         uint64_t y = last_index + 1;
1606
1607                                         p = le64toh(array->entry_array.items[y]);
1608                                         if (p <= 0)
1609                                                 return -EBADMSG;
1610
1611                                         r = test_object(f, p, needle);
1612                                         if (r < 0)
1613                                                 return r;
1614
1615                                         if (r == TEST_FOUND)
1616                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1617
1618                                         if (r == TEST_RIGHT)
1619                                                 right = y;
1620                                         else
1621                                                 left = y + 1;
1622                                 }
1623                         }
1624
1625                         for (;;) {
1626                                 if (left == right) {
1627                                         if (direction == DIRECTION_UP)
1628                                                 subtract_one = true;
1629
1630                                         i = left;
1631                                         goto found;
1632                                 }
1633
1634                                 assert(left < right);
1635                                 i = (left + right) / 2;
1636
1637                                 p = le64toh(array->entry_array.items[i]);
1638                                 if (p <= 0)
1639                                         return -EBADMSG;
1640
1641                                 r = test_object(f, p, needle);
1642                                 if (r < 0)
1643                                         return r;
1644
1645                                 if (r == TEST_FOUND)
1646                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1647
1648                                 if (r == TEST_RIGHT)
1649                                         right = i;
1650                                 else
1651                                         left = i + 1;
1652                         }
1653                 }
1654
1655                 if (k > n) {
1656                         if (direction == DIRECTION_UP) {
1657                                 i = n;
1658                                 subtract_one = true;
1659                                 goto found;
1660                         }
1661
1662                         return 0;
1663                 }
1664
1665                 last_p = lp;
1666
1667                 n -= k;
1668                 t += k;
1669                 last_index = (uint64_t) -1;
1670                 a = le64toh(array->entry_array.next_entry_array_offset);
1671         }
1672
1673         return 0;
1674
1675 found:
1676         if (subtract_one && t == 0 && i == 0)
1677                 return 0;
1678
1679         /* Let's cache this item for the next invocation */
1680         chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1681
1682         if (subtract_one && i == 0)
1683                 p = last_p;
1684         else if (subtract_one)
1685                 p = le64toh(array->entry_array.items[i-1]);
1686         else
1687                 p = le64toh(array->entry_array.items[i]);
1688
1689         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1690         if (r < 0)
1691                 return r;
1692
1693         if (ret)
1694                 *ret = o;
1695
1696         if (offset)
1697                 *offset = p;
1698
1699         if (idx)
1700                 *idx = t + i + (subtract_one ? -1 : 0);
1701
1702         return 1;
1703 }
1704
1705
1706 static int generic_array_bisect_plus_one(
1707                 JournalFile *f,
1708                 uint64_t extra,
1709                 uint64_t first,
1710                 uint64_t n,
1711                 uint64_t needle,
1712                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1713                 direction_t direction,
1714                 Object **ret,
1715                 uint64_t *offset,
1716                 uint64_t *idx) {
1717
1718         int r;
1719         bool step_back = false;
1720         Object *o;
1721
1722         assert(f);
1723         assert(test_object);
1724
1725         if (n <= 0)
1726                 return 0;
1727
1728         /* This bisects the array in object 'first', but first checks
1729          * an extra  */
1730         r = test_object(f, extra, needle);
1731         if (r < 0)
1732                 return r;
1733
1734         if (r == TEST_FOUND)
1735                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1736
1737         /* if we are looking with DIRECTION_UP then we need to first
1738            see if in the actual array there is a matching entry, and
1739            return the last one of that. But if there isn't any we need
1740            to return this one. Hence remember this, and return it
1741            below. */
1742         if (r == TEST_LEFT)
1743                 step_back = direction == DIRECTION_UP;
1744
1745         if (r == TEST_RIGHT) {
1746                 if (direction == DIRECTION_DOWN)
1747                         goto found;
1748                 else
1749                         return 0;
1750         }
1751
1752         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1753
1754         if (r == 0 && step_back)
1755                 goto found;
1756
1757         if (r > 0 && idx)
1758                 (*idx) ++;
1759
1760         return r;
1761
1762 found:
1763         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1764         if (r < 0)
1765                 return r;
1766
1767         if (ret)
1768                 *ret = o;
1769
1770         if (offset)
1771                 *offset = extra;
1772
1773         if (idx)
1774                 *idx = 0;
1775
1776         return 1;
1777 }
1778
1779 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1780         assert(f);
1781         assert(p > 0);
1782
1783         if (p == needle)
1784                 return TEST_FOUND;
1785         else if (p < needle)
1786                 return TEST_LEFT;
1787         else
1788                 return TEST_RIGHT;
1789 }
1790
1791 int journal_file_move_to_entry_by_offset(
1792                 JournalFile *f,
1793                 uint64_t p,
1794                 direction_t direction,
1795                 Object **ret,
1796                 uint64_t *offset) {
1797
1798         return generic_array_bisect(f,
1799                                     le64toh(f->header->entry_array_offset),
1800                                     le64toh(f->header->n_entries),
1801                                     p,
1802                                     test_object_offset,
1803                                     direction,
1804                                     ret, offset, NULL);
1805 }
1806
1807
1808 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1809         Object *o;
1810         int r;
1811
1812         assert(f);
1813         assert(p > 0);
1814
1815         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1816         if (r < 0)
1817                 return r;
1818
1819         if (le64toh(o->entry.seqnum) == needle)
1820                 return TEST_FOUND;
1821         else if (le64toh(o->entry.seqnum) < needle)
1822                 return TEST_LEFT;
1823         else
1824                 return TEST_RIGHT;
1825 }
1826
1827 int journal_file_move_to_entry_by_seqnum(
1828                 JournalFile *f,
1829                 uint64_t seqnum,
1830                 direction_t direction,
1831                 Object **ret,
1832                 uint64_t *offset) {
1833
1834         return generic_array_bisect(f,
1835                                     le64toh(f->header->entry_array_offset),
1836                                     le64toh(f->header->n_entries),
1837                                     seqnum,
1838                                     test_object_seqnum,
1839                                     direction,
1840                                     ret, offset, NULL);
1841 }
1842
1843 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1844         Object *o;
1845         int r;
1846
1847         assert(f);
1848         assert(p > 0);
1849
1850         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1851         if (r < 0)
1852                 return r;
1853
1854         if (le64toh(o->entry.realtime) == needle)
1855                 return TEST_FOUND;
1856         else if (le64toh(o->entry.realtime) < needle)
1857                 return TEST_LEFT;
1858         else
1859                 return TEST_RIGHT;
1860 }
1861
1862 int journal_file_move_to_entry_by_realtime(
1863                 JournalFile *f,
1864                 uint64_t realtime,
1865                 direction_t direction,
1866                 Object **ret,
1867                 uint64_t *offset) {
1868
1869         return generic_array_bisect(f,
1870                                     le64toh(f->header->entry_array_offset),
1871                                     le64toh(f->header->n_entries),
1872                                     realtime,
1873                                     test_object_realtime,
1874                                     direction,
1875                                     ret, offset, NULL);
1876 }
1877
1878 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1879         Object *o;
1880         int r;
1881
1882         assert(f);
1883         assert(p > 0);
1884
1885         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1886         if (r < 0)
1887                 return r;
1888
1889         if (le64toh(o->entry.monotonic) == needle)
1890                 return TEST_FOUND;
1891         else if (le64toh(o->entry.monotonic) < needle)
1892                 return TEST_LEFT;
1893         else
1894                 return TEST_RIGHT;
1895 }
1896
1897 static inline int find_data_object_by_boot_id(
1898                 JournalFile *f,
1899                 sd_id128_t boot_id,
1900                 Object **o,
1901                 uint64_t *b) {
1902         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1903
1904         sd_id128_to_string(boot_id, t + 9);
1905         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1906 }
1907
1908 int journal_file_move_to_entry_by_monotonic(
1909                 JournalFile *f,
1910                 sd_id128_t boot_id,
1911                 uint64_t monotonic,
1912                 direction_t direction,
1913                 Object **ret,
1914                 uint64_t *offset) {
1915
1916         Object *o;
1917         int r;
1918
1919         assert(f);
1920
1921         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1922         if (r < 0)
1923                 return r;
1924         if (r == 0)
1925                 return -ENOENT;
1926
1927         return generic_array_bisect_plus_one(f,
1928                                              le64toh(o->data.entry_offset),
1929                                              le64toh(o->data.entry_array_offset),
1930                                              le64toh(o->data.n_entries),
1931                                              monotonic,
1932                                              test_object_monotonic,
1933                                              direction,
1934                                              ret, offset, NULL);
1935 }
1936
1937 int journal_file_next_entry(
1938                 JournalFile *f,
1939                 Object *o, uint64_t p,
1940                 direction_t direction,
1941                 Object **ret, uint64_t *offset) {
1942
1943         uint64_t i, n, ofs;
1944         int r;
1945
1946         assert(f);
1947         assert(p > 0 || !o);
1948
1949         n = le64toh(f->header->n_entries);
1950         if (n <= 0)
1951                 return 0;
1952
1953         if (!o)
1954                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1955         else {
1956                 if (o->object.type != OBJECT_ENTRY)
1957                         return -EINVAL;
1958
1959                 r = generic_array_bisect(f,
1960                                          le64toh(f->header->entry_array_offset),
1961                                          le64toh(f->header->n_entries),
1962                                          p,
1963                                          test_object_offset,
1964                                          DIRECTION_DOWN,
1965                                          NULL, NULL,
1966                                          &i);
1967                 if (r <= 0)
1968                         return r;
1969
1970                 if (direction == DIRECTION_DOWN) {
1971                         if (i >= n - 1)
1972                                 return 0;
1973
1974                         i++;
1975                 } else {
1976                         if (i <= 0)
1977                                 return 0;
1978
1979                         i--;
1980                 }
1981         }
1982
1983         /* And jump to it */
1984         r = generic_array_get(f,
1985                               le64toh(f->header->entry_array_offset),
1986                               i,
1987                               ret, &ofs);
1988         if (r <= 0)
1989                 return r;
1990
1991         if (p > 0 &&
1992             (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
1993                 log_debug("%s: entry array corrupted at entry %"PRIu64,
1994                           f->path, i);
1995                 return -EBADMSG;
1996         }
1997
1998         if (offset)
1999                 *offset = ofs;
2000
2001         return 1;
2002 }
2003
2004 int journal_file_skip_entry(
2005                 JournalFile *f,
2006                 Object *o, uint64_t p,
2007                 int64_t skip,
2008                 Object **ret, uint64_t *offset) {
2009
2010         uint64_t i, n;
2011         int r;
2012
2013         assert(f);
2014         assert(o);
2015         assert(p > 0);
2016
2017         if (o->object.type != OBJECT_ENTRY)
2018                 return -EINVAL;
2019
2020         r = generic_array_bisect(f,
2021                                  le64toh(f->header->entry_array_offset),
2022                                  le64toh(f->header->n_entries),
2023                                  p,
2024                                  test_object_offset,
2025                                  DIRECTION_DOWN,
2026                                  NULL, NULL,
2027                                  &i);
2028         if (r <= 0)
2029                 return r;
2030
2031         /* Calculate new index */
2032         if (skip < 0) {
2033                 if ((uint64_t) -skip >= i)
2034                         i = 0;
2035                 else
2036                         i = i - (uint64_t) -skip;
2037         } else
2038                 i  += (uint64_t) skip;
2039
2040         n = le64toh(f->header->n_entries);
2041         if (n <= 0)
2042                 return -EBADMSG;
2043
2044         if (i >= n)
2045                 i = n-1;
2046
2047         return generic_array_get(f,
2048                                  le64toh(f->header->entry_array_offset),
2049                                  i,
2050                                  ret, offset);
2051 }
2052
2053 int journal_file_next_entry_for_data(
2054                 JournalFile *f,
2055                 Object *o, uint64_t p,
2056                 uint64_t data_offset,
2057                 direction_t direction,
2058                 Object **ret, uint64_t *offset) {
2059
2060         uint64_t n, i;
2061         int r;
2062         Object *d;
2063
2064         assert(f);
2065         assert(p > 0 || !o);
2066
2067         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2068         if (r < 0)
2069                 return r;
2070
2071         n = le64toh(d->data.n_entries);
2072         if (n <= 0)
2073                 return n;
2074
2075         if (!o)
2076                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2077         else {
2078                 if (o->object.type != OBJECT_ENTRY)
2079                         return -EINVAL;
2080
2081                 r = generic_array_bisect_plus_one(f,
2082                                                   le64toh(d->data.entry_offset),
2083                                                   le64toh(d->data.entry_array_offset),
2084                                                   le64toh(d->data.n_entries),
2085                                                   p,
2086                                                   test_object_offset,
2087                                                   DIRECTION_DOWN,
2088                                                   NULL, NULL,
2089                                                   &i);
2090
2091                 if (r <= 0)
2092                         return r;
2093
2094                 if (direction == DIRECTION_DOWN) {
2095                         if (i >= n - 1)
2096                                 return 0;
2097
2098                         i++;
2099                 } else {
2100                         if (i <= 0)
2101                                 return 0;
2102
2103                         i--;
2104                 }
2105
2106         }
2107
2108         return generic_array_get_plus_one(f,
2109                                           le64toh(d->data.entry_offset),
2110                                           le64toh(d->data.entry_array_offset),
2111                                           i,
2112                                           ret, offset);
2113 }
2114
2115 int journal_file_move_to_entry_by_offset_for_data(
2116                 JournalFile *f,
2117                 uint64_t data_offset,
2118                 uint64_t p,
2119                 direction_t direction,
2120                 Object **ret, uint64_t *offset) {
2121
2122         int r;
2123         Object *d;
2124
2125         assert(f);
2126
2127         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2128         if (r < 0)
2129                 return r;
2130
2131         return generic_array_bisect_plus_one(f,
2132                                              le64toh(d->data.entry_offset),
2133                                              le64toh(d->data.entry_array_offset),
2134                                              le64toh(d->data.n_entries),
2135                                              p,
2136                                              test_object_offset,
2137                                              direction,
2138                                              ret, offset, NULL);
2139 }
2140
2141 int journal_file_move_to_entry_by_monotonic_for_data(
2142                 JournalFile *f,
2143                 uint64_t data_offset,
2144                 sd_id128_t boot_id,
2145                 uint64_t monotonic,
2146                 direction_t direction,
2147                 Object **ret, uint64_t *offset) {
2148
2149         Object *o, *d;
2150         int r;
2151         uint64_t b, z;
2152
2153         assert(f);
2154
2155         /* First, seek by time */
2156         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2157         if (r < 0)
2158                 return r;
2159         if (r == 0)
2160                 return -ENOENT;
2161
2162         r = generic_array_bisect_plus_one(f,
2163                                           le64toh(o->data.entry_offset),
2164                                           le64toh(o->data.entry_array_offset),
2165                                           le64toh(o->data.n_entries),
2166                                           monotonic,
2167                                           test_object_monotonic,
2168                                           direction,
2169                                           NULL, &z, NULL);
2170         if (r <= 0)
2171                 return r;
2172
2173         /* And now, continue seeking until we find an entry that
2174          * exists in both bisection arrays */
2175
2176         for (;;) {
2177                 Object *qo;
2178                 uint64_t p, q;
2179
2180                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2181                 if (r < 0)
2182                         return r;
2183
2184                 r = generic_array_bisect_plus_one(f,
2185                                                   le64toh(d->data.entry_offset),
2186                                                   le64toh(d->data.entry_array_offset),
2187                                                   le64toh(d->data.n_entries),
2188                                                   z,
2189                                                   test_object_offset,
2190                                                   direction,
2191                                                   NULL, &p, NULL);
2192                 if (r <= 0)
2193                         return r;
2194
2195                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2196                 if (r < 0)
2197                         return r;
2198
2199                 r = generic_array_bisect_plus_one(f,
2200                                                   le64toh(o->data.entry_offset),
2201                                                   le64toh(o->data.entry_array_offset),
2202                                                   le64toh(o->data.n_entries),
2203                                                   p,
2204                                                   test_object_offset,
2205                                                   direction,
2206                                                   &qo, &q, NULL);
2207
2208                 if (r <= 0)
2209                         return r;
2210
2211                 if (p == q) {
2212                         if (ret)
2213                                 *ret = qo;
2214                         if (offset)
2215                                 *offset = q;
2216
2217                         return 1;
2218                 }
2219
2220                 z = q;
2221         }
2222 }
2223
2224 int journal_file_move_to_entry_by_seqnum_for_data(
2225                 JournalFile *f,
2226                 uint64_t data_offset,
2227                 uint64_t seqnum,
2228                 direction_t direction,
2229                 Object **ret, uint64_t *offset) {
2230
2231         Object *d;
2232         int r;
2233
2234         assert(f);
2235
2236         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2237         if (r < 0)
2238                 return r;
2239
2240         return generic_array_bisect_plus_one(f,
2241                                              le64toh(d->data.entry_offset),
2242                                              le64toh(d->data.entry_array_offset),
2243                                              le64toh(d->data.n_entries),
2244                                              seqnum,
2245                                              test_object_seqnum,
2246                                              direction,
2247                                              ret, offset, NULL);
2248 }
2249
2250 int journal_file_move_to_entry_by_realtime_for_data(
2251                 JournalFile *f,
2252                 uint64_t data_offset,
2253                 uint64_t realtime,
2254                 direction_t direction,
2255                 Object **ret, uint64_t *offset) {
2256
2257         Object *d;
2258         int r;
2259
2260         assert(f);
2261
2262         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2263         if (r < 0)
2264                 return r;
2265
2266         return generic_array_bisect_plus_one(f,
2267                                              le64toh(d->data.entry_offset),
2268                                              le64toh(d->data.entry_array_offset),
2269                                              le64toh(d->data.n_entries),
2270                                              realtime,
2271                                              test_object_realtime,
2272                                              direction,
2273                                              ret, offset, NULL);
2274 }
2275
2276 void journal_file_dump(JournalFile *f) {
2277         Object *o;
2278         int r;
2279         uint64_t p;
2280
2281         assert(f);
2282
2283         journal_file_print_header(f);
2284
2285         p = le64toh(f->header->header_size);
2286         while (p != 0) {
2287                 r = journal_file_move_to_object(f, -1, p, &o);
2288                 if (r < 0)
2289                         goto fail;
2290
2291                 switch (o->object.type) {
2292
2293                 case OBJECT_UNUSED:
2294                         printf("Type: OBJECT_UNUSED\n");
2295                         break;
2296
2297                 case OBJECT_DATA:
2298                         printf("Type: OBJECT_DATA\n");
2299                         break;
2300
2301                 case OBJECT_FIELD:
2302                         printf("Type: OBJECT_FIELD\n");
2303                         break;
2304
2305                 case OBJECT_ENTRY:
2306                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2307                                le64toh(o->entry.seqnum),
2308                                le64toh(o->entry.monotonic),
2309                                le64toh(o->entry.realtime));
2310                         break;
2311
2312                 case OBJECT_FIELD_HASH_TABLE:
2313                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2314                         break;
2315
2316                 case OBJECT_DATA_HASH_TABLE:
2317                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2318                         break;
2319
2320                 case OBJECT_ENTRY_ARRAY:
2321                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2322                         break;
2323
2324                 case OBJECT_TAG:
2325                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2326                                le64toh(o->tag.seqnum),
2327                                le64toh(o->tag.epoch));
2328                         break;
2329
2330                 default:
2331                         printf("Type: unknown (%u)\n", o->object.type);
2332                         break;
2333                 }
2334
2335                 if (o->object.flags & OBJECT_COMPRESSED)
2336                         printf("Flags: COMPRESSED\n");
2337
2338                 if (p == le64toh(f->header->tail_object_offset))
2339                         p = 0;
2340                 else
2341                         p = p + ALIGN64(le64toh(o->object.size));
2342         }
2343
2344         return;
2345 fail:
2346         log_error("File corrupt");
2347 }
2348
2349 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2350         const char *x;
2351
2352         x = format_timestamp(buf, l, t);
2353         if (x)
2354                 return x;
2355         return " --- ";
2356 }
2357
2358 void journal_file_print_header(JournalFile *f) {
2359         char a[33], b[33], c[33], d[33];
2360         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2361         struct stat st;
2362         char bytes[FORMAT_BYTES_MAX];
2363
2364         assert(f);
2365
2366         printf("File Path: %s\n"
2367                "File ID: %s\n"
2368                "Machine ID: %s\n"
2369                "Boot ID: %s\n"
2370                "Sequential Number ID: %s\n"
2371                "State: %s\n"
2372                "Compatible Flags:%s%s\n"
2373                "Incompatible Flags:%s%s\n"
2374                "Header size: %"PRIu64"\n"
2375                "Arena size: %"PRIu64"\n"
2376                "Data Hash Table Size: %"PRIu64"\n"
2377                "Field Hash Table Size: %"PRIu64"\n"
2378                "Rotate Suggested: %s\n"
2379                "Head Sequential Number: %"PRIu64"\n"
2380                "Tail Sequential Number: %"PRIu64"\n"
2381                "Head Realtime Timestamp: %s\n"
2382                "Tail Realtime Timestamp: %s\n"
2383                "Tail Monotonic Timestamp: %s\n"
2384                "Objects: %"PRIu64"\n"
2385                "Entry Objects: %"PRIu64"\n",
2386                f->path,
2387                sd_id128_to_string(f->header->file_id, a),
2388                sd_id128_to_string(f->header->machine_id, b),
2389                sd_id128_to_string(f->header->boot_id, c),
2390                sd_id128_to_string(f->header->seqnum_id, d),
2391                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2392                f->header->state == STATE_ONLINE ? "ONLINE" :
2393                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2394                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2395                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2396                JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2397                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2398                le64toh(f->header->header_size),
2399                le64toh(f->header->arena_size),
2400                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2401                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2402                yes_no(journal_file_rotate_suggested(f, 0)),
2403                le64toh(f->header->head_entry_seqnum),
2404                le64toh(f->header->tail_entry_seqnum),
2405                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2406                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2407                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2408                le64toh(f->header->n_objects),
2409                le64toh(f->header->n_entries));
2410
2411         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2412                 printf("Data Objects: %"PRIu64"\n"
2413                        "Data Hash Table Fill: %.1f%%\n",
2414                        le64toh(f->header->n_data),
2415                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2416
2417         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2418                 printf("Field Objects: %"PRIu64"\n"
2419                        "Field Hash Table Fill: %.1f%%\n",
2420                        le64toh(f->header->n_fields),
2421                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2422
2423         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2424                 printf("Tag Objects: %"PRIu64"\n",
2425                        le64toh(f->header->n_tags));
2426         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2427                 printf("Entry Array Objects: %"PRIu64"\n",
2428                        le64toh(f->header->n_entry_arrays));
2429
2430         if (fstat(f->fd, &st) >= 0)
2431                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2432 }
2433
2434 int journal_file_open(
2435                 const char *fname,
2436                 int flags,
2437                 mode_t mode,
2438                 bool compress,
2439                 bool seal,
2440                 JournalMetrics *metrics,
2441                 MMapCache *mmap_cache,
2442                 JournalFile *template,
2443                 JournalFile **ret) {
2444
2445         JournalFile *f;
2446         int r;
2447         bool newly_created = false;
2448
2449         assert(fname);
2450         assert(ret);
2451
2452         if ((flags & O_ACCMODE) != O_RDONLY &&
2453             (flags & O_ACCMODE) != O_RDWR)
2454                 return -EINVAL;
2455
2456         if (!endswith(fname, ".journal") &&
2457             !endswith(fname, ".journal~"))
2458                 return -EINVAL;
2459
2460         f = new0(JournalFile, 1);
2461         if (!f)
2462                 return -ENOMEM;
2463
2464         f->fd = -1;
2465         f->mode = mode;
2466
2467         f->flags = flags;
2468         f->prot = prot_from_flags(flags);
2469         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2470 #ifdef HAVE_XZ
2471         f->compress = compress;
2472 #endif
2473 #ifdef HAVE_GCRYPT
2474         f->seal = seal;
2475 #endif
2476
2477         if (mmap_cache)
2478                 f->mmap = mmap_cache_ref(mmap_cache);
2479         else {
2480                 f->mmap = mmap_cache_new();
2481                 if (!f->mmap) {
2482                         r = -ENOMEM;
2483                         goto fail;
2484                 }
2485         }
2486
2487         f->path = strdup(fname);
2488         if (!f->path) {
2489                 r = -ENOMEM;
2490                 goto fail;
2491         }
2492
2493         f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2494         if (!f->chain_cache) {
2495                 r = -ENOMEM;
2496                 goto fail;
2497         }
2498
2499         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2500         if (f->fd < 0) {
2501                 r = -errno;
2502                 goto fail;
2503         }
2504
2505         if (fstat(f->fd, &f->last_stat) < 0) {
2506                 r = -errno;
2507                 goto fail;
2508         }
2509
2510         if (f->last_stat.st_size == 0 && f->writable) {
2511                 uint64_t crtime;
2512
2513                 /* Let's attach the creation time to the journal file,
2514                  * so that the vacuuming code knows the age of this
2515                  * file even if the file might end up corrupted one
2516                  * day... Ideally we'd just use the creation time many
2517                  * file systems maintain for each file, but there is
2518                  * currently no usable API to query this, hence let's
2519                  * emulate this via extended attributes. If extended
2520                  * attributes are not supported we'll just skip this,
2521                  * and rely solely on mtime/atime/ctime of the file.*/
2522
2523                 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2524                 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2525
2526 #ifdef HAVE_GCRYPT
2527                 /* Try to load the FSPRG state, and if we can't, then
2528                  * just don't do sealing */
2529                 if (f->seal) {
2530                         r = journal_file_fss_load(f);
2531                         if (r < 0)
2532                                 f->seal = false;
2533                 }
2534 #endif
2535
2536                 r = journal_file_init_header(f, template);
2537                 if (r < 0)
2538                         goto fail;
2539
2540                 if (fstat(f->fd, &f->last_stat) < 0) {
2541                         r = -errno;
2542                         goto fail;
2543                 }
2544
2545                 newly_created = true;
2546         }
2547
2548         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2549                 r = -EIO;
2550                 goto fail;
2551         }
2552
2553         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2554         if (f->header == MAP_FAILED) {
2555                 f->header = NULL;
2556                 r = -errno;
2557                 goto fail;
2558         }
2559
2560         if (!newly_created) {
2561                 r = journal_file_verify_header(f);
2562                 if (r < 0)
2563                         goto fail;
2564         }
2565
2566 #ifdef HAVE_GCRYPT
2567         if (!newly_created && f->writable) {
2568                 r = journal_file_fss_load(f);
2569                 if (r < 0)
2570                         goto fail;
2571         }
2572 #endif
2573
2574         if (f->writable) {
2575                 if (metrics) {
2576                         journal_default_metrics(metrics, f->fd);
2577                         f->metrics = *metrics;
2578                 } else if (template)
2579                         f->metrics = template->metrics;
2580
2581                 r = journal_file_refresh_header(f);
2582                 if (r < 0)
2583                         goto fail;
2584         }
2585
2586 #ifdef HAVE_GCRYPT
2587         r = journal_file_hmac_setup(f);
2588         if (r < 0)
2589                 goto fail;
2590 #endif
2591
2592         if (newly_created) {
2593                 r = journal_file_setup_field_hash_table(f);
2594                 if (r < 0)
2595                         goto fail;
2596
2597                 r = journal_file_setup_data_hash_table(f);
2598                 if (r < 0)
2599                         goto fail;
2600
2601 #ifdef HAVE_GCRYPT
2602                 r = journal_file_append_first_tag(f);
2603                 if (r < 0)
2604                         goto fail;
2605 #endif
2606         }
2607
2608         r = journal_file_map_field_hash_table(f);
2609         if (r < 0)
2610                 goto fail;
2611
2612         r = journal_file_map_data_hash_table(f);
2613         if (r < 0)
2614                 goto fail;
2615
2616         *ret = f;
2617         return 0;
2618
2619 fail:
2620         journal_file_close(f);
2621
2622         return r;
2623 }
2624
2625 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2626         _cleanup_free_ char *p = NULL;
2627         size_t l;
2628         JournalFile *old_file, *new_file = NULL;
2629         int r;
2630
2631         assert(f);
2632         assert(*f);
2633
2634         old_file = *f;
2635
2636         if (!old_file->writable)
2637                 return -EINVAL;
2638
2639         if (!endswith(old_file->path, ".journal"))
2640                 return -EINVAL;
2641
2642         l = strlen(old_file->path);
2643         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2644                      (int) l - 8, old_file->path,
2645                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2646                      le64toh((*f)->header->head_entry_seqnum),
2647                      le64toh((*f)->header->head_entry_realtime));
2648         if (r < 0)
2649                 return -ENOMEM;
2650
2651         r = rename(old_file->path, p);
2652         if (r < 0)
2653                 return -errno;
2654
2655         old_file->header->state = STATE_ARCHIVED;
2656
2657         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2658         journal_file_close(old_file);
2659
2660         *f = new_file;
2661         return r;
2662 }
2663
2664 int journal_file_open_reliably(
2665                 const char *fname,
2666                 int flags,
2667                 mode_t mode,
2668                 bool compress,
2669                 bool seal,
2670                 JournalMetrics *metrics,
2671                 MMapCache *mmap_cache,
2672                 JournalFile *template,
2673                 JournalFile **ret) {
2674
2675         int r;
2676         size_t l;
2677         _cleanup_free_ char *p = NULL;
2678
2679         r = journal_file_open(fname, flags, mode, compress, seal,
2680                               metrics, mmap_cache, template, ret);
2681         if (r != -EBADMSG && /* corrupted */
2682             r != -ENODATA && /* truncated */
2683             r != -EHOSTDOWN && /* other machine */
2684             r != -EPROTONOSUPPORT && /* incompatible feature */
2685             r != -EBUSY && /* unclean shutdown */
2686             r != -ESHUTDOWN /* already archived */)
2687                 return r;
2688
2689         if ((flags & O_ACCMODE) == O_RDONLY)
2690                 return r;
2691
2692         if (!(flags & O_CREAT))
2693                 return r;
2694
2695         if (!endswith(fname, ".journal"))
2696                 return r;
2697
2698         /* The file is corrupted. Rotate it away and try it again (but only once) */
2699
2700         l = strlen(fname);
2701         if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2702                      (int) l - 8, fname,
2703                      (unsigned long long) now(CLOCK_REALTIME),
2704                      random_u64()) < 0)
2705                 return -ENOMEM;
2706
2707         r = rename(fname, p);
2708         if (r < 0)
2709                 return -errno;
2710
2711         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2712
2713         return journal_file_open(fname, flags, mode, compress, seal,
2714                                  metrics, mmap_cache, template, ret);
2715 }
2716
2717 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2718         uint64_t i, n;
2719         uint64_t q, xor_hash = 0;
2720         int r;
2721         EntryItem *items;
2722         dual_timestamp ts;
2723
2724         assert(from);
2725         assert(to);
2726         assert(o);
2727         assert(p);
2728
2729         if (!to->writable)
2730                 return -EPERM;
2731
2732         ts.monotonic = le64toh(o->entry.monotonic);
2733         ts.realtime = le64toh(o->entry.realtime);
2734
2735         n = journal_file_entry_n_items(o);
2736         /* alloca() can't take 0, hence let's allocate at least one */
2737         items = alloca(sizeof(EntryItem) * MAX(1u, n));
2738
2739         for (i = 0; i < n; i++) {
2740                 uint64_t l, h;
2741                 le64_t le_hash;
2742                 size_t t;
2743                 void *data;
2744                 Object *u;
2745
2746                 q = le64toh(o->entry.items[i].object_offset);
2747                 le_hash = o->entry.items[i].hash;
2748
2749                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2750                 if (r < 0)
2751                         return r;
2752
2753                 if (le_hash != o->data.hash)
2754                         return -EBADMSG;
2755
2756                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2757                 t = (size_t) l;
2758
2759                 /* We hit the limit on 32bit machines */
2760                 if ((uint64_t) t != l)
2761                         return -E2BIG;
2762
2763                 if (o->object.flags & OBJECT_COMPRESSED) {
2764 #ifdef HAVE_XZ
2765                         uint64_t rsize;
2766
2767                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0))
2768                                 return -EBADMSG;
2769
2770                         data = from->compress_buffer;
2771                         l = rsize;
2772 #else
2773                         return -EPROTONOSUPPORT;
2774 #endif
2775                 } else
2776                         data = o->data.payload;
2777
2778                 r = journal_file_append_data(to, data, l, &u, &h);
2779                 if (r < 0)
2780                         return r;
2781
2782                 xor_hash ^= le64toh(u->data.hash);
2783                 items[i].object_offset = htole64(h);
2784                 items[i].hash = u->data.hash;
2785
2786                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2787                 if (r < 0)
2788                         return r;
2789         }
2790
2791         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2792 }
2793
2794 void journal_default_metrics(JournalMetrics *m, int fd) {
2795         uint64_t fs_size = 0;
2796         struct statvfs ss;
2797         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2798
2799         assert(m);
2800         assert(fd >= 0);
2801
2802         if (fstatvfs(fd, &ss) >= 0)
2803                 fs_size = ss.f_frsize * ss.f_blocks;
2804
2805         if (m->max_use == (uint64_t) -1) {
2806
2807                 if (fs_size > 0) {
2808                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2809
2810                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2811                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2812
2813                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2814                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2815                 } else
2816                         m->max_use = DEFAULT_MAX_USE_LOWER;
2817         } else {
2818                 m->max_use = PAGE_ALIGN(m->max_use);
2819
2820                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2821                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2822         }
2823
2824         if (m->max_size == (uint64_t) -1) {
2825                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2826
2827                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2828                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2829         } else
2830                 m->max_size = PAGE_ALIGN(m->max_size);
2831
2832         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2833                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2834
2835         if (m->max_size*2 > m->max_use)
2836                 m->max_use = m->max_size*2;
2837
2838         if (m->min_size == (uint64_t) -1)
2839                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2840         else {
2841                 m->min_size = PAGE_ALIGN(m->min_size);
2842
2843                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2844                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2845
2846                 if (m->min_size > m->max_size)
2847                         m->max_size = m->min_size;
2848         }
2849
2850         if (m->keep_free == (uint64_t) -1) {
2851
2852                 if (fs_size > 0) {
2853                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2854
2855                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2856                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2857
2858                 } else
2859                         m->keep_free = DEFAULT_KEEP_FREE;
2860         }
2861
2862         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2863                   format_bytes(a, sizeof(a), m->max_use),
2864                   format_bytes(b, sizeof(b), m->max_size),
2865                   format_bytes(c, sizeof(c), m->min_size),
2866                   format_bytes(d, sizeof(d), m->keep_free));
2867 }
2868
2869 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2870         assert(f);
2871         assert(from || to);
2872
2873         if (from) {
2874                 if (f->header->head_entry_realtime == 0)
2875                         return -ENOENT;
2876
2877                 *from = le64toh(f->header->head_entry_realtime);
2878         }
2879
2880         if (to) {
2881                 if (f->header->tail_entry_realtime == 0)
2882                         return -ENOENT;
2883
2884                 *to = le64toh(f->header->tail_entry_realtime);
2885         }
2886
2887         return 1;
2888 }
2889
2890 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2891         Object *o;
2892         uint64_t p;
2893         int r;
2894
2895         assert(f);
2896         assert(from || to);
2897
2898         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2899         if (r <= 0)
2900                 return r;
2901
2902         if (le64toh(o->data.n_entries) <= 0)
2903                 return 0;
2904
2905         if (from) {
2906                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2907                 if (r < 0)
2908                         return r;
2909
2910                 *from = le64toh(o->entry.monotonic);
2911         }
2912
2913         if (to) {
2914                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2915                 if (r < 0)
2916                         return r;
2917
2918                 r = generic_array_get_plus_one(f,
2919                                                le64toh(o->data.entry_offset),
2920                                                le64toh(o->data.entry_array_offset),
2921                                                le64toh(o->data.n_entries)-1,
2922                                                &o, NULL);
2923                 if (r <= 0)
2924                         return r;
2925
2926                 *to = le64toh(o->entry.monotonic);
2927         }
2928
2929         return 1;
2930 }
2931
2932 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2933         assert(f);
2934
2935         /* If we gained new header fields we gained new features,
2936          * hence suggest a rotation */
2937         if (le64toh(f->header->header_size) < sizeof(Header)) {
2938                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2939                 return true;
2940         }
2941
2942         /* Let's check if the hash tables grew over a certain fill
2943          * level (75%, borrowing this value from Java's hash table
2944          * implementation), and if so suggest a rotation. To calculate
2945          * the fill level we need the n_data field, which only exists
2946          * in newer versions. */
2947
2948         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2949                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2950                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2951                                   f->path,
2952                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2953                                   le64toh(f->header->n_data),
2954                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2955                                   (unsigned long long) f->last_stat.st_size,
2956                                   f->last_stat.st_size / le64toh(f->header->n_data));
2957                         return true;
2958                 }
2959
2960         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2961                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2962                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2963                                   f->path,
2964                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2965                                   le64toh(f->header->n_fields),
2966                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2967                         return true;
2968                 }
2969
2970         /* Are the data objects properly indexed by field objects? */
2971         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2972             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2973             le64toh(f->header->n_data) > 0 &&
2974             le64toh(f->header->n_fields) == 0)
2975                 return true;
2976
2977         if (max_file_usec > 0) {
2978                 usec_t t, h;
2979
2980                 h = le64toh(f->header->head_entry_realtime);
2981                 t = now(CLOCK_REALTIME);
2982
2983                 if (h > 0 && t > h + max_file_usec)
2984                         return true;
2985         }
2986
2987         return false;
2988 }