chiark / gitweb /
util: replace close_nointr_nofail() by a more useful safe_close()
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #ifdef HAVE_XATTR
31 #include <attr/xattr.h>
32 #endif
33
34 #include "journal-def.h"
35 #include "journal-file.h"
36 #include "journal-authenticate.h"
37 #include "lookup3.h"
38 #include "compress.h"
39 #include "fsprg.h"
40
41 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
42 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
43
44 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
45
46 /* This is the minimum journal file size */
47 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL)           /* 4 MiB */
48
49 /* These are the lower and upper bounds if we deduce the max_use value
50  * from the file system size */
51 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
52 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */
53
54 /* This is the upper bound if we deduce max_size from max_use */
55 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */
56
57 /* This is the upper bound if we deduce the keep_free value from the
58  * file system size */
59 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
60
61 /* This is the keep_free value when we can't determine the system
62  * size */
63 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MB */
64
65 /* n_data was the first entry we added after the initial file format design */
66 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
67
68 /* How many entries to keep in the entry array chain cache at max */
69 #define CHAIN_CACHE_MAX 20
70
71 /* How much to increase the journal file size at once each time we allocate something new. */
72 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8MB */
73
74 static int journal_file_set_online(JournalFile *f) {
75         assert(f);
76
77         if (!f->writable)
78                 return -EPERM;
79
80         if (!(f->fd >= 0 && f->header))
81                 return -EINVAL;
82
83         switch(f->header->state) {
84                 case STATE_ONLINE:
85                         return 0;
86
87                 case STATE_OFFLINE:
88                         f->header->state = STATE_ONLINE;
89                         fsync(f->fd);
90                         return 0;
91
92                 default:
93                         return -EINVAL;
94         }
95 }
96
97 int journal_file_set_offline(JournalFile *f) {
98         assert(f);
99
100         if (!f->writable)
101                 return -EPERM;
102
103         if (!(f->fd >= 0 && f->header))
104                 return -EINVAL;
105
106         if (f->header->state != STATE_ONLINE)
107                 return 0;
108
109         fsync(f->fd);
110
111         f->header->state = STATE_OFFLINE;
112
113         fsync(f->fd);
114
115         return 0;
116 }
117
118 void journal_file_close(JournalFile *f) {
119         assert(f);
120
121 #ifdef HAVE_GCRYPT
122         /* Write the final tag */
123         if (f->seal && f->writable)
124                 journal_file_append_tag(f);
125 #endif
126
127         /* Sync everything to disk, before we mark the file offline */
128         if (f->mmap && f->fd >= 0)
129                 mmap_cache_close_fd(f->mmap, f->fd);
130
131         journal_file_set_offline(f);
132
133         if (f->header)
134                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
135
136         safe_close(f->fd);
137         free(f->path);
138
139         if (f->mmap)
140                 mmap_cache_unref(f->mmap);
141
142         hashmap_free_free(f->chain_cache);
143
144 #ifdef HAVE_XZ
145         free(f->compress_buffer);
146 #endif
147
148 #ifdef HAVE_GCRYPT
149         if (f->fss_file)
150                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
151         else if (f->fsprg_state)
152                 free(f->fsprg_state);
153
154         free(f->fsprg_seed);
155
156         if (f->hmac)
157                 gcry_md_close(f->hmac);
158 #endif
159
160         free(f);
161 }
162
163 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
164         Header h;
165         ssize_t k;
166         int r;
167
168         assert(f);
169
170         zero(h);
171         memcpy(h.signature, HEADER_SIGNATURE, 8);
172         h.header_size = htole64(ALIGN64(sizeof(h)));
173
174         h.incompatible_flags =
175                 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
176
177         h.compatible_flags =
178                 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
179
180         r = sd_id128_randomize(&h.file_id);
181         if (r < 0)
182                 return r;
183
184         if (template) {
185                 h.seqnum_id = template->header->seqnum_id;
186                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
187         } else
188                 h.seqnum_id = h.file_id;
189
190         k = pwrite(f->fd, &h, sizeof(h), 0);
191         if (k < 0)
192                 return -errno;
193
194         if (k != sizeof(h))
195                 return -EIO;
196
197         return 0;
198 }
199
200 static int journal_file_refresh_header(JournalFile *f) {
201         int r;
202         sd_id128_t boot_id;
203
204         assert(f);
205
206         r = sd_id128_get_machine(&f->header->machine_id);
207         if (r < 0)
208                 return r;
209
210         r = sd_id128_get_boot(&boot_id);
211         if (r < 0)
212                 return r;
213
214         if (sd_id128_equal(boot_id, f->header->boot_id))
215                 f->tail_entry_monotonic_valid = true;
216
217         f->header->boot_id = boot_id;
218
219         journal_file_set_online(f);
220
221         /* Sync the online state to disk */
222         fsync(f->fd);
223
224         return 0;
225 }
226
227 static int journal_file_verify_header(JournalFile *f) {
228         assert(f);
229
230         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
231                 return -EBADMSG;
232
233         /* In both read and write mode we refuse to open files with
234          * incompatible flags we don't know */
235 #ifdef HAVE_XZ
236         if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
237                 return -EPROTONOSUPPORT;
238 #else
239         if (f->header->incompatible_flags != 0)
240                 return -EPROTONOSUPPORT;
241 #endif
242
243         /* When open for writing we refuse to open files with
244          * compatible flags, too */
245         if (f->writable) {
246 #ifdef HAVE_GCRYPT
247                 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
248                         return -EPROTONOSUPPORT;
249 #else
250                 if (f->header->compatible_flags != 0)
251                         return -EPROTONOSUPPORT;
252 #endif
253         }
254
255         if (f->header->state >= _STATE_MAX)
256                 return -EBADMSG;
257
258         /* The first addition was n_data, so check that we are at least this large */
259         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
260                 return -EBADMSG;
261
262         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
263                 return -EBADMSG;
264
265         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
266                 return -ENODATA;
267
268         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
269                 return -ENODATA;
270
271         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
272             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
273             !VALID64(le64toh(f->header->tail_object_offset)) ||
274             !VALID64(le64toh(f->header->entry_array_offset)))
275                 return -ENODATA;
276
277         if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
278             le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
279             le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
280             le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
281                 return -ENODATA;
282
283         if (f->writable) {
284                 uint8_t state;
285                 sd_id128_t machine_id;
286                 int r;
287
288                 r = sd_id128_get_machine(&machine_id);
289                 if (r < 0)
290                         return r;
291
292                 if (!sd_id128_equal(machine_id, f->header->machine_id))
293                         return -EHOSTDOWN;
294
295                 state = f->header->state;
296
297                 if (state == STATE_ONLINE) {
298                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
299                         return -EBUSY;
300                 } else if (state == STATE_ARCHIVED)
301                         return -ESHUTDOWN;
302                 else if (state != STATE_OFFLINE) {
303                         log_debug("Journal file %s has unknown state %u.", f->path, state);
304                         return -EBUSY;
305                 }
306         }
307
308         f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
309
310         f->seal = JOURNAL_HEADER_SEALED(f->header);
311
312         return 0;
313 }
314
315 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
316         uint64_t old_size, new_size;
317         int r;
318
319         assert(f);
320
321         /* We assume that this file is not sparse, and we know that
322          * for sure, since we always call posix_fallocate()
323          * ourselves */
324
325         old_size =
326                 le64toh(f->header->header_size) +
327                 le64toh(f->header->arena_size);
328
329         new_size = PAGE_ALIGN(offset + size);
330         if (new_size < le64toh(f->header->header_size))
331                 new_size = le64toh(f->header->header_size);
332
333         if (new_size <= old_size)
334                 return 0;
335
336         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
337                 return -E2BIG;
338
339         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
340                 struct statvfs svfs;
341
342                 if (fstatvfs(f->fd, &svfs) >= 0) {
343                         uint64_t available;
344
345                         available = svfs.f_bfree * svfs.f_bsize;
346
347                         if (available >= f->metrics.keep_free)
348                                 available -= f->metrics.keep_free;
349                         else
350                                 available = 0;
351
352                         if (new_size - old_size > available)
353                                 return -E2BIG;
354                 }
355         }
356
357         /* Increase by larger blocks at once */
358         new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
359         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
360                 new_size = f->metrics.max_size;
361
362         /* Note that the glibc fallocate() fallback is very
363            inefficient, hence we try to minimize the allocation area
364            as we can. */
365         r = posix_fallocate(f->fd, old_size, new_size - old_size);
366         if (r != 0)
367                 return -r;
368
369         if (fstat(f->fd, &f->last_stat) < 0)
370                 return -errno;
371
372         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
373
374         return 0;
375 }
376
377 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
378         assert(f);
379         assert(ret);
380
381         if (size <= 0)
382                 return -EINVAL;
383
384         /* Avoid SIGBUS on invalid accesses */
385         if (offset + size > (uint64_t) f->last_stat.st_size) {
386                 /* Hmm, out of range? Let's refresh the fstat() data
387                  * first, before we trust that check. */
388
389                 if (fstat(f->fd, &f->last_stat) < 0 ||
390                     offset + size > (uint64_t) f->last_stat.st_size)
391                         return -EADDRNOTAVAIL;
392         }
393
394         return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
395 }
396
397 static uint64_t minimum_header_size(Object *o) {
398
399         static const uint64_t table[] = {
400                 [OBJECT_DATA] = sizeof(DataObject),
401                 [OBJECT_FIELD] = sizeof(FieldObject),
402                 [OBJECT_ENTRY] = sizeof(EntryObject),
403                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
404                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
405                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
406                 [OBJECT_TAG] = sizeof(TagObject),
407         };
408
409         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
410                 return sizeof(ObjectHeader);
411
412         return table[o->object.type];
413 }
414
415 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
416         int r;
417         void *t;
418         Object *o;
419         uint64_t s;
420
421         assert(f);
422         assert(ret);
423
424         /* Objects may only be located at multiple of 64 bit */
425         if (!VALID64(offset))
426                 return -EFAULT;
427
428
429         r = journal_file_move_to(f, type_to_context(type), false, offset, sizeof(ObjectHeader), &t);
430         if (r < 0)
431                 return r;
432
433         o = (Object*) t;
434         s = le64toh(o->object.size);
435
436         if (s < sizeof(ObjectHeader))
437                 return -EBADMSG;
438
439         if (o->object.type <= OBJECT_UNUSED)
440                 return -EBADMSG;
441
442         if (s < minimum_header_size(o))
443                 return -EBADMSG;
444
445         if (type > 0 && o->object.type != type)
446                 return -EBADMSG;
447
448         if (s > sizeof(ObjectHeader)) {
449                 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
450                 if (r < 0)
451                         return r;
452
453                 o = (Object*) t;
454         }
455
456         *ret = o;
457         return 0;
458 }
459
460 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
461         uint64_t r;
462
463         assert(f);
464
465         r = le64toh(f->header->tail_entry_seqnum) + 1;
466
467         if (seqnum) {
468                 /* If an external seqnum counter was passed, we update
469                  * both the local and the external one, and set it to
470                  * the maximum of both */
471
472                 if (*seqnum + 1 > r)
473                         r = *seqnum + 1;
474
475                 *seqnum = r;
476         }
477
478         f->header->tail_entry_seqnum = htole64(r);
479
480         if (f->header->head_entry_seqnum == 0)
481                 f->header->head_entry_seqnum = htole64(r);
482
483         return r;
484 }
485
486 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
487         int r;
488         uint64_t p;
489         Object *tail, *o;
490         void *t;
491
492         assert(f);
493         assert(type > 0 && type < _OBJECT_TYPE_MAX);
494         assert(size >= sizeof(ObjectHeader));
495         assert(offset);
496         assert(ret);
497
498         r = journal_file_set_online(f);
499         if (r < 0)
500                 return r;
501
502         p = le64toh(f->header->tail_object_offset);
503         if (p == 0)
504                 p = le64toh(f->header->header_size);
505         else {
506                 r = journal_file_move_to_object(f, -1, p, &tail);
507                 if (r < 0)
508                         return r;
509
510                 p += ALIGN64(le64toh(tail->object.size));
511         }
512
513         r = journal_file_allocate(f, p, size);
514         if (r < 0)
515                 return r;
516
517         r = journal_file_move_to(f, type, false, p, size, &t);
518         if (r < 0)
519                 return r;
520
521         o = (Object*) t;
522
523         zero(o->object);
524         o->object.type = type;
525         o->object.size = htole64(size);
526
527         f->header->tail_object_offset = htole64(p);
528         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
529
530         *ret = o;
531         *offset = p;
532
533         return 0;
534 }
535
536 static int journal_file_setup_data_hash_table(JournalFile *f) {
537         uint64_t s, p;
538         Object *o;
539         int r;
540
541         assert(f);
542
543         /* We estimate that we need 1 hash table entry per 768 of
544            journal file and we want to make sure we never get beyond
545            75% fill level. Calculate the hash table size for the
546            maximum file size based on these metrics. */
547
548         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
549         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
550                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
551
552         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
553
554         r = journal_file_append_object(f,
555                                        OBJECT_DATA_HASH_TABLE,
556                                        offsetof(Object, hash_table.items) + s,
557                                        &o, &p);
558         if (r < 0)
559                 return r;
560
561         memzero(o->hash_table.items, s);
562
563         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
564         f->header->data_hash_table_size = htole64(s);
565
566         return 0;
567 }
568
569 static int journal_file_setup_field_hash_table(JournalFile *f) {
570         uint64_t s, p;
571         Object *o;
572         int r;
573
574         assert(f);
575
576         /* We use a fixed size hash table for the fields as this
577          * number should grow very slowly only */
578
579         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
580         r = journal_file_append_object(f,
581                                        OBJECT_FIELD_HASH_TABLE,
582                                        offsetof(Object, hash_table.items) + s,
583                                        &o, &p);
584         if (r < 0)
585                 return r;
586
587         memzero(o->hash_table.items, s);
588
589         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
590         f->header->field_hash_table_size = htole64(s);
591
592         return 0;
593 }
594
595 static int journal_file_map_data_hash_table(JournalFile *f) {
596         uint64_t s, p;
597         void *t;
598         int r;
599
600         assert(f);
601
602         p = le64toh(f->header->data_hash_table_offset);
603         s = le64toh(f->header->data_hash_table_size);
604
605         r = journal_file_move_to(f,
606                                  OBJECT_DATA_HASH_TABLE,
607                                  true,
608                                  p, s,
609                                  &t);
610         if (r < 0)
611                 return r;
612
613         f->data_hash_table = t;
614         return 0;
615 }
616
617 static int journal_file_map_field_hash_table(JournalFile *f) {
618         uint64_t s, p;
619         void *t;
620         int r;
621
622         assert(f);
623
624         p = le64toh(f->header->field_hash_table_offset);
625         s = le64toh(f->header->field_hash_table_size);
626
627         r = journal_file_move_to(f,
628                                  OBJECT_FIELD_HASH_TABLE,
629                                  true,
630                                  p, s,
631                                  &t);
632         if (r < 0)
633                 return r;
634
635         f->field_hash_table = t;
636         return 0;
637 }
638
639 static int journal_file_link_field(
640                 JournalFile *f,
641                 Object *o,
642                 uint64_t offset,
643                 uint64_t hash) {
644
645         uint64_t p, h;
646         int r;
647
648         assert(f);
649         assert(o);
650         assert(offset > 0);
651
652         if (o->object.type != OBJECT_FIELD)
653                 return -EINVAL;
654
655         /* This might alter the window we are looking at */
656
657         o->field.next_hash_offset = o->field.head_data_offset = 0;
658
659         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
660         p = le64toh(f->field_hash_table[h].tail_hash_offset);
661         if (p == 0)
662                 f->field_hash_table[h].head_hash_offset = htole64(offset);
663         else {
664                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
665                 if (r < 0)
666                         return r;
667
668                 o->field.next_hash_offset = htole64(offset);
669         }
670
671         f->field_hash_table[h].tail_hash_offset = htole64(offset);
672
673         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
674                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
675
676         return 0;
677 }
678
679 static int journal_file_link_data(
680                 JournalFile *f,
681                 Object *o,
682                 uint64_t offset,
683                 uint64_t hash) {
684
685         uint64_t p, h;
686         int r;
687
688         assert(f);
689         assert(o);
690         assert(offset > 0);
691
692         if (o->object.type != OBJECT_DATA)
693                 return -EINVAL;
694
695         /* This might alter the window we are looking at */
696
697         o->data.next_hash_offset = o->data.next_field_offset = 0;
698         o->data.entry_offset = o->data.entry_array_offset = 0;
699         o->data.n_entries = 0;
700
701         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
702         p = le64toh(f->data_hash_table[h].tail_hash_offset);
703         if (p == 0)
704                 /* Only entry in the hash table is easy */
705                 f->data_hash_table[h].head_hash_offset = htole64(offset);
706         else {
707                 /* Move back to the previous data object, to patch in
708                  * pointer */
709
710                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
711                 if (r < 0)
712                         return r;
713
714                 o->data.next_hash_offset = htole64(offset);
715         }
716
717         f->data_hash_table[h].tail_hash_offset = htole64(offset);
718
719         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
720                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
721
722         return 0;
723 }
724
725 int journal_file_find_field_object_with_hash(
726                 JournalFile *f,
727                 const void *field, uint64_t size, uint64_t hash,
728                 Object **ret, uint64_t *offset) {
729
730         uint64_t p, osize, h;
731         int r;
732
733         assert(f);
734         assert(field && size > 0);
735
736         osize = offsetof(Object, field.payload) + size;
737
738         if (f->header->field_hash_table_size == 0)
739                 return -EBADMSG;
740
741         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
742         p = le64toh(f->field_hash_table[h].head_hash_offset);
743
744         while (p > 0) {
745                 Object *o;
746
747                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
748                 if (r < 0)
749                         return r;
750
751                 if (le64toh(o->field.hash) == hash &&
752                     le64toh(o->object.size) == osize &&
753                     memcmp(o->field.payload, field, size) == 0) {
754
755                         if (ret)
756                                 *ret = o;
757                         if (offset)
758                                 *offset = p;
759
760                         return 1;
761                 }
762
763                 p = le64toh(o->field.next_hash_offset);
764         }
765
766         return 0;
767 }
768
769 int journal_file_find_field_object(
770                 JournalFile *f,
771                 const void *field, uint64_t size,
772                 Object **ret, uint64_t *offset) {
773
774         uint64_t hash;
775
776         assert(f);
777         assert(field && size > 0);
778
779         hash = hash64(field, size);
780
781         return journal_file_find_field_object_with_hash(f,
782                                                         field, size, hash,
783                                                         ret, offset);
784 }
785
786 int journal_file_find_data_object_with_hash(
787                 JournalFile *f,
788                 const void *data, uint64_t size, uint64_t hash,
789                 Object **ret, uint64_t *offset) {
790
791         uint64_t p, osize, h;
792         int r;
793
794         assert(f);
795         assert(data || size == 0);
796
797         osize = offsetof(Object, data.payload) + size;
798
799         if (f->header->data_hash_table_size == 0)
800                 return -EBADMSG;
801
802         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
803         p = le64toh(f->data_hash_table[h].head_hash_offset);
804
805         while (p > 0) {
806                 Object *o;
807
808                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
809                 if (r < 0)
810                         return r;
811
812                 if (le64toh(o->data.hash) != hash)
813                         goto next;
814
815                 if (o->object.flags & OBJECT_COMPRESSED) {
816 #ifdef HAVE_XZ
817                         uint64_t l, rsize;
818
819                         l = le64toh(o->object.size);
820                         if (l <= offsetof(Object, data.payload))
821                                 return -EBADMSG;
822
823                         l -= offsetof(Object, data.payload);
824
825                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0))
826                                 return -EBADMSG;
827
828                         if (rsize == size &&
829                             memcmp(f->compress_buffer, data, size) == 0) {
830
831                                 if (ret)
832                                         *ret = o;
833
834                                 if (offset)
835                                         *offset = p;
836
837                                 return 1;
838                         }
839 #else
840                         return -EPROTONOSUPPORT;
841 #endif
842
843                 } else if (le64toh(o->object.size) == osize &&
844                            memcmp(o->data.payload, data, size) == 0) {
845
846                         if (ret)
847                                 *ret = o;
848
849                         if (offset)
850                                 *offset = p;
851
852                         return 1;
853                 }
854
855         next:
856                 p = le64toh(o->data.next_hash_offset);
857         }
858
859         return 0;
860 }
861
862 int journal_file_find_data_object(
863                 JournalFile *f,
864                 const void *data, uint64_t size,
865                 Object **ret, uint64_t *offset) {
866
867         uint64_t hash;
868
869         assert(f);
870         assert(data || size == 0);
871
872         hash = hash64(data, size);
873
874         return journal_file_find_data_object_with_hash(f,
875                                                        data, size, hash,
876                                                        ret, offset);
877 }
878
879 static int journal_file_append_field(
880                 JournalFile *f,
881                 const void *field, uint64_t size,
882                 Object **ret, uint64_t *offset) {
883
884         uint64_t hash, p;
885         uint64_t osize;
886         Object *o;
887         int r;
888
889         assert(f);
890         assert(field && size > 0);
891
892         hash = hash64(field, size);
893
894         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
895         if (r < 0)
896                 return r;
897         else if (r > 0) {
898
899                 if (ret)
900                         *ret = o;
901
902                 if (offset)
903                         *offset = p;
904
905                 return 0;
906         }
907
908         osize = offsetof(Object, field.payload) + size;
909         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
910         if (r < 0)
911                 return r;
912
913         o->field.hash = htole64(hash);
914         memcpy(o->field.payload, field, size);
915
916         r = journal_file_link_field(f, o, p, hash);
917         if (r < 0)
918                 return r;
919
920         /* The linking might have altered the window, so let's
921          * refresh our pointer */
922         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
923         if (r < 0)
924                 return r;
925
926 #ifdef HAVE_GCRYPT
927         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
928         if (r < 0)
929                 return r;
930 #endif
931
932         if (ret)
933                 *ret = o;
934
935         if (offset)
936                 *offset = p;
937
938         return 0;
939 }
940
941 static int journal_file_append_data(
942                 JournalFile *f,
943                 const void *data, uint64_t size,
944                 Object **ret, uint64_t *offset) {
945
946         uint64_t hash, p;
947         uint64_t osize;
948         Object *o;
949         int r;
950         bool compressed = false;
951         const void *eq;
952
953         assert(f);
954         assert(data || size == 0);
955
956         hash = hash64(data, size);
957
958         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
959         if (r < 0)
960                 return r;
961         else if (r > 0) {
962
963                 if (ret)
964                         *ret = o;
965
966                 if (offset)
967                         *offset = p;
968
969                 return 0;
970         }
971
972         osize = offsetof(Object, data.payload) + size;
973         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
974         if (r < 0)
975                 return r;
976
977         o->data.hash = htole64(hash);
978
979 #ifdef HAVE_XZ
980         if (f->compress &&
981             size >= COMPRESSION_SIZE_THRESHOLD) {
982                 uint64_t rsize;
983
984                 compressed = compress_blob(data, size, o->data.payload, &rsize);
985
986                 if (compressed) {
987                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
988                         o->object.flags |= OBJECT_COMPRESSED;
989
990                         log_debug("Compressed data object %"PRIu64" -> %"PRIu64, size, rsize);
991                 }
992         }
993 #endif
994
995         if (!compressed && size > 0)
996                 memcpy(o->data.payload, data, size);
997
998         r = journal_file_link_data(f, o, p, hash);
999         if (r < 0)
1000                 return r;
1001
1002         /* The linking might have altered the window, so let's
1003          * refresh our pointer */
1004         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1005         if (r < 0)
1006                 return r;
1007
1008         if (!data)
1009                 eq = NULL;
1010         else
1011                 eq = memchr(data, '=', size);
1012         if (eq && eq > data) {
1013                 Object *fo = NULL;
1014                 uint64_t fp;
1015
1016                 /* Create field object ... */
1017                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1018                 if (r < 0)
1019                         return r;
1020
1021                 /* ... and link it in. */
1022                 o->data.next_field_offset = fo->field.head_data_offset;
1023                 fo->field.head_data_offset = le64toh(p);
1024         }
1025
1026 #ifdef HAVE_GCRYPT
1027         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1028         if (r < 0)
1029                 return r;
1030 #endif
1031
1032         if (ret)
1033                 *ret = o;
1034
1035         if (offset)
1036                 *offset = p;
1037
1038         return 0;
1039 }
1040
1041 uint64_t journal_file_entry_n_items(Object *o) {
1042         assert(o);
1043
1044         if (o->object.type != OBJECT_ENTRY)
1045                 return 0;
1046
1047         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1048 }
1049
1050 uint64_t journal_file_entry_array_n_items(Object *o) {
1051         assert(o);
1052
1053         if (o->object.type != OBJECT_ENTRY_ARRAY)
1054                 return 0;
1055
1056         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1057 }
1058
1059 uint64_t journal_file_hash_table_n_items(Object *o) {
1060         assert(o);
1061
1062         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1063             o->object.type != OBJECT_FIELD_HASH_TABLE)
1064                 return 0;
1065
1066         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1067 }
1068
1069 static int link_entry_into_array(JournalFile *f,
1070                                  le64_t *first,
1071                                  le64_t *idx,
1072                                  uint64_t p) {
1073         int r;
1074         uint64_t n = 0, ap = 0, q, i, a, hidx;
1075         Object *o;
1076
1077         assert(f);
1078         assert(first);
1079         assert(idx);
1080         assert(p > 0);
1081
1082         a = le64toh(*first);
1083         i = hidx = le64toh(*idx);
1084         while (a > 0) {
1085
1086                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1087                 if (r < 0)
1088                         return r;
1089
1090                 n = journal_file_entry_array_n_items(o);
1091                 if (i < n) {
1092                         o->entry_array.items[i] = htole64(p);
1093                         *idx = htole64(hidx + 1);
1094                         return 0;
1095                 }
1096
1097                 i -= n;
1098                 ap = a;
1099                 a = le64toh(o->entry_array.next_entry_array_offset);
1100         }
1101
1102         if (hidx > n)
1103                 n = (hidx+1) * 2;
1104         else
1105                 n = n * 2;
1106
1107         if (n < 4)
1108                 n = 4;
1109
1110         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1111                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1112                                        &o, &q);
1113         if (r < 0)
1114                 return r;
1115
1116 #ifdef HAVE_GCRYPT
1117         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1118         if (r < 0)
1119                 return r;
1120 #endif
1121
1122         o->entry_array.items[i] = htole64(p);
1123
1124         if (ap == 0)
1125                 *first = htole64(q);
1126         else {
1127                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1128                 if (r < 0)
1129                         return r;
1130
1131                 o->entry_array.next_entry_array_offset = htole64(q);
1132         }
1133
1134         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1135                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1136
1137         *idx = htole64(hidx + 1);
1138
1139         return 0;
1140 }
1141
1142 static int link_entry_into_array_plus_one(JournalFile *f,
1143                                           le64_t *extra,
1144                                           le64_t *first,
1145                                           le64_t *idx,
1146                                           uint64_t p) {
1147
1148         int r;
1149
1150         assert(f);
1151         assert(extra);
1152         assert(first);
1153         assert(idx);
1154         assert(p > 0);
1155
1156         if (*idx == 0)
1157                 *extra = htole64(p);
1158         else {
1159                 le64_t i;
1160
1161                 i = htole64(le64toh(*idx) - 1);
1162                 r = link_entry_into_array(f, first, &i, p);
1163                 if (r < 0)
1164                         return r;
1165         }
1166
1167         *idx = htole64(le64toh(*idx) + 1);
1168         return 0;
1169 }
1170
1171 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1172         uint64_t p;
1173         int r;
1174         assert(f);
1175         assert(o);
1176         assert(offset > 0);
1177
1178         p = le64toh(o->entry.items[i].object_offset);
1179         if (p == 0)
1180                 return -EINVAL;
1181
1182         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1183         if (r < 0)
1184                 return r;
1185
1186         return link_entry_into_array_plus_one(f,
1187                                               &o->data.entry_offset,
1188                                               &o->data.entry_array_offset,
1189                                               &o->data.n_entries,
1190                                               offset);
1191 }
1192
1193 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1194         uint64_t n, i;
1195         int r;
1196
1197         assert(f);
1198         assert(o);
1199         assert(offset > 0);
1200
1201         if (o->object.type != OBJECT_ENTRY)
1202                 return -EINVAL;
1203
1204         __sync_synchronize();
1205
1206         /* Link up the entry itself */
1207         r = link_entry_into_array(f,
1208                                   &f->header->entry_array_offset,
1209                                   &f->header->n_entries,
1210                                   offset);
1211         if (r < 0)
1212                 return r;
1213
1214         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1215
1216         if (f->header->head_entry_realtime == 0)
1217                 f->header->head_entry_realtime = o->entry.realtime;
1218
1219         f->header->tail_entry_realtime = o->entry.realtime;
1220         f->header->tail_entry_monotonic = o->entry.monotonic;
1221
1222         f->tail_entry_monotonic_valid = true;
1223
1224         /* Link up the items */
1225         n = journal_file_entry_n_items(o);
1226         for (i = 0; i < n; i++) {
1227                 r = journal_file_link_entry_item(f, o, offset, i);
1228                 if (r < 0)
1229                         return r;
1230         }
1231
1232         return 0;
1233 }
1234
1235 static int journal_file_append_entry_internal(
1236                 JournalFile *f,
1237                 const dual_timestamp *ts,
1238                 uint64_t xor_hash,
1239                 const EntryItem items[], unsigned n_items,
1240                 uint64_t *seqnum,
1241                 Object **ret, uint64_t *offset) {
1242         uint64_t np;
1243         uint64_t osize;
1244         Object *o;
1245         int r;
1246
1247         assert(f);
1248         assert(items || n_items == 0);
1249         assert(ts);
1250
1251         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1252
1253         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1254         if (r < 0)
1255                 return r;
1256
1257         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1258         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1259         o->entry.realtime = htole64(ts->realtime);
1260         o->entry.monotonic = htole64(ts->monotonic);
1261         o->entry.xor_hash = htole64(xor_hash);
1262         o->entry.boot_id = f->header->boot_id;
1263
1264 #ifdef HAVE_GCRYPT
1265         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1266         if (r < 0)
1267                 return r;
1268 #endif
1269
1270         r = journal_file_link_entry(f, o, np);
1271         if (r < 0)
1272                 return r;
1273
1274         if (ret)
1275                 *ret = o;
1276
1277         if (offset)
1278                 *offset = np;
1279
1280         return 0;
1281 }
1282
1283 void journal_file_post_change(JournalFile *f) {
1284         assert(f);
1285
1286         /* inotify() does not receive IN_MODIFY events from file
1287          * accesses done via mmap(). After each access we hence
1288          * trigger IN_MODIFY by truncating the journal file to its
1289          * current size which triggers IN_MODIFY. */
1290
1291         __sync_synchronize();
1292
1293         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1294                 log_error("Failed to truncate file to its own size: %m");
1295 }
1296
1297 static int entry_item_cmp(const void *_a, const void *_b) {
1298         const EntryItem *a = _a, *b = _b;
1299
1300         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1301                 return -1;
1302         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1303                 return 1;
1304         return 0;
1305 }
1306
1307 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1308         unsigned i;
1309         EntryItem *items;
1310         int r;
1311         uint64_t xor_hash = 0;
1312         struct dual_timestamp _ts;
1313
1314         assert(f);
1315         assert(iovec || n_iovec == 0);
1316
1317         if (!ts) {
1318                 dual_timestamp_get(&_ts);
1319                 ts = &_ts;
1320         }
1321
1322         if (f->tail_entry_monotonic_valid &&
1323             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1324                 return -EINVAL;
1325
1326 #ifdef HAVE_GCRYPT
1327         r = journal_file_maybe_append_tag(f, ts->realtime);
1328         if (r < 0)
1329                 return r;
1330 #endif
1331
1332         /* alloca() can't take 0, hence let's allocate at least one */
1333         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1334
1335         for (i = 0; i < n_iovec; i++) {
1336                 uint64_t p;
1337                 Object *o;
1338
1339                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1340                 if (r < 0)
1341                         return r;
1342
1343                 xor_hash ^= le64toh(o->data.hash);
1344                 items[i].object_offset = htole64(p);
1345                 items[i].hash = o->data.hash;
1346         }
1347
1348         /* Order by the position on disk, in order to improve seek
1349          * times for rotating media. */
1350         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1351
1352         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1353
1354         journal_file_post_change(f);
1355
1356         return r;
1357 }
1358
1359 typedef struct ChainCacheItem {
1360         uint64_t first; /* the array at the beginning of the chain */
1361         uint64_t array; /* the cached array */
1362         uint64_t begin; /* the first item in the cached array */
1363         uint64_t total; /* the total number of items in all arrays before this one in the chain */
1364         uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
1365 } ChainCacheItem;
1366
1367 static void chain_cache_put(
1368                 Hashmap *h,
1369                 ChainCacheItem *ci,
1370                 uint64_t first,
1371                 uint64_t array,
1372                 uint64_t begin,
1373                 uint64_t total,
1374                 uint64_t last_index) {
1375
1376         if (!ci) {
1377                 /* If the chain item to cache for this chain is the
1378                  * first one it's not worth caching anything */
1379                 if (array == first)
1380                         return;
1381
1382                 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1383                         ci = hashmap_steal_first(h);
1384                 else {
1385                         ci = new(ChainCacheItem, 1);
1386                         if (!ci)
1387                                 return;
1388                 }
1389
1390                 ci->first = first;
1391
1392                 if (hashmap_put(h, &ci->first, ci) < 0) {
1393                         free(ci);
1394                         return;
1395                 }
1396         } else
1397                 assert(ci->first == first);
1398
1399         ci->array = array;
1400         ci->begin = begin;
1401         ci->total = total;
1402         ci->last_index = last_index;
1403 }
1404
1405 static int generic_array_get(
1406                 JournalFile *f,
1407                 uint64_t first,
1408                 uint64_t i,
1409                 Object **ret, uint64_t *offset) {
1410
1411         Object *o;
1412         uint64_t p = 0, a, t = 0;
1413         int r;
1414         ChainCacheItem *ci;
1415
1416         assert(f);
1417
1418         a = first;
1419
1420         /* Try the chain cache first */
1421         ci = hashmap_get(f->chain_cache, &first);
1422         if (ci && i > ci->total) {
1423                 a = ci->array;
1424                 i -= ci->total;
1425                 t = ci->total;
1426         }
1427
1428         while (a > 0) {
1429                 uint64_t k;
1430
1431                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1432                 if (r < 0)
1433                         return r;
1434
1435                 k = journal_file_entry_array_n_items(o);
1436                 if (i < k) {
1437                         p = le64toh(o->entry_array.items[i]);
1438                         goto found;
1439                 }
1440
1441                 i -= k;
1442                 t += k;
1443                 a = le64toh(o->entry_array.next_entry_array_offset);
1444         }
1445
1446         return 0;
1447
1448 found:
1449         /* Let's cache this item for the next invocation */
1450         chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1451
1452         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1453         if (r < 0)
1454                 return r;
1455
1456         if (ret)
1457                 *ret = o;
1458
1459         if (offset)
1460                 *offset = p;
1461
1462         return 1;
1463 }
1464
1465 static int generic_array_get_plus_one(
1466                 JournalFile *f,
1467                 uint64_t extra,
1468                 uint64_t first,
1469                 uint64_t i,
1470                 Object **ret, uint64_t *offset) {
1471
1472         Object *o;
1473
1474         assert(f);
1475
1476         if (i == 0) {
1477                 int r;
1478
1479                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1480                 if (r < 0)
1481                         return r;
1482
1483                 if (ret)
1484                         *ret = o;
1485
1486                 if (offset)
1487                         *offset = extra;
1488
1489                 return 1;
1490         }
1491
1492         return generic_array_get(f, first, i-1, ret, offset);
1493 }
1494
1495 enum {
1496         TEST_FOUND,
1497         TEST_LEFT,
1498         TEST_RIGHT
1499 };
1500
1501 static int generic_array_bisect(
1502                 JournalFile *f,
1503                 uint64_t first,
1504                 uint64_t n,
1505                 uint64_t needle,
1506                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1507                 direction_t direction,
1508                 Object **ret,
1509                 uint64_t *offset,
1510                 uint64_t *idx) {
1511
1512         uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1513         bool subtract_one = false;
1514         Object *o, *array = NULL;
1515         int r;
1516         ChainCacheItem *ci;
1517
1518         assert(f);
1519         assert(test_object);
1520
1521         /* Start with the first array in the chain */
1522         a = first;
1523
1524         ci = hashmap_get(f->chain_cache, &first);
1525         if (ci && n > ci->total) {
1526                 /* Ah, we have iterated this bisection array chain
1527                  * previously! Let's see if we can skip ahead in the
1528                  * chain, as far as the last time. But we can't jump
1529                  * backwards in the chain, so let's check that
1530                  * first. */
1531
1532                 r = test_object(f, ci->begin, needle);
1533                 if (r < 0)
1534                         return r;
1535
1536                 if (r == TEST_LEFT) {
1537                         /* OK, what we are looking for is right of the
1538                          * begin of this EntryArray, so let's jump
1539                          * straight to previously cached array in the
1540                          * chain */
1541
1542                         a = ci->array;
1543                         n -= ci->total;
1544                         t = ci->total;
1545                         last_index = ci->last_index;
1546                 }
1547         }
1548
1549         while (a > 0) {
1550                 uint64_t left, right, k, lp;
1551
1552                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1553                 if (r < 0)
1554                         return r;
1555
1556                 k = journal_file_entry_array_n_items(array);
1557                 right = MIN(k, n);
1558                 if (right <= 0)
1559                         return 0;
1560
1561                 i = right - 1;
1562                 lp = p = le64toh(array->entry_array.items[i]);
1563                 if (p <= 0)
1564                         return -EBADMSG;
1565
1566                 r = test_object(f, p, needle);
1567                 if (r < 0)
1568                         return r;
1569
1570                 if (r == TEST_FOUND)
1571                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1572
1573                 if (r == TEST_RIGHT) {
1574                         left = 0;
1575                         right -= 1;
1576
1577                         if (last_index != (uint64_t) -1) {
1578                                 assert(last_index <= right);
1579
1580                                 /* If we cached the last index we
1581                                  * looked at, let's try to not to jump
1582                                  * too wildly around and see if we can
1583                                  * limit the range to look at early to
1584                                  * the immediate neighbors of the last
1585                                  * index we looked at. */
1586
1587                                 if (last_index > 0) {
1588                                         uint64_t x = last_index - 1;
1589
1590                                         p = le64toh(array->entry_array.items[x]);
1591                                         if (p <= 0)
1592                                                 return -EBADMSG;
1593
1594                                         r = test_object(f, p, needle);
1595                                         if (r < 0)
1596                                                 return r;
1597
1598                                         if (r == TEST_FOUND)
1599                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1600
1601                                         if (r == TEST_RIGHT)
1602                                                 right = x;
1603                                         else
1604                                                 left = x + 1;
1605                                 }
1606
1607                                 if (last_index < right) {
1608                                         uint64_t y = last_index + 1;
1609
1610                                         p = le64toh(array->entry_array.items[y]);
1611                                         if (p <= 0)
1612                                                 return -EBADMSG;
1613
1614                                         r = test_object(f, p, needle);
1615                                         if (r < 0)
1616                                                 return r;
1617
1618                                         if (r == TEST_FOUND)
1619                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1620
1621                                         if (r == TEST_RIGHT)
1622                                                 right = y;
1623                                         else
1624                                                 left = y + 1;
1625                                 }
1626                         }
1627
1628                         for (;;) {
1629                                 if (left == right) {
1630                                         if (direction == DIRECTION_UP)
1631                                                 subtract_one = true;
1632
1633                                         i = left;
1634                                         goto found;
1635                                 }
1636
1637                                 assert(left < right);
1638                                 i = (left + right) / 2;
1639
1640                                 p = le64toh(array->entry_array.items[i]);
1641                                 if (p <= 0)
1642                                         return -EBADMSG;
1643
1644                                 r = test_object(f, p, needle);
1645                                 if (r < 0)
1646                                         return r;
1647
1648                                 if (r == TEST_FOUND)
1649                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1650
1651                                 if (r == TEST_RIGHT)
1652                                         right = i;
1653                                 else
1654                                         left = i + 1;
1655                         }
1656                 }
1657
1658                 if (k > n) {
1659                         if (direction == DIRECTION_UP) {
1660                                 i = n;
1661                                 subtract_one = true;
1662                                 goto found;
1663                         }
1664
1665                         return 0;
1666                 }
1667
1668                 last_p = lp;
1669
1670                 n -= k;
1671                 t += k;
1672                 last_index = (uint64_t) -1;
1673                 a = le64toh(array->entry_array.next_entry_array_offset);
1674         }
1675
1676         return 0;
1677
1678 found:
1679         if (subtract_one && t == 0 && i == 0)
1680                 return 0;
1681
1682         /* Let's cache this item for the next invocation */
1683         chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1684
1685         if (subtract_one && i == 0)
1686                 p = last_p;
1687         else if (subtract_one)
1688                 p = le64toh(array->entry_array.items[i-1]);
1689         else
1690                 p = le64toh(array->entry_array.items[i]);
1691
1692         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1693         if (r < 0)
1694                 return r;
1695
1696         if (ret)
1697                 *ret = o;
1698
1699         if (offset)
1700                 *offset = p;
1701
1702         if (idx)
1703                 *idx = t + i + (subtract_one ? -1 : 0);
1704
1705         return 1;
1706 }
1707
1708
1709 static int generic_array_bisect_plus_one(
1710                 JournalFile *f,
1711                 uint64_t extra,
1712                 uint64_t first,
1713                 uint64_t n,
1714                 uint64_t needle,
1715                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1716                 direction_t direction,
1717                 Object **ret,
1718                 uint64_t *offset,
1719                 uint64_t *idx) {
1720
1721         int r;
1722         bool step_back = false;
1723         Object *o;
1724
1725         assert(f);
1726         assert(test_object);
1727
1728         if (n <= 0)
1729                 return 0;
1730
1731         /* This bisects the array in object 'first', but first checks
1732          * an extra  */
1733         r = test_object(f, extra, needle);
1734         if (r < 0)
1735                 return r;
1736
1737         if (r == TEST_FOUND)
1738                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1739
1740         /* if we are looking with DIRECTION_UP then we need to first
1741            see if in the actual array there is a matching entry, and
1742            return the last one of that. But if there isn't any we need
1743            to return this one. Hence remember this, and return it
1744            below. */
1745         if (r == TEST_LEFT)
1746                 step_back = direction == DIRECTION_UP;
1747
1748         if (r == TEST_RIGHT) {
1749                 if (direction == DIRECTION_DOWN)
1750                         goto found;
1751                 else
1752                         return 0;
1753         }
1754
1755         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1756
1757         if (r == 0 && step_back)
1758                 goto found;
1759
1760         if (r > 0 && idx)
1761                 (*idx) ++;
1762
1763         return r;
1764
1765 found:
1766         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1767         if (r < 0)
1768                 return r;
1769
1770         if (ret)
1771                 *ret = o;
1772
1773         if (offset)
1774                 *offset = extra;
1775
1776         if (idx)
1777                 *idx = 0;
1778
1779         return 1;
1780 }
1781
1782 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1783         assert(f);
1784         assert(p > 0);
1785
1786         if (p == needle)
1787                 return TEST_FOUND;
1788         else if (p < needle)
1789                 return TEST_LEFT;
1790         else
1791                 return TEST_RIGHT;
1792 }
1793
1794 int journal_file_move_to_entry_by_offset(
1795                 JournalFile *f,
1796                 uint64_t p,
1797                 direction_t direction,
1798                 Object **ret,
1799                 uint64_t *offset) {
1800
1801         return generic_array_bisect(f,
1802                                     le64toh(f->header->entry_array_offset),
1803                                     le64toh(f->header->n_entries),
1804                                     p,
1805                                     test_object_offset,
1806                                     direction,
1807                                     ret, offset, NULL);
1808 }
1809
1810
1811 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1812         Object *o;
1813         int r;
1814
1815         assert(f);
1816         assert(p > 0);
1817
1818         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1819         if (r < 0)
1820                 return r;
1821
1822         if (le64toh(o->entry.seqnum) == needle)
1823                 return TEST_FOUND;
1824         else if (le64toh(o->entry.seqnum) < needle)
1825                 return TEST_LEFT;
1826         else
1827                 return TEST_RIGHT;
1828 }
1829
1830 int journal_file_move_to_entry_by_seqnum(
1831                 JournalFile *f,
1832                 uint64_t seqnum,
1833                 direction_t direction,
1834                 Object **ret,
1835                 uint64_t *offset) {
1836
1837         return generic_array_bisect(f,
1838                                     le64toh(f->header->entry_array_offset),
1839                                     le64toh(f->header->n_entries),
1840                                     seqnum,
1841                                     test_object_seqnum,
1842                                     direction,
1843                                     ret, offset, NULL);
1844 }
1845
1846 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1847         Object *o;
1848         int r;
1849
1850         assert(f);
1851         assert(p > 0);
1852
1853         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1854         if (r < 0)
1855                 return r;
1856
1857         if (le64toh(o->entry.realtime) == needle)
1858                 return TEST_FOUND;
1859         else if (le64toh(o->entry.realtime) < needle)
1860                 return TEST_LEFT;
1861         else
1862                 return TEST_RIGHT;
1863 }
1864
1865 int journal_file_move_to_entry_by_realtime(
1866                 JournalFile *f,
1867                 uint64_t realtime,
1868                 direction_t direction,
1869                 Object **ret,
1870                 uint64_t *offset) {
1871
1872         return generic_array_bisect(f,
1873                                     le64toh(f->header->entry_array_offset),
1874                                     le64toh(f->header->n_entries),
1875                                     realtime,
1876                                     test_object_realtime,
1877                                     direction,
1878                                     ret, offset, NULL);
1879 }
1880
1881 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1882         Object *o;
1883         int r;
1884
1885         assert(f);
1886         assert(p > 0);
1887
1888         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1889         if (r < 0)
1890                 return r;
1891
1892         if (le64toh(o->entry.monotonic) == needle)
1893                 return TEST_FOUND;
1894         else if (le64toh(o->entry.monotonic) < needle)
1895                 return TEST_LEFT;
1896         else
1897                 return TEST_RIGHT;
1898 }
1899
1900 static inline int find_data_object_by_boot_id(
1901                 JournalFile *f,
1902                 sd_id128_t boot_id,
1903                 Object **o,
1904                 uint64_t *b) {
1905         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1906
1907         sd_id128_to_string(boot_id, t + 9);
1908         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1909 }
1910
1911 int journal_file_move_to_entry_by_monotonic(
1912                 JournalFile *f,
1913                 sd_id128_t boot_id,
1914                 uint64_t monotonic,
1915                 direction_t direction,
1916                 Object **ret,
1917                 uint64_t *offset) {
1918
1919         Object *o;
1920         int r;
1921
1922         assert(f);
1923
1924         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1925         if (r < 0)
1926                 return r;
1927         if (r == 0)
1928                 return -ENOENT;
1929
1930         return generic_array_bisect_plus_one(f,
1931                                              le64toh(o->data.entry_offset),
1932                                              le64toh(o->data.entry_array_offset),
1933                                              le64toh(o->data.n_entries),
1934                                              monotonic,
1935                                              test_object_monotonic,
1936                                              direction,
1937                                              ret, offset, NULL);
1938 }
1939
1940 int journal_file_next_entry(
1941                 JournalFile *f,
1942                 Object *o, uint64_t p,
1943                 direction_t direction,
1944                 Object **ret, uint64_t *offset) {
1945
1946         uint64_t i, n, ofs;
1947         int r;
1948
1949         assert(f);
1950         assert(p > 0 || !o);
1951
1952         n = le64toh(f->header->n_entries);
1953         if (n <= 0)
1954                 return 0;
1955
1956         if (!o)
1957                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1958         else {
1959                 if (o->object.type != OBJECT_ENTRY)
1960                         return -EINVAL;
1961
1962                 r = generic_array_bisect(f,
1963                                          le64toh(f->header->entry_array_offset),
1964                                          le64toh(f->header->n_entries),
1965                                          p,
1966                                          test_object_offset,
1967                                          DIRECTION_DOWN,
1968                                          NULL, NULL,
1969                                          &i);
1970                 if (r <= 0)
1971                         return r;
1972
1973                 if (direction == DIRECTION_DOWN) {
1974                         if (i >= n - 1)
1975                                 return 0;
1976
1977                         i++;
1978                 } else {
1979                         if (i <= 0)
1980                                 return 0;
1981
1982                         i--;
1983                 }
1984         }
1985
1986         /* And jump to it */
1987         r = generic_array_get(f,
1988                               le64toh(f->header->entry_array_offset),
1989                               i,
1990                               ret, &ofs);
1991         if (r <= 0)
1992                 return r;
1993
1994         if (p > 0 &&
1995             (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
1996                 log_debug("%s: entry array corrupted at entry %"PRIu64,
1997                           f->path, i);
1998                 return -EBADMSG;
1999         }
2000
2001         if (offset)
2002                 *offset = ofs;
2003
2004         return 1;
2005 }
2006
2007 int journal_file_skip_entry(
2008                 JournalFile *f,
2009                 Object *o, uint64_t p,
2010                 int64_t skip,
2011                 Object **ret, uint64_t *offset) {
2012
2013         uint64_t i, n;
2014         int r;
2015
2016         assert(f);
2017         assert(o);
2018         assert(p > 0);
2019
2020         if (o->object.type != OBJECT_ENTRY)
2021                 return -EINVAL;
2022
2023         r = generic_array_bisect(f,
2024                                  le64toh(f->header->entry_array_offset),
2025                                  le64toh(f->header->n_entries),
2026                                  p,
2027                                  test_object_offset,
2028                                  DIRECTION_DOWN,
2029                                  NULL, NULL,
2030                                  &i);
2031         if (r <= 0)
2032                 return r;
2033
2034         /* Calculate new index */
2035         if (skip < 0) {
2036                 if ((uint64_t) -skip >= i)
2037                         i = 0;
2038                 else
2039                         i = i - (uint64_t) -skip;
2040         } else
2041                 i  += (uint64_t) skip;
2042
2043         n = le64toh(f->header->n_entries);
2044         if (n <= 0)
2045                 return -EBADMSG;
2046
2047         if (i >= n)
2048                 i = n-1;
2049
2050         return generic_array_get(f,
2051                                  le64toh(f->header->entry_array_offset),
2052                                  i,
2053                                  ret, offset);
2054 }
2055
2056 int journal_file_next_entry_for_data(
2057                 JournalFile *f,
2058                 Object *o, uint64_t p,
2059                 uint64_t data_offset,
2060                 direction_t direction,
2061                 Object **ret, uint64_t *offset) {
2062
2063         uint64_t n, i;
2064         int r;
2065         Object *d;
2066
2067         assert(f);
2068         assert(p > 0 || !o);
2069
2070         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2071         if (r < 0)
2072                 return r;
2073
2074         n = le64toh(d->data.n_entries);
2075         if (n <= 0)
2076                 return n;
2077
2078         if (!o)
2079                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2080         else {
2081                 if (o->object.type != OBJECT_ENTRY)
2082                         return -EINVAL;
2083
2084                 r = generic_array_bisect_plus_one(f,
2085                                                   le64toh(d->data.entry_offset),
2086                                                   le64toh(d->data.entry_array_offset),
2087                                                   le64toh(d->data.n_entries),
2088                                                   p,
2089                                                   test_object_offset,
2090                                                   DIRECTION_DOWN,
2091                                                   NULL, NULL,
2092                                                   &i);
2093
2094                 if (r <= 0)
2095                         return r;
2096
2097                 if (direction == DIRECTION_DOWN) {
2098                         if (i >= n - 1)
2099                                 return 0;
2100
2101                         i++;
2102                 } else {
2103                         if (i <= 0)
2104                                 return 0;
2105
2106                         i--;
2107                 }
2108
2109         }
2110
2111         return generic_array_get_plus_one(f,
2112                                           le64toh(d->data.entry_offset),
2113                                           le64toh(d->data.entry_array_offset),
2114                                           i,
2115                                           ret, offset);
2116 }
2117
2118 int journal_file_move_to_entry_by_offset_for_data(
2119                 JournalFile *f,
2120                 uint64_t data_offset,
2121                 uint64_t p,
2122                 direction_t direction,
2123                 Object **ret, uint64_t *offset) {
2124
2125         int r;
2126         Object *d;
2127
2128         assert(f);
2129
2130         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2131         if (r < 0)
2132                 return r;
2133
2134         return generic_array_bisect_plus_one(f,
2135                                              le64toh(d->data.entry_offset),
2136                                              le64toh(d->data.entry_array_offset),
2137                                              le64toh(d->data.n_entries),
2138                                              p,
2139                                              test_object_offset,
2140                                              direction,
2141                                              ret, offset, NULL);
2142 }
2143
2144 int journal_file_move_to_entry_by_monotonic_for_data(
2145                 JournalFile *f,
2146                 uint64_t data_offset,
2147                 sd_id128_t boot_id,
2148                 uint64_t monotonic,
2149                 direction_t direction,
2150                 Object **ret, uint64_t *offset) {
2151
2152         Object *o, *d;
2153         int r;
2154         uint64_t b, z;
2155
2156         assert(f);
2157
2158         /* First, seek by time */
2159         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2160         if (r < 0)
2161                 return r;
2162         if (r == 0)
2163                 return -ENOENT;
2164
2165         r = generic_array_bisect_plus_one(f,
2166                                           le64toh(o->data.entry_offset),
2167                                           le64toh(o->data.entry_array_offset),
2168                                           le64toh(o->data.n_entries),
2169                                           monotonic,
2170                                           test_object_monotonic,
2171                                           direction,
2172                                           NULL, &z, NULL);
2173         if (r <= 0)
2174                 return r;
2175
2176         /* And now, continue seeking until we find an entry that
2177          * exists in both bisection arrays */
2178
2179         for (;;) {
2180                 Object *qo;
2181                 uint64_t p, q;
2182
2183                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2184                 if (r < 0)
2185                         return r;
2186
2187                 r = generic_array_bisect_plus_one(f,
2188                                                   le64toh(d->data.entry_offset),
2189                                                   le64toh(d->data.entry_array_offset),
2190                                                   le64toh(d->data.n_entries),
2191                                                   z,
2192                                                   test_object_offset,
2193                                                   direction,
2194                                                   NULL, &p, NULL);
2195                 if (r <= 0)
2196                         return r;
2197
2198                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2199                 if (r < 0)
2200                         return r;
2201
2202                 r = generic_array_bisect_plus_one(f,
2203                                                   le64toh(o->data.entry_offset),
2204                                                   le64toh(o->data.entry_array_offset),
2205                                                   le64toh(o->data.n_entries),
2206                                                   p,
2207                                                   test_object_offset,
2208                                                   direction,
2209                                                   &qo, &q, NULL);
2210
2211                 if (r <= 0)
2212                         return r;
2213
2214                 if (p == q) {
2215                         if (ret)
2216                                 *ret = qo;
2217                         if (offset)
2218                                 *offset = q;
2219
2220                         return 1;
2221                 }
2222
2223                 z = q;
2224         }
2225 }
2226
2227 int journal_file_move_to_entry_by_seqnum_for_data(
2228                 JournalFile *f,
2229                 uint64_t data_offset,
2230                 uint64_t seqnum,
2231                 direction_t direction,
2232                 Object **ret, uint64_t *offset) {
2233
2234         Object *d;
2235         int r;
2236
2237         assert(f);
2238
2239         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2240         if (r < 0)
2241                 return r;
2242
2243         return generic_array_bisect_plus_one(f,
2244                                              le64toh(d->data.entry_offset),
2245                                              le64toh(d->data.entry_array_offset),
2246                                              le64toh(d->data.n_entries),
2247                                              seqnum,
2248                                              test_object_seqnum,
2249                                              direction,
2250                                              ret, offset, NULL);
2251 }
2252
2253 int journal_file_move_to_entry_by_realtime_for_data(
2254                 JournalFile *f,
2255                 uint64_t data_offset,
2256                 uint64_t realtime,
2257                 direction_t direction,
2258                 Object **ret, uint64_t *offset) {
2259
2260         Object *d;
2261         int r;
2262
2263         assert(f);
2264
2265         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2266         if (r < 0)
2267                 return r;
2268
2269         return generic_array_bisect_plus_one(f,
2270                                              le64toh(d->data.entry_offset),
2271                                              le64toh(d->data.entry_array_offset),
2272                                              le64toh(d->data.n_entries),
2273                                              realtime,
2274                                              test_object_realtime,
2275                                              direction,
2276                                              ret, offset, NULL);
2277 }
2278
2279 void journal_file_dump(JournalFile *f) {
2280         Object *o;
2281         int r;
2282         uint64_t p;
2283
2284         assert(f);
2285
2286         journal_file_print_header(f);
2287
2288         p = le64toh(f->header->header_size);
2289         while (p != 0) {
2290                 r = journal_file_move_to_object(f, -1, p, &o);
2291                 if (r < 0)
2292                         goto fail;
2293
2294                 switch (o->object.type) {
2295
2296                 case OBJECT_UNUSED:
2297                         printf("Type: OBJECT_UNUSED\n");
2298                         break;
2299
2300                 case OBJECT_DATA:
2301                         printf("Type: OBJECT_DATA\n");
2302                         break;
2303
2304                 case OBJECT_FIELD:
2305                         printf("Type: OBJECT_FIELD\n");
2306                         break;
2307
2308                 case OBJECT_ENTRY:
2309                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2310                                le64toh(o->entry.seqnum),
2311                                le64toh(o->entry.monotonic),
2312                                le64toh(o->entry.realtime));
2313                         break;
2314
2315                 case OBJECT_FIELD_HASH_TABLE:
2316                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2317                         break;
2318
2319                 case OBJECT_DATA_HASH_TABLE:
2320                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2321                         break;
2322
2323                 case OBJECT_ENTRY_ARRAY:
2324                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2325                         break;
2326
2327                 case OBJECT_TAG:
2328                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2329                                le64toh(o->tag.seqnum),
2330                                le64toh(o->tag.epoch));
2331                         break;
2332
2333                 default:
2334                         printf("Type: unknown (%u)\n", o->object.type);
2335                         break;
2336                 }
2337
2338                 if (o->object.flags & OBJECT_COMPRESSED)
2339                         printf("Flags: COMPRESSED\n");
2340
2341                 if (p == le64toh(f->header->tail_object_offset))
2342                         p = 0;
2343                 else
2344                         p = p + ALIGN64(le64toh(o->object.size));
2345         }
2346
2347         return;
2348 fail:
2349         log_error("File corrupt");
2350 }
2351
2352 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2353         const char *x;
2354
2355         x = format_timestamp(buf, l, t);
2356         if (x)
2357                 return x;
2358         return " --- ";
2359 }
2360
2361 void journal_file_print_header(JournalFile *f) {
2362         char a[33], b[33], c[33], d[33];
2363         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2364         struct stat st;
2365         char bytes[FORMAT_BYTES_MAX];
2366
2367         assert(f);
2368
2369         printf("File Path: %s\n"
2370                "File ID: %s\n"
2371                "Machine ID: %s\n"
2372                "Boot ID: %s\n"
2373                "Sequential Number ID: %s\n"
2374                "State: %s\n"
2375                "Compatible Flags:%s%s\n"
2376                "Incompatible Flags:%s%s\n"
2377                "Header size: %"PRIu64"\n"
2378                "Arena size: %"PRIu64"\n"
2379                "Data Hash Table Size: %"PRIu64"\n"
2380                "Field Hash Table Size: %"PRIu64"\n"
2381                "Rotate Suggested: %s\n"
2382                "Head Sequential Number: %"PRIu64"\n"
2383                "Tail Sequential Number: %"PRIu64"\n"
2384                "Head Realtime Timestamp: %s\n"
2385                "Tail Realtime Timestamp: %s\n"
2386                "Tail Monotonic Timestamp: %s\n"
2387                "Objects: %"PRIu64"\n"
2388                "Entry Objects: %"PRIu64"\n",
2389                f->path,
2390                sd_id128_to_string(f->header->file_id, a),
2391                sd_id128_to_string(f->header->machine_id, b),
2392                sd_id128_to_string(f->header->boot_id, c),
2393                sd_id128_to_string(f->header->seqnum_id, d),
2394                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2395                f->header->state == STATE_ONLINE ? "ONLINE" :
2396                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2397                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2398                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2399                JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2400                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2401                le64toh(f->header->header_size),
2402                le64toh(f->header->arena_size),
2403                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2404                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2405                yes_no(journal_file_rotate_suggested(f, 0)),
2406                le64toh(f->header->head_entry_seqnum),
2407                le64toh(f->header->tail_entry_seqnum),
2408                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2409                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2410                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2411                le64toh(f->header->n_objects),
2412                le64toh(f->header->n_entries));
2413
2414         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2415                 printf("Data Objects: %"PRIu64"\n"
2416                        "Data Hash Table Fill: %.1f%%\n",
2417                        le64toh(f->header->n_data),
2418                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2419
2420         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2421                 printf("Field Objects: %"PRIu64"\n"
2422                        "Field Hash Table Fill: %.1f%%\n",
2423                        le64toh(f->header->n_fields),
2424                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2425
2426         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2427                 printf("Tag Objects: %"PRIu64"\n",
2428                        le64toh(f->header->n_tags));
2429         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2430                 printf("Entry Array Objects: %"PRIu64"\n",
2431                        le64toh(f->header->n_entry_arrays));
2432
2433         if (fstat(f->fd, &st) >= 0)
2434                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2435 }
2436
2437 int journal_file_open(
2438                 const char *fname,
2439                 int flags,
2440                 mode_t mode,
2441                 bool compress,
2442                 bool seal,
2443                 JournalMetrics *metrics,
2444                 MMapCache *mmap_cache,
2445                 JournalFile *template,
2446                 JournalFile **ret) {
2447
2448         JournalFile *f;
2449         int r;
2450         bool newly_created = false;
2451
2452         assert(fname);
2453         assert(ret);
2454
2455         if ((flags & O_ACCMODE) != O_RDONLY &&
2456             (flags & O_ACCMODE) != O_RDWR)
2457                 return -EINVAL;
2458
2459         if (!endswith(fname, ".journal") &&
2460             !endswith(fname, ".journal~"))
2461                 return -EINVAL;
2462
2463         f = new0(JournalFile, 1);
2464         if (!f)
2465                 return -ENOMEM;
2466
2467         f->fd = -1;
2468         f->mode = mode;
2469
2470         f->flags = flags;
2471         f->prot = prot_from_flags(flags);
2472         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2473 #ifdef HAVE_XZ
2474         f->compress = compress;
2475 #endif
2476 #ifdef HAVE_GCRYPT
2477         f->seal = seal;
2478 #endif
2479
2480         if (mmap_cache)
2481                 f->mmap = mmap_cache_ref(mmap_cache);
2482         else {
2483                 f->mmap = mmap_cache_new();
2484                 if (!f->mmap) {
2485                         r = -ENOMEM;
2486                         goto fail;
2487                 }
2488         }
2489
2490         f->path = strdup(fname);
2491         if (!f->path) {
2492                 r = -ENOMEM;
2493                 goto fail;
2494         }
2495
2496         f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2497         if (!f->chain_cache) {
2498                 r = -ENOMEM;
2499                 goto fail;
2500         }
2501
2502         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2503         if (f->fd < 0) {
2504                 r = -errno;
2505                 goto fail;
2506         }
2507
2508         if (fstat(f->fd, &f->last_stat) < 0) {
2509                 r = -errno;
2510                 goto fail;
2511         }
2512
2513         if (f->last_stat.st_size == 0 && f->writable) {
2514 #ifdef HAVE_XATTR
2515                 uint64_t crtime;
2516
2517                 /* Let's attach the creation time to the journal file,
2518                  * so that the vacuuming code knows the age of this
2519                  * file even if the file might end up corrupted one
2520                  * day... Ideally we'd just use the creation time many
2521                  * file systems maintain for each file, but there is
2522                  * currently no usable API to query this, hence let's
2523                  * emulate this via extended attributes. If extended
2524                  * attributes are not supported we'll just skip this,
2525                  * and rely solely on mtime/atime/ctime of the file.*/
2526
2527                 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2528                 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2529 #endif
2530
2531 #ifdef HAVE_GCRYPT
2532                 /* Try to load the FSPRG state, and if we can't, then
2533                  * just don't do sealing */
2534                 if (f->seal) {
2535                         r = journal_file_fss_load(f);
2536                         if (r < 0)
2537                                 f->seal = false;
2538                 }
2539 #endif
2540
2541                 r = journal_file_init_header(f, template);
2542                 if (r < 0)
2543                         goto fail;
2544
2545                 if (fstat(f->fd, &f->last_stat) < 0) {
2546                         r = -errno;
2547                         goto fail;
2548                 }
2549
2550                 newly_created = true;
2551         }
2552
2553         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2554                 r = -EIO;
2555                 goto fail;
2556         }
2557
2558         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2559         if (f->header == MAP_FAILED) {
2560                 f->header = NULL;
2561                 r = -errno;
2562                 goto fail;
2563         }
2564
2565         if (!newly_created) {
2566                 r = journal_file_verify_header(f);
2567                 if (r < 0)
2568                         goto fail;
2569         }
2570
2571 #ifdef HAVE_GCRYPT
2572         if (!newly_created && f->writable) {
2573                 r = journal_file_fss_load(f);
2574                 if (r < 0)
2575                         goto fail;
2576         }
2577 #endif
2578
2579         if (f->writable) {
2580                 if (metrics) {
2581                         journal_default_metrics(metrics, f->fd);
2582                         f->metrics = *metrics;
2583                 } else if (template)
2584                         f->metrics = template->metrics;
2585
2586                 r = journal_file_refresh_header(f);
2587                 if (r < 0)
2588                         goto fail;
2589         }
2590
2591 #ifdef HAVE_GCRYPT
2592         r = journal_file_hmac_setup(f);
2593         if (r < 0)
2594                 goto fail;
2595 #endif
2596
2597         if (newly_created) {
2598                 r = journal_file_setup_field_hash_table(f);
2599                 if (r < 0)
2600                         goto fail;
2601
2602                 r = journal_file_setup_data_hash_table(f);
2603                 if (r < 0)
2604                         goto fail;
2605
2606 #ifdef HAVE_GCRYPT
2607                 r = journal_file_append_first_tag(f);
2608                 if (r < 0)
2609                         goto fail;
2610 #endif
2611         }
2612
2613         r = journal_file_map_field_hash_table(f);
2614         if (r < 0)
2615                 goto fail;
2616
2617         r = journal_file_map_data_hash_table(f);
2618         if (r < 0)
2619                 goto fail;
2620
2621         *ret = f;
2622         return 0;
2623
2624 fail:
2625         journal_file_close(f);
2626
2627         return r;
2628 }
2629
2630 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2631         _cleanup_free_ char *p = NULL;
2632         size_t l;
2633         JournalFile *old_file, *new_file = NULL;
2634         int r;
2635
2636         assert(f);
2637         assert(*f);
2638
2639         old_file = *f;
2640
2641         if (!old_file->writable)
2642                 return -EINVAL;
2643
2644         if (!endswith(old_file->path, ".journal"))
2645                 return -EINVAL;
2646
2647         l = strlen(old_file->path);
2648         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2649                      (int) l - 8, old_file->path,
2650                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2651                      le64toh((*f)->header->head_entry_seqnum),
2652                      le64toh((*f)->header->head_entry_realtime));
2653         if (r < 0)
2654                 return -ENOMEM;
2655
2656         r = rename(old_file->path, p);
2657         if (r < 0)
2658                 return -errno;
2659
2660         old_file->header->state = STATE_ARCHIVED;
2661
2662         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2663         journal_file_close(old_file);
2664
2665         *f = new_file;
2666         return r;
2667 }
2668
2669 int journal_file_open_reliably(
2670                 const char *fname,
2671                 int flags,
2672                 mode_t mode,
2673                 bool compress,
2674                 bool seal,
2675                 JournalMetrics *metrics,
2676                 MMapCache *mmap_cache,
2677                 JournalFile *template,
2678                 JournalFile **ret) {
2679
2680         int r;
2681         size_t l;
2682         _cleanup_free_ char *p = NULL;
2683
2684         r = journal_file_open(fname, flags, mode, compress, seal,
2685                               metrics, mmap_cache, template, ret);
2686         if (r != -EBADMSG && /* corrupted */
2687             r != -ENODATA && /* truncated */
2688             r != -EHOSTDOWN && /* other machine */
2689             r != -EPROTONOSUPPORT && /* incompatible feature */
2690             r != -EBUSY && /* unclean shutdown */
2691             r != -ESHUTDOWN /* already archived */)
2692                 return r;
2693
2694         if ((flags & O_ACCMODE) == O_RDONLY)
2695                 return r;
2696
2697         if (!(flags & O_CREAT))
2698                 return r;
2699
2700         if (!endswith(fname, ".journal"))
2701                 return r;
2702
2703         /* The file is corrupted. Rotate it away and try it again (but only once) */
2704
2705         l = strlen(fname);
2706         if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2707                      (int) l - 8, fname,
2708                      (unsigned long long) now(CLOCK_REALTIME),
2709                      random_u64()) < 0)
2710                 return -ENOMEM;
2711
2712         r = rename(fname, p);
2713         if (r < 0)
2714                 return -errno;
2715
2716         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2717
2718         return journal_file_open(fname, flags, mode, compress, seal,
2719                                  metrics, mmap_cache, template, ret);
2720 }
2721
2722 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2723         uint64_t i, n;
2724         uint64_t q, xor_hash = 0;
2725         int r;
2726         EntryItem *items;
2727         dual_timestamp ts;
2728
2729         assert(from);
2730         assert(to);
2731         assert(o);
2732         assert(p);
2733
2734         if (!to->writable)
2735                 return -EPERM;
2736
2737         ts.monotonic = le64toh(o->entry.monotonic);
2738         ts.realtime = le64toh(o->entry.realtime);
2739
2740         n = journal_file_entry_n_items(o);
2741         /* alloca() can't take 0, hence let's allocate at least one */
2742         items = alloca(sizeof(EntryItem) * MAX(1u, n));
2743
2744         for (i = 0; i < n; i++) {
2745                 uint64_t l, h;
2746                 le64_t le_hash;
2747                 size_t t;
2748                 void *data;
2749                 Object *u;
2750
2751                 q = le64toh(o->entry.items[i].object_offset);
2752                 le_hash = o->entry.items[i].hash;
2753
2754                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2755                 if (r < 0)
2756                         return r;
2757
2758                 if (le_hash != o->data.hash)
2759                         return -EBADMSG;
2760
2761                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2762                 t = (size_t) l;
2763
2764                 /* We hit the limit on 32bit machines */
2765                 if ((uint64_t) t != l)
2766                         return -E2BIG;
2767
2768                 if (o->object.flags & OBJECT_COMPRESSED) {
2769 #ifdef HAVE_XZ
2770                         uint64_t rsize;
2771
2772                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0))
2773                                 return -EBADMSG;
2774
2775                         data = from->compress_buffer;
2776                         l = rsize;
2777 #else
2778                         return -EPROTONOSUPPORT;
2779 #endif
2780                 } else
2781                         data = o->data.payload;
2782
2783                 r = journal_file_append_data(to, data, l, &u, &h);
2784                 if (r < 0)
2785                         return r;
2786
2787                 xor_hash ^= le64toh(u->data.hash);
2788                 items[i].object_offset = htole64(h);
2789                 items[i].hash = u->data.hash;
2790
2791                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2792                 if (r < 0)
2793                         return r;
2794         }
2795
2796         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2797 }
2798
2799 void journal_default_metrics(JournalMetrics *m, int fd) {
2800         uint64_t fs_size = 0;
2801         struct statvfs ss;
2802         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2803
2804         assert(m);
2805         assert(fd >= 0);
2806
2807         if (fstatvfs(fd, &ss) >= 0)
2808                 fs_size = ss.f_frsize * ss.f_blocks;
2809
2810         if (m->max_use == (uint64_t) -1) {
2811
2812                 if (fs_size > 0) {
2813                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2814
2815                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2816                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2817
2818                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2819                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2820                 } else
2821                         m->max_use = DEFAULT_MAX_USE_LOWER;
2822         } else {
2823                 m->max_use = PAGE_ALIGN(m->max_use);
2824
2825                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2826                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2827         }
2828
2829         if (m->max_size == (uint64_t) -1) {
2830                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2831
2832                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2833                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2834         } else
2835                 m->max_size = PAGE_ALIGN(m->max_size);
2836
2837         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2838                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2839
2840         if (m->max_size*2 > m->max_use)
2841                 m->max_use = m->max_size*2;
2842
2843         if (m->min_size == (uint64_t) -1)
2844                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2845         else {
2846                 m->min_size = PAGE_ALIGN(m->min_size);
2847
2848                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2849                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2850
2851                 if (m->min_size > m->max_size)
2852                         m->max_size = m->min_size;
2853         }
2854
2855         if (m->keep_free == (uint64_t) -1) {
2856
2857                 if (fs_size > 0) {
2858                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2859
2860                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2861                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2862
2863                 } else
2864                         m->keep_free = DEFAULT_KEEP_FREE;
2865         }
2866
2867         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2868                   format_bytes(a, sizeof(a), m->max_use),
2869                   format_bytes(b, sizeof(b), m->max_size),
2870                   format_bytes(c, sizeof(c), m->min_size),
2871                   format_bytes(d, sizeof(d), m->keep_free));
2872 }
2873
2874 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2875         assert(f);
2876         assert(from || to);
2877
2878         if (from) {
2879                 if (f->header->head_entry_realtime == 0)
2880                         return -ENOENT;
2881
2882                 *from = le64toh(f->header->head_entry_realtime);
2883         }
2884
2885         if (to) {
2886                 if (f->header->tail_entry_realtime == 0)
2887                         return -ENOENT;
2888
2889                 *to = le64toh(f->header->tail_entry_realtime);
2890         }
2891
2892         return 1;
2893 }
2894
2895 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2896         Object *o;
2897         uint64_t p;
2898         int r;
2899
2900         assert(f);
2901         assert(from || to);
2902
2903         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2904         if (r <= 0)
2905                 return r;
2906
2907         if (le64toh(o->data.n_entries) <= 0)
2908                 return 0;
2909
2910         if (from) {
2911                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2912                 if (r < 0)
2913                         return r;
2914
2915                 *from = le64toh(o->entry.monotonic);
2916         }
2917
2918         if (to) {
2919                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2920                 if (r < 0)
2921                         return r;
2922
2923                 r = generic_array_get_plus_one(f,
2924                                                le64toh(o->data.entry_offset),
2925                                                le64toh(o->data.entry_array_offset),
2926                                                le64toh(o->data.n_entries)-1,
2927                                                &o, NULL);
2928                 if (r <= 0)
2929                         return r;
2930
2931                 *to = le64toh(o->entry.monotonic);
2932         }
2933
2934         return 1;
2935 }
2936
2937 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2938         assert(f);
2939
2940         /* If we gained new header fields we gained new features,
2941          * hence suggest a rotation */
2942         if (le64toh(f->header->header_size) < sizeof(Header)) {
2943                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2944                 return true;
2945         }
2946
2947         /* Let's check if the hash tables grew over a certain fill
2948          * level (75%, borrowing this value from Java's hash table
2949          * implementation), and if so suggest a rotation. To calculate
2950          * the fill level we need the n_data field, which only exists
2951          * in newer versions. */
2952
2953         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2954                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2955                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2956                                   f->path,
2957                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2958                                   le64toh(f->header->n_data),
2959                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2960                                   (unsigned long long) f->last_stat.st_size,
2961                                   f->last_stat.st_size / le64toh(f->header->n_data));
2962                         return true;
2963                 }
2964
2965         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2966                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2967                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2968                                   f->path,
2969                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2970                                   le64toh(f->header->n_fields),
2971                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2972                         return true;
2973                 }
2974
2975         /* Are the data objects properly indexed by field objects? */
2976         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2977             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2978             le64toh(f->header->n_data) > 0 &&
2979             le64toh(f->header->n_fields) == 0)
2980                 return true;
2981
2982         if (max_file_usec > 0) {
2983                 usec_t t, h;
2984
2985                 h = le64toh(f->header->head_entry_realtime);
2986                 t = now(CLOCK_REALTIME);
2987
2988                 if (h > 0 && t > h + max_file_usec)
2989                         return true;
2990         }
2991
2992         return false;
2993 }