chiark / gitweb /
udev: declare some symbols static
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #ifdef HAVE_XATTR
31 #include <attr/xattr.h>
32 #endif
33
34 #include "journal-def.h"
35 #include "journal-file.h"
36 #include "journal-authenticate.h"
37 #include "lookup3.h"
38 #include "compress.h"
39 #include "fsprg.h"
40
/* Default sizes, in bytes, of the data and field hash tables: 2047 and
 * 333 HashItem slots respectively */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Data payloads of at least this many bytes are candidates for
 * compression when compression is enabled for the file */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL)           /* 4 MiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the system
 * size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */

/* n_data was the first entry we added after the initial file format
 * design, hence a valid header must at least reach up to it */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20
70
71 static int journal_file_set_online(JournalFile *f) {
72         assert(f);
73
74         if (!f->writable)
75                 return -EPERM;
76
77         if (!(f->fd >= 0 && f->header))
78                 return -EINVAL;
79
80         switch(f->header->state) {
81                 case STATE_ONLINE:
82                         return 0;
83
84                 case STATE_OFFLINE:
85                         f->header->state = STATE_ONLINE;
86                         fsync(f->fd);
87                         return 0;
88
89                 default:
90                         return -EINVAL;
91         }
92 }
93
94 int journal_file_set_offline(JournalFile *f) {
95         assert(f);
96
97         if (!f->writable)
98                 return -EPERM;
99
100         if (!(f->fd >= 0 && f->header))
101                 return -EINVAL;
102
103         if (f->header->state != STATE_ONLINE)
104                 return 0;
105
106         fsync(f->fd);
107
108         f->header->state = STATE_OFFLINE;
109
110         fsync(f->fd);
111
112         return 0;
113 }
114
115 void journal_file_close(JournalFile *f) {
116         assert(f);
117
118 #ifdef HAVE_GCRYPT
119         /* Write the final tag */
120         if (f->seal && f->writable)
121                 journal_file_append_tag(f);
122 #endif
123
124         /* Sync everything to disk, before we mark the file offline */
125         if (f->mmap && f->fd >= 0)
126                 mmap_cache_close_fd(f->mmap, f->fd);
127
128         journal_file_set_offline(f);
129
130         if (f->header)
131                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
132
133         if (f->fd >= 0)
134                 close_nointr_nofail(f->fd);
135
136         free(f->path);
137
138         if (f->mmap)
139                 mmap_cache_unref(f->mmap);
140
141         hashmap_free_free(f->chain_cache);
142
143 #ifdef HAVE_XZ
144         free(f->compress_buffer);
145 #endif
146
147 #ifdef HAVE_GCRYPT
148         if (f->fss_file)
149                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
150         else if (f->fsprg_state)
151                 free(f->fsprg_state);
152
153         free(f->fsprg_seed);
154
155         if (f->hmac)
156                 gcry_md_close(f->hmac);
157 #endif
158
159         free(f);
160 }
161
162 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
163         Header h;
164         ssize_t k;
165         int r;
166
167         assert(f);
168
169         zero(h);
170         memcpy(h.signature, HEADER_SIGNATURE, 8);
171         h.header_size = htole64(ALIGN64(sizeof(h)));
172
173         h.incompatible_flags =
174                 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
175
176         h.compatible_flags =
177                 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
178
179         r = sd_id128_randomize(&h.file_id);
180         if (r < 0)
181                 return r;
182
183         if (template) {
184                 h.seqnum_id = template->header->seqnum_id;
185                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
186         } else
187                 h.seqnum_id = h.file_id;
188
189         k = pwrite(f->fd, &h, sizeof(h), 0);
190         if (k < 0)
191                 return -errno;
192
193         if (k != sizeof(h))
194                 return -EIO;
195
196         return 0;
197 }
198
199 static int journal_file_refresh_header(JournalFile *f) {
200         int r;
201         sd_id128_t boot_id;
202
203         assert(f);
204
205         r = sd_id128_get_machine(&f->header->machine_id);
206         if (r < 0)
207                 return r;
208
209         r = sd_id128_get_boot(&boot_id);
210         if (r < 0)
211                 return r;
212
213         if (sd_id128_equal(boot_id, f->header->boot_id))
214                 f->tail_entry_monotonic_valid = true;
215
216         f->header->boot_id = boot_id;
217
218         journal_file_set_online(f);
219
220         /* Sync the online state to disk */
221         msync(f->header, PAGE_ALIGN(sizeof(Header)), MS_SYNC);
222         fdatasync(f->fd);
223
224         return 0;
225 }
226
227 static int journal_file_verify_header(JournalFile *f) {
228         assert(f);
229
230         if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
231                 return -EBADMSG;
232
233         /* In both read and write mode we refuse to open files with
234          * incompatible flags we don't know */
235 #ifdef HAVE_XZ
236         if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
237                 return -EPROTONOSUPPORT;
238 #else
239         if (f->header->incompatible_flags != 0)
240                 return -EPROTONOSUPPORT;
241 #endif
242
243         /* When open for writing we refuse to open files with
244          * compatible flags, too */
245         if (f->writable) {
246 #ifdef HAVE_GCRYPT
247                 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
248                         return -EPROTONOSUPPORT;
249 #else
250                 if (f->header->compatible_flags != 0)
251                         return -EPROTONOSUPPORT;
252 #endif
253         }
254
255         if (f->header->state >= _STATE_MAX)
256                 return -EBADMSG;
257
258         /* The first addition was n_data, so check that we are at least this large */
259         if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
260                 return -EBADMSG;
261
262         if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
263                 return -EBADMSG;
264
265         if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
266                 return -ENODATA;
267
268         if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
269                 return -ENODATA;
270
271         if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
272             !VALID64(le64toh(f->header->field_hash_table_offset)) ||
273             !VALID64(le64toh(f->header->tail_object_offset)) ||
274             !VALID64(le64toh(f->header->entry_array_offset)))
275                 return -ENODATA;
276
277         if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
278             le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
279             le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
280             le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
281                 return -ENODATA;
282
283         if (f->writable) {
284                 uint8_t state;
285                 sd_id128_t machine_id;
286                 int r;
287
288                 r = sd_id128_get_machine(&machine_id);
289                 if (r < 0)
290                         return r;
291
292                 if (!sd_id128_equal(machine_id, f->header->machine_id))
293                         return -EHOSTDOWN;
294
295                 state = f->header->state;
296
297                 if (state == STATE_ONLINE) {
298                         log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
299                         return -EBUSY;
300                 } else if (state == STATE_ARCHIVED)
301                         return -ESHUTDOWN;
302                 else if (state != STATE_OFFLINE) {
303                         log_debug("Journal file %s has unknown state %u.", f->path, state);
304                         return -EBUSY;
305                 }
306         }
307
308         f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
309
310         f->seal = JOURNAL_HEADER_SEALED(f->header);
311
312         return 0;
313 }
314
315 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
316         uint64_t old_size, new_size;
317         int r;
318
319         assert(f);
320
321         /* We assume that this file is not sparse, and we know that
322          * for sure, since we always call posix_fallocate()
323          * ourselves */
324
325         old_size =
326                 le64toh(f->header->header_size) +
327                 le64toh(f->header->arena_size);
328
329         new_size = PAGE_ALIGN(offset + size);
330         if (new_size < le64toh(f->header->header_size))
331                 new_size = le64toh(f->header->header_size);
332
333         if (new_size <= old_size)
334                 return 0;
335
336         if (f->metrics.max_size > 0 &&
337             new_size > f->metrics.max_size)
338                 return -E2BIG;
339
340         if (new_size > f->metrics.min_size &&
341             f->metrics.keep_free > 0) {
342                 struct statvfs svfs;
343
344                 if (fstatvfs(f->fd, &svfs) >= 0) {
345                         uint64_t available;
346
347                         available = svfs.f_bfree * svfs.f_bsize;
348
349                         if (available >= f->metrics.keep_free)
350                                 available -= f->metrics.keep_free;
351                         else
352                                 available = 0;
353
354                         if (new_size - old_size > available)
355                                 return -E2BIG;
356                 }
357         }
358
359         /* Note that the glibc fallocate() fallback is very
360            inefficient, hence we try to minimize the allocation area
361            as we can. */
362         r = posix_fallocate(f->fd, old_size, new_size - old_size);
363         if (r != 0)
364                 return -r;
365
366         if (fstat(f->fd, &f->last_stat) < 0)
367                 return -errno;
368
369         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
370
371         return 0;
372 }
373
374 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
375         assert(f);
376         assert(ret);
377
378         if (size <= 0)
379                 return -EINVAL;
380
381         /* Avoid SIGBUS on invalid accesses */
382         if (offset + size > (uint64_t) f->last_stat.st_size) {
383                 /* Hmm, out of range? Let's refresh the fstat() data
384                  * first, before we trust that check. */
385
386                 if (fstat(f->fd, &f->last_stat) < 0 ||
387                     offset + size > (uint64_t) f->last_stat.st_size)
388                         return -EADDRNOTAVAIL;
389         }
390
391         return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
392 }
393
394 static uint64_t minimum_header_size(Object *o) {
395
396         static uint64_t table[] = {
397                 [OBJECT_DATA] = sizeof(DataObject),
398                 [OBJECT_FIELD] = sizeof(FieldObject),
399                 [OBJECT_ENTRY] = sizeof(EntryObject),
400                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
401                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
402                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
403                 [OBJECT_TAG] = sizeof(TagObject),
404         };
405
406         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
407                 return sizeof(ObjectHeader);
408
409         return table[o->object.type];
410 }
411
412 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
413         int r;
414         void *t;
415         Object *o;
416         uint64_t s;
417         unsigned context;
418
419         assert(f);
420         assert(ret);
421
422         /* Objects may only be located at multiple of 64 bit */
423         if (!VALID64(offset))
424                 return -EFAULT;
425
426         /* One context for each type, plus one catch-all for the rest */
427         context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
428
429         r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
430         if (r < 0)
431                 return r;
432
433         o = (Object*) t;
434         s = le64toh(o->object.size);
435
436         if (s < sizeof(ObjectHeader))
437                 return -EBADMSG;
438
439         if (o->object.type <= OBJECT_UNUSED)
440                 return -EBADMSG;
441
442         if (s < minimum_header_size(o))
443                 return -EBADMSG;
444
445         if (type > 0 && o->object.type != type)
446                 return -EBADMSG;
447
448         if (s > sizeof(ObjectHeader)) {
449                 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
450                 if (r < 0)
451                         return r;
452
453                 o = (Object*) t;
454         }
455
456         *ret = o;
457         return 0;
458 }
459
460 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
461         uint64_t r;
462
463         assert(f);
464
465         r = le64toh(f->header->tail_entry_seqnum) + 1;
466
467         if (seqnum) {
468                 /* If an external seqnum counter was passed, we update
469                  * both the local and the external one, and set it to
470                  * the maximum of both */
471
472                 if (*seqnum + 1 > r)
473                         r = *seqnum + 1;
474
475                 *seqnum = r;
476         }
477
478         f->header->tail_entry_seqnum = htole64(r);
479
480         if (f->header->head_entry_seqnum == 0)
481                 f->header->head_entry_seqnum = htole64(r);
482
483         return r;
484 }
485
486 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
487         int r;
488         uint64_t p;
489         Object *tail, *o;
490         void *t;
491
492         assert(f);
493         assert(type > 0 && type < _OBJECT_TYPE_MAX);
494         assert(size >= sizeof(ObjectHeader));
495         assert(offset);
496         assert(ret);
497
498         r = journal_file_set_online(f);
499         if (r < 0)
500                 return r;
501
502         p = le64toh(f->header->tail_object_offset);
503         if (p == 0)
504                 p = le64toh(f->header->header_size);
505         else {
506                 r = journal_file_move_to_object(f, -1, p, &tail);
507                 if (r < 0)
508                         return r;
509
510                 p += ALIGN64(le64toh(tail->object.size));
511         }
512
513         r = journal_file_allocate(f, p, size);
514         if (r < 0)
515                 return r;
516
517         r = journal_file_move_to(f, type, false, p, size, &t);
518         if (r < 0)
519                 return r;
520
521         o = (Object*) t;
522
523         zero(o->object);
524         o->object.type = type;
525         o->object.size = htole64(size);
526
527         f->header->tail_object_offset = htole64(p);
528         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
529
530         *ret = o;
531         *offset = p;
532
533         return 0;
534 }
535
536 static int journal_file_setup_data_hash_table(JournalFile *f) {
537         uint64_t s, p;
538         Object *o;
539         int r;
540
541         assert(f);
542
543         /* We estimate that we need 1 hash table entry per 768 of
544            journal file and we want to make sure we never get beyond
545            75% fill level. Calculate the hash table size for the
546            maximum file size based on these metrics. */
547
548         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
549         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
550                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
551
552         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
553
554         r = journal_file_append_object(f,
555                                        OBJECT_DATA_HASH_TABLE,
556                                        offsetof(Object, hash_table.items) + s,
557                                        &o, &p);
558         if (r < 0)
559                 return r;
560
561         memset(o->hash_table.items, 0, s);
562
563         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
564         f->header->data_hash_table_size = htole64(s);
565
566         return 0;
567 }
568
569 static int journal_file_setup_field_hash_table(JournalFile *f) {
570         uint64_t s, p;
571         Object *o;
572         int r;
573
574         assert(f);
575
576         /* We use a fixed size hash table for the fields as this
577          * number should grow very slowly only */
578
579         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
580         r = journal_file_append_object(f,
581                                        OBJECT_FIELD_HASH_TABLE,
582                                        offsetof(Object, hash_table.items) + s,
583                                        &o, &p);
584         if (r < 0)
585                 return r;
586
587         memset(o->hash_table.items, 0, s);
588
589         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
590         f->header->field_hash_table_size = htole64(s);
591
592         return 0;
593 }
594
595 static int journal_file_map_data_hash_table(JournalFile *f) {
596         uint64_t s, p;
597         void *t;
598         int r;
599
600         assert(f);
601
602         p = le64toh(f->header->data_hash_table_offset);
603         s = le64toh(f->header->data_hash_table_size);
604
605         r = journal_file_move_to(f,
606                                  OBJECT_DATA_HASH_TABLE,
607                                  true,
608                                  p, s,
609                                  &t);
610         if (r < 0)
611                 return r;
612
613         f->data_hash_table = t;
614         return 0;
615 }
616
617 static int journal_file_map_field_hash_table(JournalFile *f) {
618         uint64_t s, p;
619         void *t;
620         int r;
621
622         assert(f);
623
624         p = le64toh(f->header->field_hash_table_offset);
625         s = le64toh(f->header->field_hash_table_size);
626
627         r = journal_file_move_to(f,
628                                  OBJECT_FIELD_HASH_TABLE,
629                                  true,
630                                  p, s,
631                                  &t);
632         if (r < 0)
633                 return r;
634
635         f->field_hash_table = t;
636         return 0;
637 }
638
639 static int journal_file_link_field(
640                 JournalFile *f,
641                 Object *o,
642                 uint64_t offset,
643                 uint64_t hash) {
644
645         uint64_t p, h;
646         int r;
647
648         assert(f);
649         assert(o);
650         assert(offset > 0);
651
652         if (o->object.type != OBJECT_FIELD)
653                 return -EINVAL;
654
655         /* This might alter the window we are looking at */
656
657         o->field.next_hash_offset = o->field.head_data_offset = 0;
658
659         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
660         p = le64toh(f->field_hash_table[h].tail_hash_offset);
661         if (p == 0)
662                 f->field_hash_table[h].head_hash_offset = htole64(offset);
663         else {
664                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
665                 if (r < 0)
666                         return r;
667
668                 o->field.next_hash_offset = htole64(offset);
669         }
670
671         f->field_hash_table[h].tail_hash_offset = htole64(offset);
672
673         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
674                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
675
676         return 0;
677 }
678
679 static int journal_file_link_data(
680                 JournalFile *f,
681                 Object *o,
682                 uint64_t offset,
683                 uint64_t hash) {
684
685         uint64_t p, h;
686         int r;
687
688         assert(f);
689         assert(o);
690         assert(offset > 0);
691
692         if (o->object.type != OBJECT_DATA)
693                 return -EINVAL;
694
695         /* This might alter the window we are looking at */
696
697         o->data.next_hash_offset = o->data.next_field_offset = 0;
698         o->data.entry_offset = o->data.entry_array_offset = 0;
699         o->data.n_entries = 0;
700
701         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
702         p = le64toh(f->data_hash_table[h].tail_hash_offset);
703         if (p == 0)
704                 /* Only entry in the hash table is easy */
705                 f->data_hash_table[h].head_hash_offset = htole64(offset);
706         else {
707                 /* Move back to the previous data object, to patch in
708                  * pointer */
709
710                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
711                 if (r < 0)
712                         return r;
713
714                 o->data.next_hash_offset = htole64(offset);
715         }
716
717         f->data_hash_table[h].tail_hash_offset = htole64(offset);
718
719         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
720                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
721
722         return 0;
723 }
724
725 int journal_file_find_field_object_with_hash(
726                 JournalFile *f,
727                 const void *field, uint64_t size, uint64_t hash,
728                 Object **ret, uint64_t *offset) {
729
730         uint64_t p, osize, h;
731         int r;
732
733         assert(f);
734         assert(field && size > 0);
735
736         osize = offsetof(Object, field.payload) + size;
737
738         if (f->header->field_hash_table_size == 0)
739                 return -EBADMSG;
740
741         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
742         p = le64toh(f->field_hash_table[h].head_hash_offset);
743
744         while (p > 0) {
745                 Object *o;
746
747                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
748                 if (r < 0)
749                         return r;
750
751                 if (le64toh(o->field.hash) == hash &&
752                     le64toh(o->object.size) == osize &&
753                     memcmp(o->field.payload, field, size) == 0) {
754
755                         if (ret)
756                                 *ret = o;
757                         if (offset)
758                                 *offset = p;
759
760                         return 1;
761                 }
762
763                 p = le64toh(o->field.next_hash_offset);
764         }
765
766         return 0;
767 }
768
769 int journal_file_find_field_object(
770                 JournalFile *f,
771                 const void *field, uint64_t size,
772                 Object **ret, uint64_t *offset) {
773
774         uint64_t hash;
775
776         assert(f);
777         assert(field && size > 0);
778
779         hash = hash64(field, size);
780
781         return journal_file_find_field_object_with_hash(f,
782                                                         field, size, hash,
783                                                         ret, offset);
784 }
785
786 int journal_file_find_data_object_with_hash(
787                 JournalFile *f,
788                 const void *data, uint64_t size, uint64_t hash,
789                 Object **ret, uint64_t *offset) {
790
791         uint64_t p, osize, h;
792         int r;
793
794         assert(f);
795         assert(data || size == 0);
796
797         osize = offsetof(Object, data.payload) + size;
798
799         if (f->header->data_hash_table_size == 0)
800                 return -EBADMSG;
801
802         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
803         p = le64toh(f->data_hash_table[h].head_hash_offset);
804
805         while (p > 0) {
806                 Object *o;
807
808                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
809                 if (r < 0)
810                         return r;
811
812                 if (le64toh(o->data.hash) != hash)
813                         goto next;
814
815                 if (o->object.flags & OBJECT_COMPRESSED) {
816 #ifdef HAVE_XZ
817                         uint64_t l, rsize;
818
819                         l = le64toh(o->object.size);
820                         if (l <= offsetof(Object, data.payload))
821                                 return -EBADMSG;
822
823                         l -= offsetof(Object, data.payload);
824
825                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0))
826                                 return -EBADMSG;
827
828                         if (rsize == size &&
829                             memcmp(f->compress_buffer, data, size) == 0) {
830
831                                 if (ret)
832                                         *ret = o;
833
834                                 if (offset)
835                                         *offset = p;
836
837                                 return 1;
838                         }
839 #else
840                         return -EPROTONOSUPPORT;
841 #endif
842
843                 } else if (le64toh(o->object.size) == osize &&
844                            memcmp(o->data.payload, data, size) == 0) {
845
846                         if (ret)
847                                 *ret = o;
848
849                         if (offset)
850                                 *offset = p;
851
852                         return 1;
853                 }
854
855         next:
856                 p = le64toh(o->data.next_hash_offset);
857         }
858
859         return 0;
860 }
861
862 int journal_file_find_data_object(
863                 JournalFile *f,
864                 const void *data, uint64_t size,
865                 Object **ret, uint64_t *offset) {
866
867         uint64_t hash;
868
869         assert(f);
870         assert(data || size == 0);
871
872         hash = hash64(data, size);
873
874         return journal_file_find_data_object_with_hash(f,
875                                                        data, size, hash,
876                                                        ret, offset);
877 }
878
879 static int journal_file_append_field(
880                 JournalFile *f,
881                 const void *field, uint64_t size,
882                 Object **ret, uint64_t *offset) {
883
884         uint64_t hash, p;
885         uint64_t osize;
886         Object *o;
887         int r;
888
889         assert(f);
890         assert(field && size > 0);
891
892         hash = hash64(field, size);
893
894         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
895         if (r < 0)
896                 return r;
897         else if (r > 0) {
898
899                 if (ret)
900                         *ret = o;
901
902                 if (offset)
903                         *offset = p;
904
905                 return 0;
906         }
907
908         osize = offsetof(Object, field.payload) + size;
909         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
910         if (r < 0)
911                 return r;
912
913         o->field.hash = htole64(hash);
914         memcpy(o->field.payload, field, size);
915
916         r = journal_file_link_field(f, o, p, hash);
917         if (r < 0)
918                 return r;
919
920         /* The linking might have altered the window, so let's
921          * refresh our pointer */
922         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
923         if (r < 0)
924                 return r;
925
926 #ifdef HAVE_GCRYPT
927         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
928         if (r < 0)
929                 return r;
930 #endif
931
932         if (ret)
933                 *ret = o;
934
935         if (offset)
936                 *offset = p;
937
938         return 0;
939 }
940
941 static int journal_file_append_data(
942                 JournalFile *f,
943                 const void *data, uint64_t size,
944                 Object **ret, uint64_t *offset) {
945
946         uint64_t hash, p;
947         uint64_t osize;
948         Object *o;
949         int r;
950         bool compressed = false;
951         const void *eq;
952
953         assert(f);
954         assert(data || size == 0);
955
956         hash = hash64(data, size);
957
958         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
959         if (r < 0)
960                 return r;
961         else if (r > 0) {
962
963                 if (ret)
964                         *ret = o;
965
966                 if (offset)
967                         *offset = p;
968
969                 return 0;
970         }
971
972         osize = offsetof(Object, data.payload) + size;
973         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
974         if (r < 0)
975                 return r;
976
977         o->data.hash = htole64(hash);
978
979 #ifdef HAVE_XZ
980         if (f->compress &&
981             size >= COMPRESSION_SIZE_THRESHOLD) {
982                 uint64_t rsize;
983
984                 compressed = compress_blob(data, size, o->data.payload, &rsize);
985
986                 if (compressed) {
987                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
988                         o->object.flags |= OBJECT_COMPRESSED;
989
990                         log_debug("Compressed data object %"PRIu64" -> %"PRIu64, size, rsize);
991                 }
992         }
993 #endif
994
995         if (!compressed && size > 0)
996                 memcpy(o->data.payload, data, size);
997
998         r = journal_file_link_data(f, o, p, hash);
999         if (r < 0)
1000                 return r;
1001
1002         /* The linking might have altered the window, so let's
1003          * refresh our pointer */
1004         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1005         if (r < 0)
1006                 return r;
1007
1008         eq = memchr(data, '=', size);
1009         if (eq && eq > data) {
1010                 uint64_t fp;
1011                 Object *fo;
1012
1013                 /* Create field object ... */
1014                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1015                 if (r < 0)
1016                         return r;
1017
1018                 /* ... and link it in. */
1019                 o->data.next_field_offset = fo->field.head_data_offset;
1020                 fo->field.head_data_offset = le64toh(p);
1021         }
1022
1023 #ifdef HAVE_GCRYPT
1024         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1025         if (r < 0)
1026                 return r;
1027 #endif
1028
1029         if (ret)
1030                 *ret = o;
1031
1032         if (offset)
1033                 *offset = p;
1034
1035         return 0;
1036 }
1037
1038 uint64_t journal_file_entry_n_items(Object *o) {
1039         assert(o);
1040
1041         if (o->object.type != OBJECT_ENTRY)
1042                 return 0;
1043
1044         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1045 }
1046
1047 uint64_t journal_file_entry_array_n_items(Object *o) {
1048         assert(o);
1049
1050         if (o->object.type != OBJECT_ENTRY_ARRAY)
1051                 return 0;
1052
1053         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1054 }
1055
1056 uint64_t journal_file_hash_table_n_items(Object *o) {
1057         assert(o);
1058
1059         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1060             o->object.type != OBJECT_FIELD_HASH_TABLE)
1061                 return 0;
1062
1063         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1064 }
1065
1066 static int link_entry_into_array(JournalFile *f,
1067                                  le64_t *first,
1068                                  le64_t *idx,
1069                                  uint64_t p) {
1070         int r;
1071         uint64_t n = 0, ap = 0, q, i, a, hidx;
1072         Object *o;
1073
1074         assert(f);
1075         assert(first);
1076         assert(idx);
1077         assert(p > 0);
1078
1079         a = le64toh(*first);
1080         i = hidx = le64toh(*idx);
1081         while (a > 0) {
1082
1083                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1084                 if (r < 0)
1085                         return r;
1086
1087                 n = journal_file_entry_array_n_items(o);
1088                 if (i < n) {
1089                         o->entry_array.items[i] = htole64(p);
1090                         *idx = htole64(hidx + 1);
1091                         return 0;
1092                 }
1093
1094                 i -= n;
1095                 ap = a;
1096                 a = le64toh(o->entry_array.next_entry_array_offset);
1097         }
1098
1099         if (hidx > n)
1100                 n = (hidx+1) * 2;
1101         else
1102                 n = n * 2;
1103
1104         if (n < 4)
1105                 n = 4;
1106
1107         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1108                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1109                                        &o, &q);
1110         if (r < 0)
1111                 return r;
1112
1113 #ifdef HAVE_GCRYPT
1114         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1115         if (r < 0)
1116                 return r;
1117 #endif
1118
1119         o->entry_array.items[i] = htole64(p);
1120
1121         if (ap == 0)
1122                 *first = htole64(q);
1123         else {
1124                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1125                 if (r < 0)
1126                         return r;
1127
1128                 o->entry_array.next_entry_array_offset = htole64(q);
1129         }
1130
1131         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1132                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1133
1134         *idx = htole64(hidx + 1);
1135
1136         return 0;
1137 }
1138
1139 static int link_entry_into_array_plus_one(JournalFile *f,
1140                                           le64_t *extra,
1141                                           le64_t *first,
1142                                           le64_t *idx,
1143                                           uint64_t p) {
1144
1145         int r;
1146
1147         assert(f);
1148         assert(extra);
1149         assert(first);
1150         assert(idx);
1151         assert(p > 0);
1152
1153         if (*idx == 0)
1154                 *extra = htole64(p);
1155         else {
1156                 le64_t i;
1157
1158                 i = htole64(le64toh(*idx) - 1);
1159                 r = link_entry_into_array(f, first, &i, p);
1160                 if (r < 0)
1161                         return r;
1162         }
1163
1164         *idx = htole64(le64toh(*idx) + 1);
1165         return 0;
1166 }
1167
1168 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1169         uint64_t p;
1170         int r;
1171         assert(f);
1172         assert(o);
1173         assert(offset > 0);
1174
1175         p = le64toh(o->entry.items[i].object_offset);
1176         if (p == 0)
1177                 return -EINVAL;
1178
1179         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1180         if (r < 0)
1181                 return r;
1182
1183         return link_entry_into_array_plus_one(f,
1184                                               &o->data.entry_offset,
1185                                               &o->data.entry_array_offset,
1186                                               &o->data.n_entries,
1187                                               offset);
1188 }
1189
1190 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1191         uint64_t n, i;
1192         int r;
1193
1194         assert(f);
1195         assert(o);
1196         assert(offset > 0);
1197
1198         if (o->object.type != OBJECT_ENTRY)
1199                 return -EINVAL;
1200
1201         __sync_synchronize();
1202
1203         /* Link up the entry itself */
1204         r = link_entry_into_array(f,
1205                                   &f->header->entry_array_offset,
1206                                   &f->header->n_entries,
1207                                   offset);
1208         if (r < 0)
1209                 return r;
1210
1211         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1212
1213         if (f->header->head_entry_realtime == 0)
1214                 f->header->head_entry_realtime = o->entry.realtime;
1215
1216         f->header->tail_entry_realtime = o->entry.realtime;
1217         f->header->tail_entry_monotonic = o->entry.monotonic;
1218
1219         f->tail_entry_monotonic_valid = true;
1220
1221         /* Link up the items */
1222         n = journal_file_entry_n_items(o);
1223         for (i = 0; i < n; i++) {
1224                 r = journal_file_link_entry_item(f, o, offset, i);
1225                 if (r < 0)
1226                         return r;
1227         }
1228
1229         return 0;
1230 }
1231
1232 static int journal_file_append_entry_internal(
1233                 JournalFile *f,
1234                 const dual_timestamp *ts,
1235                 uint64_t xor_hash,
1236                 const EntryItem items[], unsigned n_items,
1237                 uint64_t *seqnum,
1238                 Object **ret, uint64_t *offset) {
1239         uint64_t np;
1240         uint64_t osize;
1241         Object *o;
1242         int r;
1243
1244         assert(f);
1245         assert(items || n_items == 0);
1246         assert(ts);
1247
1248         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1249
1250         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1251         if (r < 0)
1252                 return r;
1253
1254         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1255         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1256         o->entry.realtime = htole64(ts->realtime);
1257         o->entry.monotonic = htole64(ts->monotonic);
1258         o->entry.xor_hash = htole64(xor_hash);
1259         o->entry.boot_id = f->header->boot_id;
1260
1261 #ifdef HAVE_GCRYPT
1262         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1263         if (r < 0)
1264                 return r;
1265 #endif
1266
1267         r = journal_file_link_entry(f, o, np);
1268         if (r < 0)
1269                 return r;
1270
1271         if (ret)
1272                 *ret = o;
1273
1274         if (offset)
1275                 *offset = np;
1276
1277         return 0;
1278 }
1279
1280 void journal_file_post_change(JournalFile *f) {
1281         assert(f);
1282
1283         /* inotify() does not receive IN_MODIFY events from file
1284          * accesses done via mmap(). After each access we hence
1285          * trigger IN_MODIFY by truncating the journal file to its
1286          * current size which triggers IN_MODIFY. */
1287
1288         __sync_synchronize();
1289
1290         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1291                 log_error("Failed to truncate file to its own size: %m");
1292 }
1293
1294 static int entry_item_cmp(const void *_a, const void *_b) {
1295         const EntryItem *a = _a, *b = _b;
1296
1297         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1298                 return -1;
1299         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1300                 return 1;
1301         return 0;
1302 }
1303
1304 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1305         unsigned i;
1306         EntryItem *items;
1307         int r;
1308         uint64_t xor_hash = 0;
1309         struct dual_timestamp _ts;
1310
1311         assert(f);
1312         assert(iovec || n_iovec == 0);
1313
1314         if (!ts) {
1315                 dual_timestamp_get(&_ts);
1316                 ts = &_ts;
1317         }
1318
1319         if (f->tail_entry_monotonic_valid &&
1320             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1321                 return -EINVAL;
1322
1323 #ifdef HAVE_GCRYPT
1324         r = journal_file_maybe_append_tag(f, ts->realtime);
1325         if (r < 0)
1326                 return r;
1327 #endif
1328
1329         /* alloca() can't take 0, hence let's allocate at least one */
1330         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1331
1332         for (i = 0; i < n_iovec; i++) {
1333                 uint64_t p;
1334                 Object *o;
1335
1336                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1337                 if (r < 0)
1338                         return r;
1339
1340                 xor_hash ^= le64toh(o->data.hash);
1341                 items[i].object_offset = htole64(p);
1342                 items[i].hash = o->data.hash;
1343         }
1344
1345         /* Order by the position on disk, in order to improve seek
1346          * times for rotating media. */
1347         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1348
1349         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1350
1351         journal_file_post_change(f);
1352
1353         return r;
1354 }
1355
/* One cached position inside a chain of entry arrays. Items are kept
 * in a hashmap keyed by 'first' and let generic_array_get() and
 * generic_array_bisect() resume at 'array' instead of walking the
 * chain from its head again. */
1356 typedef struct ChainCacheItem {
1357         uint64_t first; /* offset of the array at the beginning of the chain; used as the hashmap key */
1358         uint64_t array; /* offset of the cached array within that chain */
1359         uint64_t begin; /* the first item stored in the cached array */
1360         uint64_t total; /* the total number of items in all arrays before the cached one in the chain */
1361 } ChainCacheItem;
1362
1363 static void chain_cache_put(
1364                 Hashmap *h,
1365                 ChainCacheItem *ci,
1366                 uint64_t first,
1367                 uint64_t array,
1368                 uint64_t begin,
1369                 uint64_t total) {
1370
1371         if (!ci) {
1372                 /* If the chain item to cache for this chain is the
1373                  * first one it's not worth caching anything */
1374                 if (array == first)
1375                         return;
1376
1377                 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1378                         ci = hashmap_steal_first(h);
1379                 else {
1380                         ci = new(ChainCacheItem, 1);
1381                         if (!ci)
1382                                 return;
1383                 }
1384
1385                 ci->first = first;
1386
1387                 if (hashmap_put(h, &ci->first, ci) < 0) {
1388                         free(ci);
1389                         return;
1390                 }
1391         } else
1392                 assert(ci->first == first);
1393
1394         ci->array = array;
1395         ci->begin = begin;
1396         ci->total = total;
1397 }
1398
1399 static int generic_array_get(JournalFile *f,
1400                              uint64_t first,
1401                              uint64_t i,
1402                              Object **ret, uint64_t *offset) {
1403
1404         Object *o;
1405         uint64_t p = 0, a, t = 0;
1406         int r;
1407         ChainCacheItem *ci;
1408
1409         assert(f);
1410
1411         a = first;
1412
1413         /* Try the chain cache first */
1414         ci = hashmap_get(f->chain_cache, &first);
1415         if (ci && i > ci->total) {
1416                 a = ci->array;
1417                 i -= ci->total;
1418                 t = ci->total;
1419         }
1420
1421         while (a > 0) {
1422                 uint64_t k;
1423
1424                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1425                 if (r < 0)
1426                         return r;
1427
1428                 k = journal_file_entry_array_n_items(o);
1429                 if (i < k) {
1430                         p = le64toh(o->entry_array.items[i]);
1431                         goto found;
1432                 }
1433
1434                 i -= k;
1435                 t += k;
1436                 a = le64toh(o->entry_array.next_entry_array_offset);
1437         }
1438
1439         return 0;
1440
1441 found:
1442         /* Let's cache this item for the next invocation */
1443         chain_cache_put(f->chain_cache, ci, first, a, o->entry_array.items[0], t);
1444
1445         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1446         if (r < 0)
1447                 return r;
1448
1449         if (ret)
1450                 *ret = o;
1451
1452         if (offset)
1453                 *offset = p;
1454
1455         return 1;
1456 }
1457
1458 static int generic_array_get_plus_one(JournalFile *f,
1459                                       uint64_t extra,
1460                                       uint64_t first,
1461                                       uint64_t i,
1462                                       Object **ret, uint64_t *offset) {
1463
1464         Object *o;
1465
1466         assert(f);
1467
1468         if (i == 0) {
1469                 int r;
1470
1471                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1472                 if (r < 0)
1473                         return r;
1474
1475                 if (ret)
1476                         *ret = o;
1477
1478                 if (offset)
1479                         *offset = extra;
1480
1481                 return 1;
1482         }
1483
1484         return generic_array_get(f, first, i-1, ret, offset);
1485 }
1486
/* Results of the test_object callbacks used by the bisection code.
 * They tell where the tested item lies relative to the needle. */
1487 enum {
1488         TEST_FOUND, /* the tested item matches the needle */
1489         TEST_LEFT,  /* the tested item is smaller than the needle, i.e. lies left of it */
1490         TEST_RIGHT  /* the tested item is larger than the needle, i.e. lies right of it */
1491 };
1492
1493 static int generic_array_bisect(JournalFile *f,
1494                                 uint64_t first,
1495                                 uint64_t n,
1496                                 uint64_t needle,
1497                                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1498                                 direction_t direction,
1499                                 Object **ret,
1500                                 uint64_t *offset,
1501                                 uint64_t *idx) {
1502
1503         uint64_t a, p, t = 0, i = 0, last_p = 0;
1504         bool subtract_one = false;
1505         Object *o, *array = NULL;
1506         int r;
1507         ChainCacheItem *ci;
1508
1509         assert(f);
1510         assert(test_object);
1511
1512         /* Start with the first array in the chain */
1513         a = first;
1514
1515         ci = hashmap_get(f->chain_cache, &first);
1516         if (ci && n > ci->total) {
1517                 /* Ah, we have iterated this bisection array chain
1518                  * previously! Let's see if we can skip ahead in the
1519                  * chain, as far as the last time. But we can't jump
1520                  * backwards in the chain, so let's check that
1521                  * first. */
1522
1523                 r = test_object(f, ci->begin, needle);
1524                 if (r < 0)
1525                         return r;
1526
1527                 if (r == TEST_LEFT) {
1528                         /* OK, what we are looking for is right of th
1529                          * begin of this EntryArray, so let's jump
1530                          * straight to previously cached array in the
1531                          * chain */
1532
1533                         a = ci->array;
1534                         n -= ci->total;
1535                         t = ci->total;
1536                 }
1537         }
1538
1539         while (a > 0) {
1540                 uint64_t left, right, k, lp;
1541
1542                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1543                 if (r < 0)
1544                         return r;
1545
1546                 k = journal_file_entry_array_n_items(array);
1547                 right = MIN(k, n);
1548                 if (right <= 0)
1549                         return 0;
1550
1551                 i = right - 1;
1552                 lp = p = le64toh(array->entry_array.items[i]);
1553                 if (p <= 0)
1554                         return -EBADMSG;
1555
1556                 r = test_object(f, p, needle);
1557                 if (r < 0)
1558                         return r;
1559
1560                 if (r == TEST_FOUND)
1561                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1562
1563                 if (r == TEST_RIGHT) {
1564                         left = 0;
1565                         right -= 1;
1566                         for (;;) {
1567                                 if (left == right) {
1568                                         if (direction == DIRECTION_UP)
1569                                                 subtract_one = true;
1570
1571                                         i = left;
1572                                         goto found;
1573                                 }
1574
1575                                 assert(left < right);
1576
1577                                 i = (left + right) / 2;
1578                                 p = le64toh(array->entry_array.items[i]);
1579                                 if (p <= 0)
1580                                         return -EBADMSG;
1581
1582                                 r = test_object(f, p, needle);
1583                                 if (r < 0)
1584                                         return r;
1585
1586                                 if (r == TEST_FOUND)
1587                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1588
1589                                 if (r == TEST_RIGHT)
1590                                         right = i;
1591                                 else
1592                                         left = i + 1;
1593                         }
1594                 }
1595
1596                 if (k > n) {
1597                         if (direction == DIRECTION_UP) {
1598                                 i = n;
1599                                 subtract_one = true;
1600                                 goto found;
1601                         }
1602
1603                         return 0;
1604                 }
1605
1606                 last_p = lp;
1607
1608                 n -= k;
1609                 t += k;
1610                 a = le64toh(array->entry_array.next_entry_array_offset);
1611         }
1612
1613         return 0;
1614
1615 found:
1616         if (subtract_one && t == 0 && i == 0)
1617                 return 0;
1618
1619         /* Let's cache this item for the next invocation */
1620         chain_cache_put(f->chain_cache, ci, first, a, array->entry_array.items[0], t);
1621
1622         if (subtract_one && i == 0)
1623                 p = last_p;
1624         else if (subtract_one)
1625                 p = le64toh(array->entry_array.items[i-1]);
1626         else
1627                 p = le64toh(array->entry_array.items[i]);
1628
1629         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1630         if (r < 0)
1631                 return r;
1632
1633         if (ret)
1634                 *ret = o;
1635
1636         if (offset)
1637                 *offset = p;
1638
1639         if (idx)
1640                 *idx = t + i + (subtract_one ? -1 : 0);
1641
1642         return 1;
1643 }
1644
1645 static int generic_array_bisect_plus_one(JournalFile *f,
1646                                          uint64_t extra,
1647                                          uint64_t first,
1648                                          uint64_t n,
1649                                          uint64_t needle,
1650                                          int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1651                                          direction_t direction,
1652                                          Object **ret,
1653                                          uint64_t *offset,
1654                                          uint64_t *idx) {
1655
1656         int r;
1657         bool step_back = false;
1658         Object *o;
1659
1660         assert(f);
1661         assert(test_object);
1662
1663         if (n <= 0)
1664                 return 0;
1665
1666         /* This bisects the array in object 'first', but first checks
1667          * an extra  */
1668         r = test_object(f, extra, needle);
1669         if (r < 0)
1670                 return r;
1671
1672         if (r == TEST_FOUND)
1673                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1674
1675         /* if we are looking with DIRECTION_UP then we need to first
1676            see if in the actual array there is a matching entry, and
1677            return the last one of that. But if there isn't any we need
1678            to return this one. Hence remember this, and return it
1679            below. */
1680         if (r == TEST_LEFT)
1681                 step_back = direction == DIRECTION_UP;
1682
1683         if (r == TEST_RIGHT) {
1684                 if (direction == DIRECTION_DOWN)
1685                         goto found;
1686                 else
1687                         return 0;
1688         }
1689
1690         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1691
1692         if (r == 0 && step_back)
1693                 goto found;
1694
1695         if (r > 0 && idx)
1696                 (*idx) ++;
1697
1698         return r;
1699
1700 found:
1701         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1702         if (r < 0)
1703                 return r;
1704
1705         if (ret)
1706                 *ret = o;
1707
1708         if (offset)
1709                 *offset = extra;
1710
1711         if (idx)
1712                 *idx = 0;
1713
1714         return 1;
1715 }
1716
1717 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1718         assert(f);
1719         assert(p > 0);
1720
1721         if (p == needle)
1722                 return TEST_FOUND;
1723         else if (p < needle)
1724                 return TEST_LEFT;
1725         else
1726                 return TEST_RIGHT;
1727 }
1728
1729 int journal_file_move_to_entry_by_offset(
1730                 JournalFile *f,
1731                 uint64_t p,
1732                 direction_t direction,
1733                 Object **ret,
1734                 uint64_t *offset) {
1735
1736         return generic_array_bisect(f,
1737                                     le64toh(f->header->entry_array_offset),
1738                                     le64toh(f->header->n_entries),
1739                                     p,
1740                                     test_object_offset,
1741                                     direction,
1742                                     ret, offset, NULL);
1743 }
1744
1745
1746 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1747         Object *o;
1748         int r;
1749
1750         assert(f);
1751         assert(p > 0);
1752
1753         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1754         if (r < 0)
1755                 return r;
1756
1757         if (le64toh(o->entry.seqnum) == needle)
1758                 return TEST_FOUND;
1759         else if (le64toh(o->entry.seqnum) < needle)
1760                 return TEST_LEFT;
1761         else
1762                 return TEST_RIGHT;
1763 }
1764
1765 int journal_file_move_to_entry_by_seqnum(
1766                 JournalFile *f,
1767                 uint64_t seqnum,
1768                 direction_t direction,
1769                 Object **ret,
1770                 uint64_t *offset) {
1771
1772         return generic_array_bisect(f,
1773                                     le64toh(f->header->entry_array_offset),
1774                                     le64toh(f->header->n_entries),
1775                                     seqnum,
1776                                     test_object_seqnum,
1777                                     direction,
1778                                     ret, offset, NULL);
1779 }
1780
1781 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1782         Object *o;
1783         int r;
1784
1785         assert(f);
1786         assert(p > 0);
1787
1788         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1789         if (r < 0)
1790                 return r;
1791
1792         if (le64toh(o->entry.realtime) == needle)
1793                 return TEST_FOUND;
1794         else if (le64toh(o->entry.realtime) < needle)
1795                 return TEST_LEFT;
1796         else
1797                 return TEST_RIGHT;
1798 }
1799
1800 int journal_file_move_to_entry_by_realtime(
1801                 JournalFile *f,
1802                 uint64_t realtime,
1803                 direction_t direction,
1804                 Object **ret,
1805                 uint64_t *offset) {
1806
1807         return generic_array_bisect(f,
1808                                     le64toh(f->header->entry_array_offset),
1809                                     le64toh(f->header->n_entries),
1810                                     realtime,
1811                                     test_object_realtime,
1812                                     direction,
1813                                     ret, offset, NULL);
1814 }
1815
1816 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1817         Object *o;
1818         int r;
1819
1820         assert(f);
1821         assert(p > 0);
1822
1823         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1824         if (r < 0)
1825                 return r;
1826
1827         if (le64toh(o->entry.monotonic) == needle)
1828                 return TEST_FOUND;
1829         else if (le64toh(o->entry.monotonic) < needle)
1830                 return TEST_LEFT;
1831         else
1832                 return TEST_RIGHT;
1833 }
1834
1835 static inline int find_data_object_by_boot_id(
1836                 JournalFile *f,
1837                 sd_id128_t boot_id,
1838                 Object **o,
1839                 uint64_t *b) {
1840         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1841
1842         sd_id128_to_string(boot_id, t + 9);
1843         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1844 }
1845
1846 int journal_file_move_to_entry_by_monotonic(
1847                 JournalFile *f,
1848                 sd_id128_t boot_id,
1849                 uint64_t monotonic,
1850                 direction_t direction,
1851                 Object **ret,
1852                 uint64_t *offset) {
1853
1854         Object *o;
1855         int r;
1856
1857         assert(f);
1858
1859         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1860         if (r < 0)
1861                 return r;
1862         if (r == 0)
1863                 return -ENOENT;
1864
1865         return generic_array_bisect_plus_one(f,
1866                                              le64toh(o->data.entry_offset),
1867                                              le64toh(o->data.entry_array_offset),
1868                                              le64toh(o->data.n_entries),
1869                                              monotonic,
1870                                              test_object_monotonic,
1871                                              direction,
1872                                              ret, offset, NULL);
1873 }
1874
1875 int journal_file_next_entry(
1876                 JournalFile *f,
1877                 Object *o, uint64_t p,
1878                 direction_t direction,
1879                 Object **ret, uint64_t *offset) {
1880
1881         uint64_t i, n;
1882         int r;
1883
1884         assert(f);
1885         assert(p > 0 || !o);
1886
1887         n = le64toh(f->header->n_entries);
1888         if (n <= 0)
1889                 return 0;
1890
1891         if (!o)
1892                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1893         else {
1894                 if (o->object.type != OBJECT_ENTRY)
1895                         return -EINVAL;
1896
1897                 r = generic_array_bisect(f,
1898                                          le64toh(f->header->entry_array_offset),
1899                                          le64toh(f->header->n_entries),
1900                                          p,
1901                                          test_object_offset,
1902                                          DIRECTION_DOWN,
1903                                          NULL, NULL,
1904                                          &i);
1905                 if (r <= 0)
1906                         return r;
1907
1908                 if (direction == DIRECTION_DOWN) {
1909                         if (i >= n - 1)
1910                                 return 0;
1911
1912                         i++;
1913                 } else {
1914                         if (i <= 0)
1915                                 return 0;
1916
1917                         i--;
1918                 }
1919         }
1920
1921         /* And jump to it */
1922         return generic_array_get(f,
1923                                  le64toh(f->header->entry_array_offset),
1924                                  i,
1925                                  ret, offset);
1926 }
1927
1928 int journal_file_skip_entry(
1929                 JournalFile *f,
1930                 Object *o, uint64_t p,
1931                 int64_t skip,
1932                 Object **ret, uint64_t *offset) {
1933
1934         uint64_t i, n;
1935         int r;
1936
1937         assert(f);
1938         assert(o);
1939         assert(p > 0);
1940
1941         if (o->object.type != OBJECT_ENTRY)
1942                 return -EINVAL;
1943
1944         r = generic_array_bisect(f,
1945                                  le64toh(f->header->entry_array_offset),
1946                                  le64toh(f->header->n_entries),
1947                                  p,
1948                                  test_object_offset,
1949                                  DIRECTION_DOWN,
1950                                  NULL, NULL,
1951                                  &i);
1952         if (r <= 0)
1953                 return r;
1954
1955         /* Calculate new index */
1956         if (skip < 0) {
1957                 if ((uint64_t) -skip >= i)
1958                         i = 0;
1959                 else
1960                         i = i - (uint64_t) -skip;
1961         } else
1962                 i  += (uint64_t) skip;
1963
1964         n = le64toh(f->header->n_entries);
1965         if (n <= 0)
1966                 return -EBADMSG;
1967
1968         if (i >= n)
1969                 i = n-1;
1970
1971         return generic_array_get(f,
1972                                  le64toh(f->header->entry_array_offset),
1973                                  i,
1974                                  ret, offset);
1975 }
1976
1977 int journal_file_next_entry_for_data(
1978                 JournalFile *f,
1979                 Object *o, uint64_t p,
1980                 uint64_t data_offset,
1981                 direction_t direction,
1982                 Object **ret, uint64_t *offset) {
1983
1984         uint64_t n, i;
1985         int r;
1986         Object *d;
1987
1988         assert(f);
1989         assert(p > 0 || !o);
1990
1991         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1992         if (r < 0)
1993                 return r;
1994
1995         n = le64toh(d->data.n_entries);
1996         if (n <= 0)
1997                 return n;
1998
1999         if (!o)
2000                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2001         else {
2002                 if (o->object.type != OBJECT_ENTRY)
2003                         return -EINVAL;
2004
2005                 r = generic_array_bisect_plus_one(f,
2006                                                   le64toh(d->data.entry_offset),
2007                                                   le64toh(d->data.entry_array_offset),
2008                                                   le64toh(d->data.n_entries),
2009                                                   p,
2010                                                   test_object_offset,
2011                                                   DIRECTION_DOWN,
2012                                                   NULL, NULL,
2013                                                   &i);
2014
2015                 if (r <= 0)
2016                         return r;
2017
2018                 if (direction == DIRECTION_DOWN) {
2019                         if (i >= n - 1)
2020                                 return 0;
2021
2022                         i++;
2023                 } else {
2024                         if (i <= 0)
2025                                 return 0;
2026
2027                         i--;
2028                 }
2029
2030         }
2031
2032         return generic_array_get_plus_one(f,
2033                                           le64toh(d->data.entry_offset),
2034                                           le64toh(d->data.entry_array_offset),
2035                                           i,
2036                                           ret, offset);
2037 }
2038
2039 int journal_file_move_to_entry_by_offset_for_data(
2040                 JournalFile *f,
2041                 uint64_t data_offset,
2042                 uint64_t p,
2043                 direction_t direction,
2044                 Object **ret, uint64_t *offset) {
2045
2046         int r;
2047         Object *d;
2048
2049         assert(f);
2050
2051         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2052         if (r < 0)
2053                 return r;
2054
2055         return generic_array_bisect_plus_one(f,
2056                                              le64toh(d->data.entry_offset),
2057                                              le64toh(d->data.entry_array_offset),
2058                                              le64toh(d->data.n_entries),
2059                                              p,
2060                                              test_object_offset,
2061                                              direction,
2062                                              ret, offset, NULL);
2063 }
2064
2065 int journal_file_move_to_entry_by_monotonic_for_data(
2066                 JournalFile *f,
2067                 uint64_t data_offset,
2068                 sd_id128_t boot_id,
2069                 uint64_t monotonic,
2070                 direction_t direction,
2071                 Object **ret, uint64_t *offset) {
2072
2073         Object *o, *d;
2074         int r;
2075         uint64_t b, z;
2076
2077         assert(f);
2078
2079         /* First, seek by time */
2080         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2081         if (r < 0)
2082                 return r;
2083         if (r == 0)
2084                 return -ENOENT;
2085
2086         r = generic_array_bisect_plus_one(f,
2087                                           le64toh(o->data.entry_offset),
2088                                           le64toh(o->data.entry_array_offset),
2089                                           le64toh(o->data.n_entries),
2090                                           monotonic,
2091                                           test_object_monotonic,
2092                                           direction,
2093                                           NULL, &z, NULL);
2094         if (r <= 0)
2095                 return r;
2096
2097         /* And now, continue seeking until we find an entry that
2098          * exists in both bisection arrays */
2099
2100         for (;;) {
2101                 Object *qo;
2102                 uint64_t p, q;
2103
2104                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2105                 if (r < 0)
2106                         return r;
2107
2108                 r = generic_array_bisect_plus_one(f,
2109                                                   le64toh(d->data.entry_offset),
2110                                                   le64toh(d->data.entry_array_offset),
2111                                                   le64toh(d->data.n_entries),
2112                                                   z,
2113                                                   test_object_offset,
2114                                                   direction,
2115                                                   NULL, &p, NULL);
2116                 if (r <= 0)
2117                         return r;
2118
2119                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2120                 if (r < 0)
2121                         return r;
2122
2123                 r = generic_array_bisect_plus_one(f,
2124                                                   le64toh(o->data.entry_offset),
2125                                                   le64toh(o->data.entry_array_offset),
2126                                                   le64toh(o->data.n_entries),
2127                                                   p,
2128                                                   test_object_offset,
2129                                                   direction,
2130                                                   &qo, &q, NULL);
2131
2132                 if (r <= 0)
2133                         return r;
2134
2135                 if (p == q) {
2136                         if (ret)
2137                                 *ret = qo;
2138                         if (offset)
2139                                 *offset = q;
2140
2141                         return 1;
2142                 }
2143
2144                 z = q;
2145         }
2146
2147         return 0;
2148 }
2149
2150 int journal_file_move_to_entry_by_seqnum_for_data(
2151                 JournalFile *f,
2152                 uint64_t data_offset,
2153                 uint64_t seqnum,
2154                 direction_t direction,
2155                 Object **ret, uint64_t *offset) {
2156
2157         Object *d;
2158         int r;
2159
2160         assert(f);
2161
2162         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2163         if (r < 0)
2164                 return r;
2165
2166         return generic_array_bisect_plus_one(f,
2167                                              le64toh(d->data.entry_offset),
2168                                              le64toh(d->data.entry_array_offset),
2169                                              le64toh(d->data.n_entries),
2170                                              seqnum,
2171                                              test_object_seqnum,
2172                                              direction,
2173                                              ret, offset, NULL);
2174 }
2175
2176 int journal_file_move_to_entry_by_realtime_for_data(
2177                 JournalFile *f,
2178                 uint64_t data_offset,
2179                 uint64_t realtime,
2180                 direction_t direction,
2181                 Object **ret, uint64_t *offset) {
2182
2183         Object *d;
2184         int r;
2185
2186         assert(f);
2187
2188         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2189         if (r < 0)
2190                 return r;
2191
2192         return generic_array_bisect_plus_one(f,
2193                                              le64toh(d->data.entry_offset),
2194                                              le64toh(d->data.entry_array_offset),
2195                                              le64toh(d->data.n_entries),
2196                                              realtime,
2197                                              test_object_realtime,
2198                                              direction,
2199                                              ret, offset, NULL);
2200 }
2201
2202 void journal_file_dump(JournalFile *f) {
2203         Object *o;
2204         int r;
2205         uint64_t p;
2206
2207         assert(f);
2208
2209         journal_file_print_header(f);
2210
2211         p = le64toh(f->header->header_size);
2212         while (p != 0) {
2213                 r = journal_file_move_to_object(f, -1, p, &o);
2214                 if (r < 0)
2215                         goto fail;
2216
2217                 switch (o->object.type) {
2218
2219                 case OBJECT_UNUSED:
2220                         printf("Type: OBJECT_UNUSED\n");
2221                         break;
2222
2223                 case OBJECT_DATA:
2224                         printf("Type: OBJECT_DATA\n");
2225                         break;
2226
2227                 case OBJECT_FIELD:
2228                         printf("Type: OBJECT_FIELD\n");
2229                         break;
2230
2231                 case OBJECT_ENTRY:
2232                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2233                                le64toh(o->entry.seqnum),
2234                                le64toh(o->entry.monotonic),
2235                                le64toh(o->entry.realtime));
2236                         break;
2237
2238                 case OBJECT_FIELD_HASH_TABLE:
2239                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2240                         break;
2241
2242                 case OBJECT_DATA_HASH_TABLE:
2243                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2244                         break;
2245
2246                 case OBJECT_ENTRY_ARRAY:
2247                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2248                         break;
2249
2250                 case OBJECT_TAG:
2251                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2252                                le64toh(o->tag.seqnum),
2253                                le64toh(o->tag.epoch));
2254                         break;
2255
2256                 default:
2257                         printf("Type: unknown (%u)\n", o->object.type);
2258                         break;
2259                 }
2260
2261                 if (o->object.flags & OBJECT_COMPRESSED)
2262                         printf("Flags: COMPRESSED\n");
2263
2264                 if (p == le64toh(f->header->tail_object_offset))
2265                         p = 0;
2266                 else
2267                         p = p + ALIGN64(le64toh(o->object.size));
2268         }
2269
2270         return;
2271 fail:
2272         log_error("File corrupt");
2273 }
2274
2275 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2276         const char *x;
2277
2278         x = format_timestamp(buf, l, t);
2279         if (x)
2280                 return x;
2281         return " --- ";
2282 }
2283
2284 void journal_file_print_header(JournalFile *f) {
2285         char a[33], b[33], c[33], d[33];
2286         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2287         struct stat st;
2288         char bytes[FORMAT_BYTES_MAX];
2289
2290         assert(f);
2291
2292         printf("File Path: %s\n"
2293                "File ID: %s\n"
2294                "Machine ID: %s\n"
2295                "Boot ID: %s\n"
2296                "Sequential Number ID: %s\n"
2297                "State: %s\n"
2298                "Compatible Flags:%s%s\n"
2299                "Incompatible Flags:%s%s\n"
2300                "Header size: %"PRIu64"\n"
2301                "Arena size: %"PRIu64"\n"
2302                "Data Hash Table Size: %"PRIu64"\n"
2303                "Field Hash Table Size: %"PRIu64"\n"
2304                "Rotate Suggested: %s\n"
2305                "Head Sequential Number: %"PRIu64"\n"
2306                "Tail Sequential Number: %"PRIu64"\n"
2307                "Head Realtime Timestamp: %s\n"
2308                "Tail Realtime Timestamp: %s\n"
2309                "Tail Monotonic Timestamp: %s\n"
2310                "Objects: %"PRIu64"\n"
2311                "Entry Objects: %"PRIu64"\n",
2312                f->path,
2313                sd_id128_to_string(f->header->file_id, a),
2314                sd_id128_to_string(f->header->machine_id, b),
2315                sd_id128_to_string(f->header->boot_id, c),
2316                sd_id128_to_string(f->header->seqnum_id, d),
2317                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2318                f->header->state == STATE_ONLINE ? "ONLINE" :
2319                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2320                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2321                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2322                JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2323                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2324                le64toh(f->header->header_size),
2325                le64toh(f->header->arena_size),
2326                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2327                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2328                yes_no(journal_file_rotate_suggested(f, 0)),
2329                le64toh(f->header->head_entry_seqnum),
2330                le64toh(f->header->tail_entry_seqnum),
2331                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2332                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2333                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2334                le64toh(f->header->n_objects),
2335                le64toh(f->header->n_entries));
2336
2337         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2338                 printf("Data Objects: %"PRIu64"\n"
2339                        "Data Hash Table Fill: %.1f%%\n",
2340                        le64toh(f->header->n_data),
2341                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2342
2343         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2344                 printf("Field Objects: %"PRIu64"\n"
2345                        "Field Hash Table Fill: %.1f%%\n",
2346                        le64toh(f->header->n_fields),
2347                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2348
2349         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2350                 printf("Tag Objects: %"PRIu64"\n",
2351                        le64toh(f->header->n_tags));
2352         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2353                 printf("Entry Array Objects: %"PRIu64"\n",
2354                        le64toh(f->header->n_entry_arrays));
2355
2356         if (fstat(f->fd, &st) >= 0)
2357                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2358 }
2359
2360 int journal_file_open(
2361                 const char *fname,
2362                 int flags,
2363                 mode_t mode,
2364                 bool compress,
2365                 bool seal,
2366                 JournalMetrics *metrics,
2367                 MMapCache *mmap_cache,
2368                 JournalFile *template,
2369                 JournalFile **ret) {
2370
2371         JournalFile *f;
2372         int r;
2373         bool newly_created = false;
2374
2375         assert(fname);
2376         assert(ret);
2377
2378         if ((flags & O_ACCMODE) != O_RDONLY &&
2379             (flags & O_ACCMODE) != O_RDWR)
2380                 return -EINVAL;
2381
2382         if (!endswith(fname, ".journal") &&
2383             !endswith(fname, ".journal~"))
2384                 return -EINVAL;
2385
2386         f = new0(JournalFile, 1);
2387         if (!f)
2388                 return -ENOMEM;
2389
2390         f->fd = -1;
2391         f->mode = mode;
2392
2393         f->flags = flags;
2394         f->prot = prot_from_flags(flags);
2395         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2396 #ifdef HAVE_XZ
2397         f->compress = compress;
2398 #endif
2399 #ifdef HAVE_GCRYPT
2400         f->seal = seal;
2401 #endif
2402
2403         if (mmap_cache)
2404                 f->mmap = mmap_cache_ref(mmap_cache);
2405         else {
2406                 f->mmap = mmap_cache_new();
2407                 if (!f->mmap) {
2408                         r = -ENOMEM;
2409                         goto fail;
2410                 }
2411         }
2412
2413         f->path = strdup(fname);
2414         if (!f->path) {
2415                 r = -ENOMEM;
2416                 goto fail;
2417         }
2418
2419         f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2420         if (!f->chain_cache) {
2421                 r = -ENOMEM;
2422                 goto fail;
2423         }
2424
2425         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2426         if (f->fd < 0) {
2427                 r = -errno;
2428                 goto fail;
2429         }
2430
2431         if (fstat(f->fd, &f->last_stat) < 0) {
2432                 r = -errno;
2433                 goto fail;
2434         }
2435
2436         if (f->last_stat.st_size == 0 && f->writable) {
2437 #ifdef HAVE_XATTR
2438                 uint64_t crtime;
2439
2440                 /* Let's attach the creation time to the journal file,
2441                  * so that the vacuuming code knows the age of this
2442                  * file even if the file might end up corrupted one
2443                  * day... Ideally we'd just use the creation time many
2444                  * file systems maintain for each file, but there is
2445                  * currently no usable API to query this, hence let's
2446                  * emulate this via extended attributes. If extended
2447                  * attributes are not supported we'll just skip this,
2448                  * and rely solely on mtime/atime/ctime of the file.*/
2449
2450                 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2451                 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2452 #endif
2453
2454 #ifdef HAVE_GCRYPT
2455                 /* Try to load the FSPRG state, and if we can't, then
2456                  * just don't do sealing */
2457                 if (f->seal) {
2458                         r = journal_file_fss_load(f);
2459                         if (r < 0)
2460                                 f->seal = false;
2461                 }
2462 #endif
2463
2464                 r = journal_file_init_header(f, template);
2465                 if (r < 0)
2466                         goto fail;
2467
2468                 if (fstat(f->fd, &f->last_stat) < 0) {
2469                         r = -errno;
2470                         goto fail;
2471                 }
2472
2473                 newly_created = true;
2474         }
2475
2476         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2477                 r = -EIO;
2478                 goto fail;
2479         }
2480
2481         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2482         if (f->header == MAP_FAILED) {
2483                 f->header = NULL;
2484                 r = -errno;
2485                 goto fail;
2486         }
2487
2488         if (!newly_created) {
2489                 r = journal_file_verify_header(f);
2490                 if (r < 0)
2491                         goto fail;
2492         }
2493
2494 #ifdef HAVE_GCRYPT
2495         if (!newly_created && f->writable) {
2496                 r = journal_file_fss_load(f);
2497                 if (r < 0)
2498                         goto fail;
2499         }
2500 #endif
2501
2502         if (f->writable) {
2503                 if (metrics) {
2504                         journal_default_metrics(metrics, f->fd);
2505                         f->metrics = *metrics;
2506                 } else if (template)
2507                         f->metrics = template->metrics;
2508
2509                 r = journal_file_refresh_header(f);
2510                 if (r < 0)
2511                         goto fail;
2512         }
2513
2514 #ifdef HAVE_GCRYPT
2515         r = journal_file_hmac_setup(f);
2516         if (r < 0)
2517                 goto fail;
2518 #endif
2519
2520         if (newly_created) {
2521                 r = journal_file_setup_field_hash_table(f);
2522                 if (r < 0)
2523                         goto fail;
2524
2525                 r = journal_file_setup_data_hash_table(f);
2526                 if (r < 0)
2527                         goto fail;
2528
2529 #ifdef HAVE_GCRYPT
2530                 r = journal_file_append_first_tag(f);
2531                 if (r < 0)
2532                         goto fail;
2533 #endif
2534         }
2535
2536         r = journal_file_map_field_hash_table(f);
2537         if (r < 0)
2538                 goto fail;
2539
2540         r = journal_file_map_data_hash_table(f);
2541         if (r < 0)
2542                 goto fail;
2543
2544         *ret = f;
2545         return 0;
2546
2547 fail:
2548         journal_file_close(f);
2549
2550         return r;
2551 }
2552
2553 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2554         _cleanup_free_ char *p = NULL;
2555         size_t l;
2556         JournalFile *old_file, *new_file = NULL;
2557         int r;
2558
2559         assert(f);
2560         assert(*f);
2561
2562         old_file = *f;
2563
2564         if (!old_file->writable)
2565                 return -EINVAL;
2566
2567         if (!endswith(old_file->path, ".journal"))
2568                 return -EINVAL;
2569
2570         l = strlen(old_file->path);
2571         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2572                      (int) l - 8, old_file->path,
2573                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2574                      le64toh((*f)->header->head_entry_seqnum),
2575                      le64toh((*f)->header->head_entry_realtime));
2576         if (r < 0)
2577                 return -ENOMEM;
2578
2579         r = rename(old_file->path, p);
2580         if (r < 0)
2581                 return -errno;
2582
2583         old_file->header->state = STATE_ARCHIVED;
2584
2585         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2586         journal_file_close(old_file);
2587
2588         *f = new_file;
2589         return r;
2590 }
2591
2592 int journal_file_open_reliably(
2593                 const char *fname,
2594                 int flags,
2595                 mode_t mode,
2596                 bool compress,
2597                 bool seal,
2598                 JournalMetrics *metrics,
2599                 MMapCache *mmap_cache,
2600                 JournalFile *template,
2601                 JournalFile **ret) {
2602
2603         int r;
2604         size_t l;
2605         _cleanup_free_ char *p = NULL;
2606
2607         r = journal_file_open(fname, flags, mode, compress, seal,
2608                               metrics, mmap_cache, template, ret);
2609         if (r != -EBADMSG && /* corrupted */
2610             r != -ENODATA && /* truncated */
2611             r != -EHOSTDOWN && /* other machine */
2612             r != -EPROTONOSUPPORT && /* incompatible feature */
2613             r != -EBUSY && /* unclean shutdown */
2614             r != -ESHUTDOWN /* already archived */)
2615                 return r;
2616
2617         if ((flags & O_ACCMODE) == O_RDONLY)
2618                 return r;
2619
2620         if (!(flags & O_CREAT))
2621                 return r;
2622
2623         if (!endswith(fname, ".journal"))
2624                 return r;
2625
2626         /* The file is corrupted. Rotate it away and try it again (but only once) */
2627
2628         l = strlen(fname);
2629         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2630                      (int) l - 8, fname,
2631                      (unsigned long long) now(CLOCK_REALTIME),
2632                      random_ull()) < 0)
2633                 return -ENOMEM;
2634
2635         r = rename(fname, p);
2636         if (r < 0)
2637                 return -errno;
2638
2639         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2640
2641         return journal_file_open(fname, flags, mode, compress, seal,
2642                                  metrics, mmap_cache, template, ret);
2643 }
2644
2645 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2646         uint64_t i, n;
2647         uint64_t q, xor_hash = 0;
2648         int r;
2649         EntryItem *items;
2650         dual_timestamp ts;
2651
2652         assert(from);
2653         assert(to);
2654         assert(o);
2655         assert(p);
2656
2657         if (!to->writable)
2658                 return -EPERM;
2659
2660         ts.monotonic = le64toh(o->entry.monotonic);
2661         ts.realtime = le64toh(o->entry.realtime);
2662
2663         if (to->tail_entry_monotonic_valid &&
2664             ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2665                 return -EINVAL;
2666
2667         n = journal_file_entry_n_items(o);
2668         items = alloca(sizeof(EntryItem) * n);
2669
2670         for (i = 0; i < n; i++) {
2671                 uint64_t l, h;
2672                 le64_t le_hash;
2673                 size_t t;
2674                 void *data;
2675                 Object *u;
2676
2677                 q = le64toh(o->entry.items[i].object_offset);
2678                 le_hash = o->entry.items[i].hash;
2679
2680                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2681                 if (r < 0)
2682                         return r;
2683
2684                 if (le_hash != o->data.hash)
2685                         return -EBADMSG;
2686
2687                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2688                 t = (size_t) l;
2689
2690                 /* We hit the limit on 32bit machines */
2691                 if ((uint64_t) t != l)
2692                         return -E2BIG;
2693
2694                 if (o->object.flags & OBJECT_COMPRESSED) {
2695 #ifdef HAVE_XZ
2696                         uint64_t rsize;
2697
2698                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0))
2699                                 return -EBADMSG;
2700
2701                         data = from->compress_buffer;
2702                         l = rsize;
2703 #else
2704                         return -EPROTONOSUPPORT;
2705 #endif
2706                 } else
2707                         data = o->data.payload;
2708
2709                 r = journal_file_append_data(to, data, l, &u, &h);
2710                 if (r < 0)
2711                         return r;
2712
2713                 xor_hash ^= le64toh(u->data.hash);
2714                 items[i].object_offset = htole64(h);
2715                 items[i].hash = u->data.hash;
2716
2717                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2718                 if (r < 0)
2719                         return r;
2720         }
2721
2722         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2723 }
2724
2725 void journal_default_metrics(JournalMetrics *m, int fd) {
2726         uint64_t fs_size = 0;
2727         struct statvfs ss;
2728         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2729
2730         assert(m);
2731         assert(fd >= 0);
2732
2733         if (fstatvfs(fd, &ss) >= 0)
2734                 fs_size = ss.f_frsize * ss.f_blocks;
2735
2736         if (m->max_use == (uint64_t) -1) {
2737
2738                 if (fs_size > 0) {
2739                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2740
2741                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2742                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2743
2744                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2745                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2746                 } else
2747                         m->max_use = DEFAULT_MAX_USE_LOWER;
2748         } else {
2749                 m->max_use = PAGE_ALIGN(m->max_use);
2750
2751                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2752                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2753         }
2754
2755         if (m->max_size == (uint64_t) -1) {
2756                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2757
2758                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2759                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2760         } else
2761                 m->max_size = PAGE_ALIGN(m->max_size);
2762
2763         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2764                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2765
2766         if (m->max_size*2 > m->max_use)
2767                 m->max_use = m->max_size*2;
2768
2769         if (m->min_size == (uint64_t) -1)
2770                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2771         else {
2772                 m->min_size = PAGE_ALIGN(m->min_size);
2773
2774                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2775                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2776
2777                 if (m->min_size > m->max_size)
2778                         m->max_size = m->min_size;
2779         }
2780
2781         if (m->keep_free == (uint64_t) -1) {
2782
2783                 if (fs_size > 0) {
2784                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2785
2786                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2787                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2788
2789                 } else
2790                         m->keep_free = DEFAULT_KEEP_FREE;
2791         }
2792
2793         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2794                   format_bytes(a, sizeof(a), m->max_use),
2795                   format_bytes(b, sizeof(b), m->max_size),
2796                   format_bytes(c, sizeof(c), m->min_size),
2797                   format_bytes(d, sizeof(d), m->keep_free));
2798 }
2799
2800 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2801         assert(f);
2802         assert(from || to);
2803
2804         if (from) {
2805                 if (f->header->head_entry_realtime == 0)
2806                         return -ENOENT;
2807
2808                 *from = le64toh(f->header->head_entry_realtime);
2809         }
2810
2811         if (to) {
2812                 if (f->header->tail_entry_realtime == 0)
2813                         return -ENOENT;
2814
2815                 *to = le64toh(f->header->tail_entry_realtime);
2816         }
2817
2818         return 1;
2819 }
2820
2821 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2822         Object *o;
2823         uint64_t p;
2824         int r;
2825
2826         assert(f);
2827         assert(from || to);
2828
2829         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2830         if (r <= 0)
2831                 return r;
2832
2833         if (le64toh(o->data.n_entries) <= 0)
2834                 return 0;
2835
2836         if (from) {
2837                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2838                 if (r < 0)
2839                         return r;
2840
2841                 *from = le64toh(o->entry.monotonic);
2842         }
2843
2844         if (to) {
2845                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2846                 if (r < 0)
2847                         return r;
2848
2849                 r = generic_array_get_plus_one(f,
2850                                                le64toh(o->data.entry_offset),
2851                                                le64toh(o->data.entry_array_offset),
2852                                                le64toh(o->data.n_entries)-1,
2853                                                &o, NULL);
2854                 if (r <= 0)
2855                         return r;
2856
2857                 *to = le64toh(o->entry.monotonic);
2858         }
2859
2860         return 1;
2861 }
2862
2863 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2864         assert(f);
2865
2866         /* If we gained new header fields we gained new features,
2867          * hence suggest a rotation */
2868         if (le64toh(f->header->header_size) < sizeof(Header)) {
2869                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2870                 return true;
2871         }
2872
2873         /* Let's check if the hash tables grew over a certain fill
2874          * level (75%, borrowing this value from Java's hash table
2875          * implementation), and if so suggest a rotation. To calculate
2876          * the fill level we need the n_data field, which only exists
2877          * in newer versions. */
2878
2879         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2880                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2881                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2882                                   f->path,
2883                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2884                                   le64toh(f->header->n_data),
2885                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2886                                   (unsigned long long) f->last_stat.st_size,
2887                                   f->last_stat.st_size / le64toh(f->header->n_data));
2888                         return true;
2889                 }
2890
2891         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2892                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2893                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2894                                   f->path,
2895                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2896                                   le64toh(f->header->n_fields),
2897                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2898                         return true;
2899                 }
2900
2901         /* Are the data objects properly indexed by field objects? */
2902         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2903             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2904             le64toh(f->header->n_data) > 0 &&
2905             le64toh(f->header->n_fields) == 0)
2906                 return true;
2907
2908         if (max_file_usec > 0) {
2909                 usec_t t, h;
2910
2911                 h = le64toh(f->header->head_entry_realtime);
2912                 t = now(CLOCK_REALTIME);
2913
2914                 if (h > 0 && t > h + max_file_usec)
2915                         return true;
2916         }
2917
2918         return false;
2919 }