chiark / gitweb /
event: add ability to change fd of an active event source
[elogind.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #ifdef HAVE_XATTR
31 #include <attr/xattr.h>
32 #endif
33
34 #include "journal-def.h"
35 #include "journal-file.h"
36 #include "journal-authenticate.h"
37 #include "lookup3.h"
38 #include "compress.h"
39 #include "fsprg.h"
40
/* Default sizes, in bytes, of the data and field hash tables
 * (a number of buckets times the size of one HashItem) */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Data payloads of at least this many bytes are candidates for
 * compression when compression is enabled for the file */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL)           /* 4 MiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the file
 * system size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */

/* n_data was the first entry we added after the initial file format
 * design, hence a valid header is at least large enough to reach it */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20

/* How much to increase the journal file size at once each time we
 * allocate something new: allocations are rounded up to a multiple
 * of this value. */
#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8 MiB */
73
74 static int journal_file_set_online(JournalFile *f) {
75         assert(f);
76
77         if (!f->writable)
78                 return -EPERM;
79
80         if (!(f->fd >= 0 && f->header))
81                 return -EINVAL;
82
83         switch(f->header->state) {
84                 case STATE_ONLINE:
85                         return 0;
86
87                 case STATE_OFFLINE:
88                         f->header->state = STATE_ONLINE;
89                         fsync(f->fd);
90                         return 0;
91
92                 default:
93                         return -EINVAL;
94         }
95 }
96
97 int journal_file_set_offline(JournalFile *f) {
98         assert(f);
99
100         if (!f->writable)
101                 return -EPERM;
102
103         if (!(f->fd >= 0 && f->header))
104                 return -EINVAL;
105
106         if (f->header->state != STATE_ONLINE)
107                 return 0;
108
109         fsync(f->fd);
110
111         f->header->state = STATE_OFFLINE;
112
113         fsync(f->fd);
114
115         return 0;
116 }
117
118 void journal_file_close(JournalFile *f) {
119         assert(f);
120
121 #ifdef HAVE_GCRYPT
122         /* Write the final tag */
123         if (f->seal && f->writable)
124                 journal_file_append_tag(f);
125 #endif
126
127         /* Sync everything to disk, before we mark the file offline */
128         if (f->mmap && f->fd >= 0)
129                 mmap_cache_close_fd(f->mmap, f->fd);
130
131         journal_file_set_offline(f);
132
133         if (f->header)
134                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
135
136         if (f->fd >= 0)
137                 close_nointr_nofail(f->fd);
138
139         free(f->path);
140
141         if (f->mmap)
142                 mmap_cache_unref(f->mmap);
143
144         hashmap_free_free(f->chain_cache);
145
146 #ifdef HAVE_XZ
147         free(f->compress_buffer);
148 #endif
149
150 #ifdef HAVE_GCRYPT
151         if (f->fss_file)
152                 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
153         else if (f->fsprg_state)
154                 free(f->fsprg_state);
155
156         free(f->fsprg_seed);
157
158         if (f->hmac)
159                 gcry_md_close(f->hmac);
160 #endif
161
162         free(f);
163 }
164
165 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
166         Header h;
167         ssize_t k;
168         int r;
169
170         assert(f);
171
172         zero(h);
173         memcpy(h.signature, HEADER_SIGNATURE, 8);
174         h.header_size = htole64(ALIGN64(sizeof(h)));
175
176         h.incompatible_flags =
177                 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
178
179         h.compatible_flags =
180                 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
181
182         r = sd_id128_randomize(&h.file_id);
183         if (r < 0)
184                 return r;
185
186         if (template) {
187                 h.seqnum_id = template->header->seqnum_id;
188                 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
189         } else
190                 h.seqnum_id = h.file_id;
191
192         k = pwrite(f->fd, &h, sizeof(h), 0);
193         if (k < 0)
194                 return -errno;
195
196         if (k != sizeof(h))
197                 return -EIO;
198
199         return 0;
200 }
201
202 static int journal_file_refresh_header(JournalFile *f) {
203         int r;
204         sd_id128_t boot_id;
205
206         assert(f);
207
208         r = sd_id128_get_machine(&f->header->machine_id);
209         if (r < 0)
210                 return r;
211
212         r = sd_id128_get_boot(&boot_id);
213         if (r < 0)
214                 return r;
215
216         if (sd_id128_equal(boot_id, f->header->boot_id))
217                 f->tail_entry_monotonic_valid = true;
218
219         f->header->boot_id = boot_id;
220
221         journal_file_set_online(f);
222
223         /* Sync the online state to disk */
224         fsync(f->fd);
225
226         return 0;
227 }
228
/* Validates the mapped header of a journal file and, on success,
 * copies the compression and sealing flags from the header into
 * f->compress and f->seal.
 *
 * Returns 0 if the header is acceptable, otherwise:
 *   -EBADMSG          wrong signature, state out of range, header too
 *                     small, or sealed without the n_entry_arrays field
 *   -EPROTONOSUPPORT  header flags this build does not understand
 *   -ENODATA          sizes/offsets inconsistent with each other or
 *                     with the file size from f->last_stat
 *   -EHOSTDOWN        (writable only) file belongs to another machine
 *   -EBUSY            (writable only) file is online or in an unknown state
 *   -ESHUTDOWN        (writable only) file is archived
 * or the error returned by sd_id128_get_machine(). */
static int journal_file_verify_header(JournalFile *f) {
        assert(f);

        if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
                return -EBADMSG;

        /* In both read and write mode we refuse to open files with
         * incompatible flags we don't know */
#ifdef HAVE_XZ
        if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
                return -EPROTONOSUPPORT;
#else
        if (f->header->incompatible_flags != 0)
                return -EPROTONOSUPPORT;
#endif

        /* When open for writing we refuse to open files with
         * compatible flags we don't know, too */
        if (f->writable) {
#ifdef HAVE_GCRYPT
                if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
                        return -EPROTONOSUPPORT;
#else
                if (f->header->compatible_flags != 0)
                        return -EPROTONOSUPPORT;
#endif
        }

        if (f->header->state >= _STATE_MAX)
                return -EBADMSG;

        /* The first addition was n_data, so check that we are at least this large */
        if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
                return -EBADMSG;

        /* A sealed file must have a header that includes n_entry_arrays */
        if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
                return -EBADMSG;

        /* Header plus arena must fit into the file as last stat'ed */
        if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
                return -ENODATA;

        /* The tail object must not lie beyond the end of the arena */
        if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
                return -ENODATA;

        /* All offsets stored in the header must pass VALID64() ... */
        if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
            !VALID64(le64toh(f->header->field_hash_table_offset)) ||
            !VALID64(le64toh(f->header->tail_object_offset)) ||
            !VALID64(le64toh(f->header->entry_array_offset)))
                return -ENODATA;

        /* ... and must not point into the header itself */
        if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
            le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
            le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
            le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
                return -ENODATA;

        /* The remaining checks only apply when we intend to write */
        if (f->writable) {
                uint8_t state;
                sd_id128_t machine_id;
                int r;

                r = sd_id128_get_machine(&machine_id);
                if (r < 0)
                        return r;

                /* Refuse to write to files created on a different machine */
                if (!sd_id128_equal(machine_id, f->header->machine_id))
                        return -EHOSTDOWN;

                state = f->header->state;

                /* Only files in offline state may be opened for writing */
                if (state == STATE_ONLINE) {
                        log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
                        return -EBUSY;
                } else if (state == STATE_ARCHIVED)
                        return -ESHUTDOWN;
                else if (state != STATE_OFFLINE) {
                        log_debug("Journal file %s has unknown state %u.", f->path, state);
                        return -EBUSY;
                }
        }

        f->compress = JOURNAL_HEADER_COMPRESSED(f->header);

        f->seal = JOURNAL_HEADER_SEALED(f->header);

        return 0;
}
316
317 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
318         uint64_t old_size, new_size;
319         int r;
320
321         assert(f);
322
323         /* We assume that this file is not sparse, and we know that
324          * for sure, since we always call posix_fallocate()
325          * ourselves */
326
327         old_size =
328                 le64toh(f->header->header_size) +
329                 le64toh(f->header->arena_size);
330
331         new_size = PAGE_ALIGN(offset + size);
332         if (new_size < le64toh(f->header->header_size))
333                 new_size = le64toh(f->header->header_size);
334
335         if (new_size <= old_size)
336                 return 0;
337
338         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
339                 return -E2BIG;
340
341         if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
342                 struct statvfs svfs;
343
344                 if (fstatvfs(f->fd, &svfs) >= 0) {
345                         uint64_t available;
346
347                         available = svfs.f_bfree * svfs.f_bsize;
348
349                         if (available >= f->metrics.keep_free)
350                                 available -= f->metrics.keep_free;
351                         else
352                                 available = 0;
353
354                         if (new_size - old_size > available)
355                                 return -E2BIG;
356                 }
357         }
358
359         /* Increase by larger blocks at once */
360         new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
361         if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
362                 new_size = f->metrics.max_size;
363
364         /* Note that the glibc fallocate() fallback is very
365            inefficient, hence we try to minimize the allocation area
366            as we can. */
367         r = posix_fallocate(f->fd, old_size, new_size - old_size);
368         if (r != 0)
369                 return -r;
370
371         if (fstat(f->fd, &f->last_stat) < 0)
372                 return -errno;
373
374         f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
375
376         return 0;
377 }
378
379 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
380         assert(f);
381         assert(ret);
382
383         if (size <= 0)
384                 return -EINVAL;
385
386         /* Avoid SIGBUS on invalid accesses */
387         if (offset + size > (uint64_t) f->last_stat.st_size) {
388                 /* Hmm, out of range? Let's refresh the fstat() data
389                  * first, before we trust that check. */
390
391                 if (fstat(f->fd, &f->last_stat) < 0 ||
392                     offset + size > (uint64_t) f->last_stat.st_size)
393                         return -EADDRNOTAVAIL;
394         }
395
396         return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
397 }
398
399 static uint64_t minimum_header_size(Object *o) {
400
401         static const uint64_t table[] = {
402                 [OBJECT_DATA] = sizeof(DataObject),
403                 [OBJECT_FIELD] = sizeof(FieldObject),
404                 [OBJECT_ENTRY] = sizeof(EntryObject),
405                 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
406                 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
407                 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
408                 [OBJECT_TAG] = sizeof(TagObject),
409         };
410
411         if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
412                 return sizeof(ObjectHeader);
413
414         return table[o->object.type];
415 }
416
417 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
418         int r;
419         void *t;
420         Object *o;
421         uint64_t s;
422         unsigned context;
423
424         assert(f);
425         assert(ret);
426
427         /* Objects may only be located at multiple of 64 bit */
428         if (!VALID64(offset))
429                 return -EFAULT;
430
431         /* One context for each type, plus one catch-all for the rest */
432         context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
433
434         r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
435         if (r < 0)
436                 return r;
437
438         o = (Object*) t;
439         s = le64toh(o->object.size);
440
441         if (s < sizeof(ObjectHeader))
442                 return -EBADMSG;
443
444         if (o->object.type <= OBJECT_UNUSED)
445                 return -EBADMSG;
446
447         if (s < minimum_header_size(o))
448                 return -EBADMSG;
449
450         if (type > 0 && o->object.type != type)
451                 return -EBADMSG;
452
453         if (s > sizeof(ObjectHeader)) {
454                 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
455                 if (r < 0)
456                         return r;
457
458                 o = (Object*) t;
459         }
460
461         *ret = o;
462         return 0;
463 }
464
465 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
466         uint64_t r;
467
468         assert(f);
469
470         r = le64toh(f->header->tail_entry_seqnum) + 1;
471
472         if (seqnum) {
473                 /* If an external seqnum counter was passed, we update
474                  * both the local and the external one, and set it to
475                  * the maximum of both */
476
477                 if (*seqnum + 1 > r)
478                         r = *seqnum + 1;
479
480                 *seqnum = r;
481         }
482
483         f->header->tail_entry_seqnum = htole64(r);
484
485         if (f->header->head_entry_seqnum == 0)
486                 f->header->head_entry_seqnum = htole64(r);
487
488         return r;
489 }
490
491 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
492         int r;
493         uint64_t p;
494         Object *tail, *o;
495         void *t;
496
497         assert(f);
498         assert(type > 0 && type < _OBJECT_TYPE_MAX);
499         assert(size >= sizeof(ObjectHeader));
500         assert(offset);
501         assert(ret);
502
503         r = journal_file_set_online(f);
504         if (r < 0)
505                 return r;
506
507         p = le64toh(f->header->tail_object_offset);
508         if (p == 0)
509                 p = le64toh(f->header->header_size);
510         else {
511                 r = journal_file_move_to_object(f, -1, p, &tail);
512                 if (r < 0)
513                         return r;
514
515                 p += ALIGN64(le64toh(tail->object.size));
516         }
517
518         r = journal_file_allocate(f, p, size);
519         if (r < 0)
520                 return r;
521
522         r = journal_file_move_to(f, type, false, p, size, &t);
523         if (r < 0)
524                 return r;
525
526         o = (Object*) t;
527
528         zero(o->object);
529         o->object.type = type;
530         o->object.size = htole64(size);
531
532         f->header->tail_object_offset = htole64(p);
533         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
534
535         *ret = o;
536         *offset = p;
537
538         return 0;
539 }
540
541 static int journal_file_setup_data_hash_table(JournalFile *f) {
542         uint64_t s, p;
543         Object *o;
544         int r;
545
546         assert(f);
547
548         /* We estimate that we need 1 hash table entry per 768 of
549            journal file and we want to make sure we never get beyond
550            75% fill level. Calculate the hash table size for the
551            maximum file size based on these metrics. */
552
553         s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
554         if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
555                 s = DEFAULT_DATA_HASH_TABLE_SIZE;
556
557         log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
558
559         r = journal_file_append_object(f,
560                                        OBJECT_DATA_HASH_TABLE,
561                                        offsetof(Object, hash_table.items) + s,
562                                        &o, &p);
563         if (r < 0)
564                 return r;
565
566         memset(o->hash_table.items, 0, s);
567
568         f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
569         f->header->data_hash_table_size = htole64(s);
570
571         return 0;
572 }
573
574 static int journal_file_setup_field_hash_table(JournalFile *f) {
575         uint64_t s, p;
576         Object *o;
577         int r;
578
579         assert(f);
580
581         /* We use a fixed size hash table for the fields as this
582          * number should grow very slowly only */
583
584         s = DEFAULT_FIELD_HASH_TABLE_SIZE;
585         r = journal_file_append_object(f,
586                                        OBJECT_FIELD_HASH_TABLE,
587                                        offsetof(Object, hash_table.items) + s,
588                                        &o, &p);
589         if (r < 0)
590                 return r;
591
592         memset(o->hash_table.items, 0, s);
593
594         f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
595         f->header->field_hash_table_size = htole64(s);
596
597         return 0;
598 }
599
600 static int journal_file_map_data_hash_table(JournalFile *f) {
601         uint64_t s, p;
602         void *t;
603         int r;
604
605         assert(f);
606
607         p = le64toh(f->header->data_hash_table_offset);
608         s = le64toh(f->header->data_hash_table_size);
609
610         r = journal_file_move_to(f,
611                                  OBJECT_DATA_HASH_TABLE,
612                                  true,
613                                  p, s,
614                                  &t);
615         if (r < 0)
616                 return r;
617
618         f->data_hash_table = t;
619         return 0;
620 }
621
622 static int journal_file_map_field_hash_table(JournalFile *f) {
623         uint64_t s, p;
624         void *t;
625         int r;
626
627         assert(f);
628
629         p = le64toh(f->header->field_hash_table_offset);
630         s = le64toh(f->header->field_hash_table_size);
631
632         r = journal_file_move_to(f,
633                                  OBJECT_FIELD_HASH_TABLE,
634                                  true,
635                                  p, s,
636                                  &t);
637         if (r < 0)
638                 return r;
639
640         f->field_hash_table = t;
641         return 0;
642 }
643
644 static int journal_file_link_field(
645                 JournalFile *f,
646                 Object *o,
647                 uint64_t offset,
648                 uint64_t hash) {
649
650         uint64_t p, h;
651         int r;
652
653         assert(f);
654         assert(o);
655         assert(offset > 0);
656
657         if (o->object.type != OBJECT_FIELD)
658                 return -EINVAL;
659
660         /* This might alter the window we are looking at */
661
662         o->field.next_hash_offset = o->field.head_data_offset = 0;
663
664         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
665         p = le64toh(f->field_hash_table[h].tail_hash_offset);
666         if (p == 0)
667                 f->field_hash_table[h].head_hash_offset = htole64(offset);
668         else {
669                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
670                 if (r < 0)
671                         return r;
672
673                 o->field.next_hash_offset = htole64(offset);
674         }
675
676         f->field_hash_table[h].tail_hash_offset = htole64(offset);
677
678         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
679                 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
680
681         return 0;
682 }
683
684 static int journal_file_link_data(
685                 JournalFile *f,
686                 Object *o,
687                 uint64_t offset,
688                 uint64_t hash) {
689
690         uint64_t p, h;
691         int r;
692
693         assert(f);
694         assert(o);
695         assert(offset > 0);
696
697         if (o->object.type != OBJECT_DATA)
698                 return -EINVAL;
699
700         /* This might alter the window we are looking at */
701
702         o->data.next_hash_offset = o->data.next_field_offset = 0;
703         o->data.entry_offset = o->data.entry_array_offset = 0;
704         o->data.n_entries = 0;
705
706         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
707         p = le64toh(f->data_hash_table[h].tail_hash_offset);
708         if (p == 0)
709                 /* Only entry in the hash table is easy */
710                 f->data_hash_table[h].head_hash_offset = htole64(offset);
711         else {
712                 /* Move back to the previous data object, to patch in
713                  * pointer */
714
715                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
716                 if (r < 0)
717                         return r;
718
719                 o->data.next_hash_offset = htole64(offset);
720         }
721
722         f->data_hash_table[h].tail_hash_offset = htole64(offset);
723
724         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
725                 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
726
727         return 0;
728 }
729
730 int journal_file_find_field_object_with_hash(
731                 JournalFile *f,
732                 const void *field, uint64_t size, uint64_t hash,
733                 Object **ret, uint64_t *offset) {
734
735         uint64_t p, osize, h;
736         int r;
737
738         assert(f);
739         assert(field && size > 0);
740
741         osize = offsetof(Object, field.payload) + size;
742
743         if (f->header->field_hash_table_size == 0)
744                 return -EBADMSG;
745
746         h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
747         p = le64toh(f->field_hash_table[h].head_hash_offset);
748
749         while (p > 0) {
750                 Object *o;
751
752                 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
753                 if (r < 0)
754                         return r;
755
756                 if (le64toh(o->field.hash) == hash &&
757                     le64toh(o->object.size) == osize &&
758                     memcmp(o->field.payload, field, size) == 0) {
759
760                         if (ret)
761                                 *ret = o;
762                         if (offset)
763                                 *offset = p;
764
765                         return 1;
766                 }
767
768                 p = le64toh(o->field.next_hash_offset);
769         }
770
771         return 0;
772 }
773
774 int journal_file_find_field_object(
775                 JournalFile *f,
776                 const void *field, uint64_t size,
777                 Object **ret, uint64_t *offset) {
778
779         uint64_t hash;
780
781         assert(f);
782         assert(field && size > 0);
783
784         hash = hash64(field, size);
785
786         return journal_file_find_field_object_with_hash(f,
787                                                         field, size, hash,
788                                                         ret, offset);
789 }
790
791 int journal_file_find_data_object_with_hash(
792                 JournalFile *f,
793                 const void *data, uint64_t size, uint64_t hash,
794                 Object **ret, uint64_t *offset) {
795
796         uint64_t p, osize, h;
797         int r;
798
799         assert(f);
800         assert(data || size == 0);
801
802         osize = offsetof(Object, data.payload) + size;
803
804         if (f->header->data_hash_table_size == 0)
805                 return -EBADMSG;
806
807         h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
808         p = le64toh(f->data_hash_table[h].head_hash_offset);
809
810         while (p > 0) {
811                 Object *o;
812
813                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
814                 if (r < 0)
815                         return r;
816
817                 if (le64toh(o->data.hash) != hash)
818                         goto next;
819
820                 if (o->object.flags & OBJECT_COMPRESSED) {
821 #ifdef HAVE_XZ
822                         uint64_t l, rsize;
823
824                         l = le64toh(o->object.size);
825                         if (l <= offsetof(Object, data.payload))
826                                 return -EBADMSG;
827
828                         l -= offsetof(Object, data.payload);
829
830                         if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0))
831                                 return -EBADMSG;
832
833                         if (rsize == size &&
834                             memcmp(f->compress_buffer, data, size) == 0) {
835
836                                 if (ret)
837                                         *ret = o;
838
839                                 if (offset)
840                                         *offset = p;
841
842                                 return 1;
843                         }
844 #else
845                         return -EPROTONOSUPPORT;
846 #endif
847
848                 } else if (le64toh(o->object.size) == osize &&
849                            memcmp(o->data.payload, data, size) == 0) {
850
851                         if (ret)
852                                 *ret = o;
853
854                         if (offset)
855                                 *offset = p;
856
857                         return 1;
858                 }
859
860         next:
861                 p = le64toh(o->data.next_hash_offset);
862         }
863
864         return 0;
865 }
866
867 int journal_file_find_data_object(
868                 JournalFile *f,
869                 const void *data, uint64_t size,
870                 Object **ret, uint64_t *offset) {
871
872         uint64_t hash;
873
874         assert(f);
875         assert(data || size == 0);
876
877         hash = hash64(data, size);
878
879         return journal_file_find_data_object_with_hash(f,
880                                                        data, size, hash,
881                                                        ret, offset);
882 }
883
884 static int journal_file_append_field(
885                 JournalFile *f,
886                 const void *field, uint64_t size,
887                 Object **ret, uint64_t *offset) {
888
889         uint64_t hash, p;
890         uint64_t osize;
891         Object *o;
892         int r;
893
894         assert(f);
895         assert(field && size > 0);
896
897         hash = hash64(field, size);
898
899         r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
900         if (r < 0)
901                 return r;
902         else if (r > 0) {
903
904                 if (ret)
905                         *ret = o;
906
907                 if (offset)
908                         *offset = p;
909
910                 return 0;
911         }
912
913         osize = offsetof(Object, field.payload) + size;
914         r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
915         if (r < 0)
916                 return r;
917
918         o->field.hash = htole64(hash);
919         memcpy(o->field.payload, field, size);
920
921         r = journal_file_link_field(f, o, p, hash);
922         if (r < 0)
923                 return r;
924
925         /* The linking might have altered the window, so let's
926          * refresh our pointer */
927         r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
928         if (r < 0)
929                 return r;
930
931 #ifdef HAVE_GCRYPT
932         r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
933         if (r < 0)
934                 return r;
935 #endif
936
937         if (ret)
938                 *ret = o;
939
940         if (offset)
941                 *offset = p;
942
943         return 0;
944 }
945
946 static int journal_file_append_data(
947                 JournalFile *f,
948                 const void *data, uint64_t size,
949                 Object **ret, uint64_t *offset) {
950
951         uint64_t hash, p;
952         uint64_t osize;
953         Object *o;
954         int r;
955         bool compressed = false;
956         const void *eq;
957
958         assert(f);
959         assert(data || size == 0);
960
961         hash = hash64(data, size);
962
963         r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
964         if (r < 0)
965                 return r;
966         else if (r > 0) {
967
968                 if (ret)
969                         *ret = o;
970
971                 if (offset)
972                         *offset = p;
973
974                 return 0;
975         }
976
977         osize = offsetof(Object, data.payload) + size;
978         r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
979         if (r < 0)
980                 return r;
981
982         o->data.hash = htole64(hash);
983
984 #ifdef HAVE_XZ
985         if (f->compress &&
986             size >= COMPRESSION_SIZE_THRESHOLD) {
987                 uint64_t rsize;
988
989                 compressed = compress_blob(data, size, o->data.payload, &rsize);
990
991                 if (compressed) {
992                         o->object.size = htole64(offsetof(Object, data.payload) + rsize);
993                         o->object.flags |= OBJECT_COMPRESSED;
994
995                         log_debug("Compressed data object %"PRIu64" -> %"PRIu64, size, rsize);
996                 }
997         }
998 #endif
999
1000         if (!compressed && size > 0)
1001                 memcpy(o->data.payload, data, size);
1002
1003         r = journal_file_link_data(f, o, p, hash);
1004         if (r < 0)
1005                 return r;
1006
1007         /* The linking might have altered the window, so let's
1008          * refresh our pointer */
1009         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1010         if (r < 0)
1011                 return r;
1012
1013         eq = memchr(data, '=', size);
1014         if (eq && eq > data) {
1015                 uint64_t fp;
1016                 Object *fo;
1017
1018                 /* Create field object ... */
1019                 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1020                 if (r < 0)
1021                         return r;
1022
1023                 /* ... and link it in. */
1024                 o->data.next_field_offset = fo->field.head_data_offset;
1025                 fo->field.head_data_offset = le64toh(p);
1026         }
1027
1028 #ifdef HAVE_GCRYPT
1029         r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1030         if (r < 0)
1031                 return r;
1032 #endif
1033
1034         if (ret)
1035                 *ret = o;
1036
1037         if (offset)
1038                 *offset = p;
1039
1040         return 0;
1041 }
1042
1043 uint64_t journal_file_entry_n_items(Object *o) {
1044         assert(o);
1045
1046         if (o->object.type != OBJECT_ENTRY)
1047                 return 0;
1048
1049         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1050 }
1051
1052 uint64_t journal_file_entry_array_n_items(Object *o) {
1053         assert(o);
1054
1055         if (o->object.type != OBJECT_ENTRY_ARRAY)
1056                 return 0;
1057
1058         return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1059 }
1060
1061 uint64_t journal_file_hash_table_n_items(Object *o) {
1062         assert(o);
1063
1064         if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1065             o->object.type != OBJECT_FIELD_HASH_TABLE)
1066                 return 0;
1067
1068         return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1069 }
1070
1071 static int link_entry_into_array(JournalFile *f,
1072                                  le64_t *first,
1073                                  le64_t *idx,
1074                                  uint64_t p) {
1075         int r;
1076         uint64_t n = 0, ap = 0, q, i, a, hidx;
1077         Object *o;
1078
1079         assert(f);
1080         assert(first);
1081         assert(idx);
1082         assert(p > 0);
1083
1084         a = le64toh(*first);
1085         i = hidx = le64toh(*idx);
1086         while (a > 0) {
1087
1088                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1089                 if (r < 0)
1090                         return r;
1091
1092                 n = journal_file_entry_array_n_items(o);
1093                 if (i < n) {
1094                         o->entry_array.items[i] = htole64(p);
1095                         *idx = htole64(hidx + 1);
1096                         return 0;
1097                 }
1098
1099                 i -= n;
1100                 ap = a;
1101                 a = le64toh(o->entry_array.next_entry_array_offset);
1102         }
1103
1104         if (hidx > n)
1105                 n = (hidx+1) * 2;
1106         else
1107                 n = n * 2;
1108
1109         if (n < 4)
1110                 n = 4;
1111
1112         r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1113                                        offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1114                                        &o, &q);
1115         if (r < 0)
1116                 return r;
1117
1118 #ifdef HAVE_GCRYPT
1119         r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1120         if (r < 0)
1121                 return r;
1122 #endif
1123
1124         o->entry_array.items[i] = htole64(p);
1125
1126         if (ap == 0)
1127                 *first = htole64(q);
1128         else {
1129                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1130                 if (r < 0)
1131                         return r;
1132
1133                 o->entry_array.next_entry_array_offset = htole64(q);
1134         }
1135
1136         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1137                 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1138
1139         *idx = htole64(hidx + 1);
1140
1141         return 0;
1142 }
1143
1144 static int link_entry_into_array_plus_one(JournalFile *f,
1145                                           le64_t *extra,
1146                                           le64_t *first,
1147                                           le64_t *idx,
1148                                           uint64_t p) {
1149
1150         int r;
1151
1152         assert(f);
1153         assert(extra);
1154         assert(first);
1155         assert(idx);
1156         assert(p > 0);
1157
1158         if (*idx == 0)
1159                 *extra = htole64(p);
1160         else {
1161                 le64_t i;
1162
1163                 i = htole64(le64toh(*idx) - 1);
1164                 r = link_entry_into_array(f, first, &i, p);
1165                 if (r < 0)
1166                         return r;
1167         }
1168
1169         *idx = htole64(le64toh(*idx) + 1);
1170         return 0;
1171 }
1172
1173 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1174         uint64_t p;
1175         int r;
1176         assert(f);
1177         assert(o);
1178         assert(offset > 0);
1179
1180         p = le64toh(o->entry.items[i].object_offset);
1181         if (p == 0)
1182                 return -EINVAL;
1183
1184         r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1185         if (r < 0)
1186                 return r;
1187
1188         return link_entry_into_array_plus_one(f,
1189                                               &o->data.entry_offset,
1190                                               &o->data.entry_array_offset,
1191                                               &o->data.n_entries,
1192                                               offset);
1193 }
1194
1195 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1196         uint64_t n, i;
1197         int r;
1198
1199         assert(f);
1200         assert(o);
1201         assert(offset > 0);
1202
1203         if (o->object.type != OBJECT_ENTRY)
1204                 return -EINVAL;
1205
1206         __sync_synchronize();
1207
1208         /* Link up the entry itself */
1209         r = link_entry_into_array(f,
1210                                   &f->header->entry_array_offset,
1211                                   &f->header->n_entries,
1212                                   offset);
1213         if (r < 0)
1214                 return r;
1215
1216         /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1217
1218         if (f->header->head_entry_realtime == 0)
1219                 f->header->head_entry_realtime = o->entry.realtime;
1220
1221         f->header->tail_entry_realtime = o->entry.realtime;
1222         f->header->tail_entry_monotonic = o->entry.monotonic;
1223
1224         f->tail_entry_monotonic_valid = true;
1225
1226         /* Link up the items */
1227         n = journal_file_entry_n_items(o);
1228         for (i = 0; i < n; i++) {
1229                 r = journal_file_link_entry_item(f, o, offset, i);
1230                 if (r < 0)
1231                         return r;
1232         }
1233
1234         return 0;
1235 }
1236
1237 static int journal_file_append_entry_internal(
1238                 JournalFile *f,
1239                 const dual_timestamp *ts,
1240                 uint64_t xor_hash,
1241                 const EntryItem items[], unsigned n_items,
1242                 uint64_t *seqnum,
1243                 Object **ret, uint64_t *offset) {
1244         uint64_t np;
1245         uint64_t osize;
1246         Object *o;
1247         int r;
1248
1249         assert(f);
1250         assert(items || n_items == 0);
1251         assert(ts);
1252
1253         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1254
1255         r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1256         if (r < 0)
1257                 return r;
1258
1259         o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1260         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1261         o->entry.realtime = htole64(ts->realtime);
1262         o->entry.monotonic = htole64(ts->monotonic);
1263         o->entry.xor_hash = htole64(xor_hash);
1264         o->entry.boot_id = f->header->boot_id;
1265
1266 #ifdef HAVE_GCRYPT
1267         r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1268         if (r < 0)
1269                 return r;
1270 #endif
1271
1272         r = journal_file_link_entry(f, o, np);
1273         if (r < 0)
1274                 return r;
1275
1276         if (ret)
1277                 *ret = o;
1278
1279         if (offset)
1280                 *offset = np;
1281
1282         return 0;
1283 }
1284
1285 void journal_file_post_change(JournalFile *f) {
1286         assert(f);
1287
1288         /* inotify() does not receive IN_MODIFY events from file
1289          * accesses done via mmap(). After each access we hence
1290          * trigger IN_MODIFY by truncating the journal file to its
1291          * current size which triggers IN_MODIFY. */
1292
1293         __sync_synchronize();
1294
1295         if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1296                 log_error("Failed to truncate file to its own size: %m");
1297 }
1298
1299 static int entry_item_cmp(const void *_a, const void *_b) {
1300         const EntryItem *a = _a, *b = _b;
1301
1302         if (le64toh(a->object_offset) < le64toh(b->object_offset))
1303                 return -1;
1304         if (le64toh(a->object_offset) > le64toh(b->object_offset))
1305                 return 1;
1306         return 0;
1307 }
1308
1309 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1310         unsigned i;
1311         EntryItem *items;
1312         int r;
1313         uint64_t xor_hash = 0;
1314         struct dual_timestamp _ts;
1315
1316         assert(f);
1317         assert(iovec || n_iovec == 0);
1318
1319         if (!ts) {
1320                 dual_timestamp_get(&_ts);
1321                 ts = &_ts;
1322         }
1323
1324         if (f->tail_entry_monotonic_valid &&
1325             ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1326                 return -EINVAL;
1327
1328 #ifdef HAVE_GCRYPT
1329         r = journal_file_maybe_append_tag(f, ts->realtime);
1330         if (r < 0)
1331                 return r;
1332 #endif
1333
1334         /* alloca() can't take 0, hence let's allocate at least one */
1335         items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1336
1337         for (i = 0; i < n_iovec; i++) {
1338                 uint64_t p;
1339                 Object *o;
1340
1341                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1342                 if (r < 0)
1343                         return r;
1344
1345                 xor_hash ^= le64toh(o->data.hash);
1346                 items[i].object_offset = htole64(p);
1347                 items[i].hash = o->data.hash;
1348         }
1349
1350         /* Order by the position on disk, in order to improve seek
1351          * times for rotating media. */
1352         qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1353
1354         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1355
1356         journal_file_post_change(f);
1357
1358         return r;
1359 }
1360
/* Remembers how far into a chain of entry arrays a previous lookup
 * got, so that later lookups on the same chain can skip ahead. */
typedef struct ChainCacheItem {
        /* the array at the begin of the chain */
        uint64_t first;

        /* the cached array */
        uint64_t array;

        /* the first item in the cached array */
        uint64_t begin;

        /* the total number of items in all arrays before this one in the chain */
        uint64_t total;

        /* the last index we looked at, to optimize locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
1368
1369 static void chain_cache_put(
1370                 Hashmap *h,
1371                 ChainCacheItem *ci,
1372                 uint64_t first,
1373                 uint64_t array,
1374                 uint64_t begin,
1375                 uint64_t total,
1376                 uint64_t last_index) {
1377
1378         if (!ci) {
1379                 /* If the chain item to cache for this chain is the
1380                  * first one it's not worth caching anything */
1381                 if (array == first)
1382                         return;
1383
1384                 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1385                         ci = hashmap_steal_first(h);
1386                 else {
1387                         ci = new(ChainCacheItem, 1);
1388                         if (!ci)
1389                                 return;
1390                 }
1391
1392                 ci->first = first;
1393
1394                 if (hashmap_put(h, &ci->first, ci) < 0) {
1395                         free(ci);
1396                         return;
1397                 }
1398         } else
1399                 assert(ci->first == first);
1400
1401         ci->array = array;
1402         ci->begin = begin;
1403         ci->total = total;
1404         ci->last_index = last_index;
1405 }
1406
1407 static int generic_array_get(
1408                 JournalFile *f,
1409                 uint64_t first,
1410                 uint64_t i,
1411                 Object **ret, uint64_t *offset) {
1412
1413         Object *o;
1414         uint64_t p = 0, a, t = 0;
1415         int r;
1416         ChainCacheItem *ci;
1417
1418         assert(f);
1419
1420         a = first;
1421
1422         /* Try the chain cache first */
1423         ci = hashmap_get(f->chain_cache, &first);
1424         if (ci && i > ci->total) {
1425                 a = ci->array;
1426                 i -= ci->total;
1427                 t = ci->total;
1428         }
1429
1430         while (a > 0) {
1431                 uint64_t k;
1432
1433                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1434                 if (r < 0)
1435                         return r;
1436
1437                 k = journal_file_entry_array_n_items(o);
1438                 if (i < k) {
1439                         p = le64toh(o->entry_array.items[i]);
1440                         goto found;
1441                 }
1442
1443                 i -= k;
1444                 t += k;
1445                 a = le64toh(o->entry_array.next_entry_array_offset);
1446         }
1447
1448         return 0;
1449
1450 found:
1451         /* Let's cache this item for the next invocation */
1452         chain_cache_put(f->chain_cache, ci, first, a, o->entry_array.items[0], t, i);
1453
1454         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1455         if (r < 0)
1456                 return r;
1457
1458         if (ret)
1459                 *ret = o;
1460
1461         if (offset)
1462                 *offset = p;
1463
1464         return 1;
1465 }
1466
1467 static int generic_array_get_plus_one(
1468                 JournalFile *f,
1469                 uint64_t extra,
1470                 uint64_t first,
1471                 uint64_t i,
1472                 Object **ret, uint64_t *offset) {
1473
1474         Object *o;
1475
1476         assert(f);
1477
1478         if (i == 0) {
1479                 int r;
1480
1481                 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1482                 if (r < 0)
1483                         return r;
1484
1485                 if (ret)
1486                         *ret = o;
1487
1488                 if (offset)
1489                         *offset = extra;
1490
1491                 return 1;
1492         }
1493
1494         return generic_array_get(f, first, i-1, ret, offset);
1495 }
1496
/* Results of the test callbacks used when bisecting entry arrays */
enum {
        /* the tested entry matches the needle */
        TEST_FOUND = 0,

        /* the tested entry lies before the needle */
        TEST_LEFT = 1,

        /* the tested entry lies after the needle */
        TEST_RIGHT = 2
};
1502
1503 static int generic_array_bisect(
1504                 JournalFile *f,
1505                 uint64_t first,
1506                 uint64_t n,
1507                 uint64_t needle,
1508                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1509                 direction_t direction,
1510                 Object **ret,
1511                 uint64_t *offset,
1512                 uint64_t *idx) {
1513
1514         uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1515         bool subtract_one = false;
1516         Object *o, *array = NULL;
1517         int r;
1518         ChainCacheItem *ci;
1519
1520         assert(f);
1521         assert(test_object);
1522
1523         /* Start with the first array in the chain */
1524         a = first;
1525
1526         ci = hashmap_get(f->chain_cache, &first);
1527         if (ci && n > ci->total) {
1528                 /* Ah, we have iterated this bisection array chain
1529                  * previously! Let's see if we can skip ahead in the
1530                  * chain, as far as the last time. But we can't jump
1531                  * backwards in the chain, so let's check that
1532                  * first. */
1533
1534                 r = test_object(f, ci->begin, needle);
1535                 if (r < 0)
1536                         return r;
1537
1538                 if (r == TEST_LEFT) {
1539                         /* OK, what we are looking for is right of the
1540                          * begin of this EntryArray, so let's jump
1541                          * straight to previously cached array in the
1542                          * chain */
1543
1544                         a = ci->array;
1545                         n -= ci->total;
1546                         t = ci->total;
1547                         last_index = ci->last_index;
1548                 }
1549         }
1550
1551         while (a > 0) {
1552                 uint64_t left, right, k, lp;
1553
1554                 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1555                 if (r < 0)
1556                         return r;
1557
1558                 k = journal_file_entry_array_n_items(array);
1559                 right = MIN(k, n);
1560                 if (right <= 0)
1561                         return 0;
1562
1563                 i = right - 1;
1564                 lp = p = le64toh(array->entry_array.items[i]);
1565                 if (p <= 0)
1566                         return -EBADMSG;
1567
1568                 r = test_object(f, p, needle);
1569                 if (r < 0)
1570                         return r;
1571
1572                 if (r == TEST_FOUND)
1573                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1574
1575                 if (r == TEST_RIGHT) {
1576                         left = 0;
1577                         right -= 1;
1578
1579                         if (last_index != (uint64_t) -1) {
1580                                 assert(last_index <= right);
1581
1582                                 /* If we cached the last index we
1583                                  * looked at, let's try to not to jump
1584                                  * too wildly around and see if we can
1585                                  * limit the range to look at early to
1586                                  * the immediate neighbors of the last
1587                                  * index we looked at. */
1588
1589                                 if (last_index > 0) {
1590                                         uint64_t x = last_index - 1;
1591
1592                                         p = le64toh(array->entry_array.items[x]);
1593                                         if (p <= 0)
1594                                                 return -EBADMSG;
1595
1596                                         r = test_object(f, p, needle);
1597                                         if (r < 0)
1598                                                 return r;
1599
1600                                         if (r == TEST_FOUND)
1601                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1602
1603                                         if (r == TEST_RIGHT)
1604                                                 right = x;
1605                                         else
1606                                                 left = x + 1;
1607                                 }
1608
1609                                 if (last_index < right) {
1610                                         uint64_t y = last_index + 1;
1611
1612                                         p = le64toh(array->entry_array.items[y]);
1613                                         if (p <= 0)
1614                                                 return -EBADMSG;
1615
1616                                         r = test_object(f, p, needle);
1617                                         if (r < 0)
1618                                                 return r;
1619
1620                                         if (r == TEST_FOUND)
1621                                                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1622
1623                                         if (r == TEST_RIGHT)
1624                                                 right = y;
1625                                         else
1626                                                 left = y + 1;
1627                                 }
1628
1629                                 last_index = (uint64_t) -1;
1630                         }
1631
1632                         for (;;) {
1633                                 if (left == right) {
1634                                         if (direction == DIRECTION_UP)
1635                                                 subtract_one = true;
1636
1637                                         i = left;
1638                                         goto found;
1639                                 }
1640
1641                                 assert(left < right);
1642                                 i = (left + right) / 2;
1643
1644                                 p = le64toh(array->entry_array.items[i]);
1645                                 if (p <= 0)
1646                                         return -EBADMSG;
1647
1648                                 r = test_object(f, p, needle);
1649                                 if (r < 0)
1650                                         return r;
1651
1652                                 if (r == TEST_FOUND)
1653                                         r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1654
1655                                 if (r == TEST_RIGHT)
1656                                         right = i;
1657                                 else
1658                                         left = i + 1;
1659                         }
1660                 }
1661
1662                 if (k > n) {
1663                         if (direction == DIRECTION_UP) {
1664                                 i = n;
1665                                 subtract_one = true;
1666                                 goto found;
1667                         }
1668
1669                         return 0;
1670                 }
1671
1672                 last_p = lp;
1673
1674                 n -= k;
1675                 t += k;
1676                 last_index = (uint64_t) -1;
1677                 a = le64toh(array->entry_array.next_entry_array_offset);
1678         }
1679
1680         return 0;
1681
1682 found:
1683         if (subtract_one && t == 0 && i == 0)
1684                 return 0;
1685
1686         /* Let's cache this item for the next invocation */
1687         chain_cache_put(f->chain_cache, ci, first, a, array->entry_array.items[0], t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1688
1689         if (subtract_one && i == 0)
1690                 p = last_p;
1691         else if (subtract_one)
1692                 p = le64toh(array->entry_array.items[i-1]);
1693         else
1694                 p = le64toh(array->entry_array.items[i]);
1695
1696         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1697         if (r < 0)
1698                 return r;
1699
1700         if (ret)
1701                 *ret = o;
1702
1703         if (offset)
1704                 *offset = p;
1705
1706         if (idx)
1707                 *idx = t + i + (subtract_one ? -1 : 0);
1708
1709         return 1;
1710 }
1711
1712
1713 static int generic_array_bisect_plus_one(
1714                 JournalFile *f,
1715                 uint64_t extra,
1716                 uint64_t first,
1717                 uint64_t n,
1718                 uint64_t needle,
1719                 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1720                 direction_t direction,
1721                 Object **ret,
1722                 uint64_t *offset,
1723                 uint64_t *idx) {
1724
1725         int r;
1726         bool step_back = false;
1727         Object *o;
1728
1729         assert(f);
1730         assert(test_object);
1731
1732         if (n <= 0)
1733                 return 0;
1734
1735         /* This bisects the array in object 'first', but first checks
1736          * an extra  */
1737         r = test_object(f, extra, needle);
1738         if (r < 0)
1739                 return r;
1740
1741         if (r == TEST_FOUND)
1742                 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1743
1744         /* if we are looking with DIRECTION_UP then we need to first
1745            see if in the actual array there is a matching entry, and
1746            return the last one of that. But if there isn't any we need
1747            to return this one. Hence remember this, and return it
1748            below. */
1749         if (r == TEST_LEFT)
1750                 step_back = direction == DIRECTION_UP;
1751
1752         if (r == TEST_RIGHT) {
1753                 if (direction == DIRECTION_DOWN)
1754                         goto found;
1755                 else
1756                         return 0;
1757         }
1758
1759         r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1760
1761         if (r == 0 && step_back)
1762                 goto found;
1763
1764         if (r > 0 && idx)
1765                 (*idx) ++;
1766
1767         return r;
1768
1769 found:
1770         r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1771         if (r < 0)
1772                 return r;
1773
1774         if (ret)
1775                 *ret = o;
1776
1777         if (offset)
1778                 *offset = extra;
1779
1780         if (idx)
1781                 *idx = 0;
1782
1783         return 1;
1784 }
1785
1786 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1787         assert(f);
1788         assert(p > 0);
1789
1790         if (p == needle)
1791                 return TEST_FOUND;
1792         else if (p < needle)
1793                 return TEST_LEFT;
1794         else
1795                 return TEST_RIGHT;
1796 }
1797
1798 int journal_file_move_to_entry_by_offset(
1799                 JournalFile *f,
1800                 uint64_t p,
1801                 direction_t direction,
1802                 Object **ret,
1803                 uint64_t *offset) {
1804
1805         return generic_array_bisect(f,
1806                                     le64toh(f->header->entry_array_offset),
1807                                     le64toh(f->header->n_entries),
1808                                     p,
1809                                     test_object_offset,
1810                                     direction,
1811                                     ret, offset, NULL);
1812 }
1813
1814
1815 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1816         Object *o;
1817         int r;
1818
1819         assert(f);
1820         assert(p > 0);
1821
1822         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1823         if (r < 0)
1824                 return r;
1825
1826         if (le64toh(o->entry.seqnum) == needle)
1827                 return TEST_FOUND;
1828         else if (le64toh(o->entry.seqnum) < needle)
1829                 return TEST_LEFT;
1830         else
1831                 return TEST_RIGHT;
1832 }
1833
1834 int journal_file_move_to_entry_by_seqnum(
1835                 JournalFile *f,
1836                 uint64_t seqnum,
1837                 direction_t direction,
1838                 Object **ret,
1839                 uint64_t *offset) {
1840
1841         return generic_array_bisect(f,
1842                                     le64toh(f->header->entry_array_offset),
1843                                     le64toh(f->header->n_entries),
1844                                     seqnum,
1845                                     test_object_seqnum,
1846                                     direction,
1847                                     ret, offset, NULL);
1848 }
1849
1850 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1851         Object *o;
1852         int r;
1853
1854         assert(f);
1855         assert(p > 0);
1856
1857         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1858         if (r < 0)
1859                 return r;
1860
1861         if (le64toh(o->entry.realtime) == needle)
1862                 return TEST_FOUND;
1863         else if (le64toh(o->entry.realtime) < needle)
1864                 return TEST_LEFT;
1865         else
1866                 return TEST_RIGHT;
1867 }
1868
1869 int journal_file_move_to_entry_by_realtime(
1870                 JournalFile *f,
1871                 uint64_t realtime,
1872                 direction_t direction,
1873                 Object **ret,
1874                 uint64_t *offset) {
1875
1876         return generic_array_bisect(f,
1877                                     le64toh(f->header->entry_array_offset),
1878                                     le64toh(f->header->n_entries),
1879                                     realtime,
1880                                     test_object_realtime,
1881                                     direction,
1882                                     ret, offset, NULL);
1883 }
1884
1885 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1886         Object *o;
1887         int r;
1888
1889         assert(f);
1890         assert(p > 0);
1891
1892         r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1893         if (r < 0)
1894                 return r;
1895
1896         if (le64toh(o->entry.monotonic) == needle)
1897                 return TEST_FOUND;
1898         else if (le64toh(o->entry.monotonic) < needle)
1899                 return TEST_LEFT;
1900         else
1901                 return TEST_RIGHT;
1902 }
1903
1904 static inline int find_data_object_by_boot_id(
1905                 JournalFile *f,
1906                 sd_id128_t boot_id,
1907                 Object **o,
1908                 uint64_t *b) {
1909         char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1910
1911         sd_id128_to_string(boot_id, t + 9);
1912         return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1913 }
1914
1915 int journal_file_move_to_entry_by_monotonic(
1916                 JournalFile *f,
1917                 sd_id128_t boot_id,
1918                 uint64_t monotonic,
1919                 direction_t direction,
1920                 Object **ret,
1921                 uint64_t *offset) {
1922
1923         Object *o;
1924         int r;
1925
1926         assert(f);
1927
1928         r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1929         if (r < 0)
1930                 return r;
1931         if (r == 0)
1932                 return -ENOENT;
1933
1934         return generic_array_bisect_plus_one(f,
1935                                              le64toh(o->data.entry_offset),
1936                                              le64toh(o->data.entry_array_offset),
1937                                              le64toh(o->data.n_entries),
1938                                              monotonic,
1939                                              test_object_monotonic,
1940                                              direction,
1941                                              ret, offset, NULL);
1942 }
1943
1944 int journal_file_next_entry(
1945                 JournalFile *f,
1946                 Object *o, uint64_t p,
1947                 direction_t direction,
1948                 Object **ret, uint64_t *offset) {
1949
1950         uint64_t i, n;
1951         int r;
1952
1953         assert(f);
1954         assert(p > 0 || !o);
1955
1956         n = le64toh(f->header->n_entries);
1957         if (n <= 0)
1958                 return 0;
1959
1960         if (!o)
1961                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1962         else {
1963                 if (o->object.type != OBJECT_ENTRY)
1964                         return -EINVAL;
1965
1966                 r = generic_array_bisect(f,
1967                                          le64toh(f->header->entry_array_offset),
1968                                          le64toh(f->header->n_entries),
1969                                          p,
1970                                          test_object_offset,
1971                                          DIRECTION_DOWN,
1972                                          NULL, NULL,
1973                                          &i);
1974                 if (r <= 0)
1975                         return r;
1976
1977                 if (direction == DIRECTION_DOWN) {
1978                         if (i >= n - 1)
1979                                 return 0;
1980
1981                         i++;
1982                 } else {
1983                         if (i <= 0)
1984                                 return 0;
1985
1986                         i--;
1987                 }
1988         }
1989
1990         /* And jump to it */
1991         return generic_array_get(f,
1992                                  le64toh(f->header->entry_array_offset),
1993                                  i,
1994                                  ret, offset);
1995 }
1996
1997 int journal_file_skip_entry(
1998                 JournalFile *f,
1999                 Object *o, uint64_t p,
2000                 int64_t skip,
2001                 Object **ret, uint64_t *offset) {
2002
2003         uint64_t i, n;
2004         int r;
2005
2006         assert(f);
2007         assert(o);
2008         assert(p > 0);
2009
2010         if (o->object.type != OBJECT_ENTRY)
2011                 return -EINVAL;
2012
2013         r = generic_array_bisect(f,
2014                                  le64toh(f->header->entry_array_offset),
2015                                  le64toh(f->header->n_entries),
2016                                  p,
2017                                  test_object_offset,
2018                                  DIRECTION_DOWN,
2019                                  NULL, NULL,
2020                                  &i);
2021         if (r <= 0)
2022                 return r;
2023
2024         /* Calculate new index */
2025         if (skip < 0) {
2026                 if ((uint64_t) -skip >= i)
2027                         i = 0;
2028                 else
2029                         i = i - (uint64_t) -skip;
2030         } else
2031                 i  += (uint64_t) skip;
2032
2033         n = le64toh(f->header->n_entries);
2034         if (n <= 0)
2035                 return -EBADMSG;
2036
2037         if (i >= n)
2038                 i = n-1;
2039
2040         return generic_array_get(f,
2041                                  le64toh(f->header->entry_array_offset),
2042                                  i,
2043                                  ret, offset);
2044 }
2045
2046 int journal_file_next_entry_for_data(
2047                 JournalFile *f,
2048                 Object *o, uint64_t p,
2049                 uint64_t data_offset,
2050                 direction_t direction,
2051                 Object **ret, uint64_t *offset) {
2052
2053         uint64_t n, i;
2054         int r;
2055         Object *d;
2056
2057         assert(f);
2058         assert(p > 0 || !o);
2059
2060         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2061         if (r < 0)
2062                 return r;
2063
2064         n = le64toh(d->data.n_entries);
2065         if (n <= 0)
2066                 return n;
2067
2068         if (!o)
2069                 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2070         else {
2071                 if (o->object.type != OBJECT_ENTRY)
2072                         return -EINVAL;
2073
2074                 r = generic_array_bisect_plus_one(f,
2075                                                   le64toh(d->data.entry_offset),
2076                                                   le64toh(d->data.entry_array_offset),
2077                                                   le64toh(d->data.n_entries),
2078                                                   p,
2079                                                   test_object_offset,
2080                                                   DIRECTION_DOWN,
2081                                                   NULL, NULL,
2082                                                   &i);
2083
2084                 if (r <= 0)
2085                         return r;
2086
2087                 if (direction == DIRECTION_DOWN) {
2088                         if (i >= n - 1)
2089                                 return 0;
2090
2091                         i++;
2092                 } else {
2093                         if (i <= 0)
2094                                 return 0;
2095
2096                         i--;
2097                 }
2098
2099         }
2100
2101         return generic_array_get_plus_one(f,
2102                                           le64toh(d->data.entry_offset),
2103                                           le64toh(d->data.entry_array_offset),
2104                                           i,
2105                                           ret, offset);
2106 }
2107
2108 int journal_file_move_to_entry_by_offset_for_data(
2109                 JournalFile *f,
2110                 uint64_t data_offset,
2111                 uint64_t p,
2112                 direction_t direction,
2113                 Object **ret, uint64_t *offset) {
2114
2115         int r;
2116         Object *d;
2117
2118         assert(f);
2119
2120         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2121         if (r < 0)
2122                 return r;
2123
2124         return generic_array_bisect_plus_one(f,
2125                                              le64toh(d->data.entry_offset),
2126                                              le64toh(d->data.entry_array_offset),
2127                                              le64toh(d->data.n_entries),
2128                                              p,
2129                                              test_object_offset,
2130                                              direction,
2131                                              ret, offset, NULL);
2132 }
2133
2134 int journal_file_move_to_entry_by_monotonic_for_data(
2135                 JournalFile *f,
2136                 uint64_t data_offset,
2137                 sd_id128_t boot_id,
2138                 uint64_t monotonic,
2139                 direction_t direction,
2140                 Object **ret, uint64_t *offset) {
2141
2142         Object *o, *d;
2143         int r;
2144         uint64_t b, z;
2145
2146         assert(f);
2147
2148         /* First, seek by time */
2149         r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2150         if (r < 0)
2151                 return r;
2152         if (r == 0)
2153                 return -ENOENT;
2154
2155         r = generic_array_bisect_plus_one(f,
2156                                           le64toh(o->data.entry_offset),
2157                                           le64toh(o->data.entry_array_offset),
2158                                           le64toh(o->data.n_entries),
2159                                           monotonic,
2160                                           test_object_monotonic,
2161                                           direction,
2162                                           NULL, &z, NULL);
2163         if (r <= 0)
2164                 return r;
2165
2166         /* And now, continue seeking until we find an entry that
2167          * exists in both bisection arrays */
2168
2169         for (;;) {
2170                 Object *qo;
2171                 uint64_t p, q;
2172
2173                 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2174                 if (r < 0)
2175                         return r;
2176
2177                 r = generic_array_bisect_plus_one(f,
2178                                                   le64toh(d->data.entry_offset),
2179                                                   le64toh(d->data.entry_array_offset),
2180                                                   le64toh(d->data.n_entries),
2181                                                   z,
2182                                                   test_object_offset,
2183                                                   direction,
2184                                                   NULL, &p, NULL);
2185                 if (r <= 0)
2186                         return r;
2187
2188                 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2189                 if (r < 0)
2190                         return r;
2191
2192                 r = generic_array_bisect_plus_one(f,
2193                                                   le64toh(o->data.entry_offset),
2194                                                   le64toh(o->data.entry_array_offset),
2195                                                   le64toh(o->data.n_entries),
2196                                                   p,
2197                                                   test_object_offset,
2198                                                   direction,
2199                                                   &qo, &q, NULL);
2200
2201                 if (r <= 0)
2202                         return r;
2203
2204                 if (p == q) {
2205                         if (ret)
2206                                 *ret = qo;
2207                         if (offset)
2208                                 *offset = q;
2209
2210                         return 1;
2211                 }
2212
2213                 z = q;
2214         }
2215
2216         return 0;
2217 }
2218
2219 int journal_file_move_to_entry_by_seqnum_for_data(
2220                 JournalFile *f,
2221                 uint64_t data_offset,
2222                 uint64_t seqnum,
2223                 direction_t direction,
2224                 Object **ret, uint64_t *offset) {
2225
2226         Object *d;
2227         int r;
2228
2229         assert(f);
2230
2231         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2232         if (r < 0)
2233                 return r;
2234
2235         return generic_array_bisect_plus_one(f,
2236                                              le64toh(d->data.entry_offset),
2237                                              le64toh(d->data.entry_array_offset),
2238                                              le64toh(d->data.n_entries),
2239                                              seqnum,
2240                                              test_object_seqnum,
2241                                              direction,
2242                                              ret, offset, NULL);
2243 }
2244
2245 int journal_file_move_to_entry_by_realtime_for_data(
2246                 JournalFile *f,
2247                 uint64_t data_offset,
2248                 uint64_t realtime,
2249                 direction_t direction,
2250                 Object **ret, uint64_t *offset) {
2251
2252         Object *d;
2253         int r;
2254
2255         assert(f);
2256
2257         r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2258         if (r < 0)
2259                 return r;
2260
2261         return generic_array_bisect_plus_one(f,
2262                                              le64toh(d->data.entry_offset),
2263                                              le64toh(d->data.entry_array_offset),
2264                                              le64toh(d->data.n_entries),
2265                                              realtime,
2266                                              test_object_realtime,
2267                                              direction,
2268                                              ret, offset, NULL);
2269 }
2270
2271 void journal_file_dump(JournalFile *f) {
2272         Object *o;
2273         int r;
2274         uint64_t p;
2275
2276         assert(f);
2277
2278         journal_file_print_header(f);
2279
2280         p = le64toh(f->header->header_size);
2281         while (p != 0) {
2282                 r = journal_file_move_to_object(f, -1, p, &o);
2283                 if (r < 0)
2284                         goto fail;
2285
2286                 switch (o->object.type) {
2287
2288                 case OBJECT_UNUSED:
2289                         printf("Type: OBJECT_UNUSED\n");
2290                         break;
2291
2292                 case OBJECT_DATA:
2293                         printf("Type: OBJECT_DATA\n");
2294                         break;
2295
2296                 case OBJECT_FIELD:
2297                         printf("Type: OBJECT_FIELD\n");
2298                         break;
2299
2300                 case OBJECT_ENTRY:
2301                         printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2302                                le64toh(o->entry.seqnum),
2303                                le64toh(o->entry.monotonic),
2304                                le64toh(o->entry.realtime));
2305                         break;
2306
2307                 case OBJECT_FIELD_HASH_TABLE:
2308                         printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2309                         break;
2310
2311                 case OBJECT_DATA_HASH_TABLE:
2312                         printf("Type: OBJECT_DATA_HASH_TABLE\n");
2313                         break;
2314
2315                 case OBJECT_ENTRY_ARRAY:
2316                         printf("Type: OBJECT_ENTRY_ARRAY\n");
2317                         break;
2318
2319                 case OBJECT_TAG:
2320                         printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2321                                le64toh(o->tag.seqnum),
2322                                le64toh(o->tag.epoch));
2323                         break;
2324
2325                 default:
2326                         printf("Type: unknown (%u)\n", o->object.type);
2327                         break;
2328                 }
2329
2330                 if (o->object.flags & OBJECT_COMPRESSED)
2331                         printf("Flags: COMPRESSED\n");
2332
2333                 if (p == le64toh(f->header->tail_object_offset))
2334                         p = 0;
2335                 else
2336                         p = p + ALIGN64(le64toh(o->object.size));
2337         }
2338
2339         return;
2340 fail:
2341         log_error("File corrupt");
2342 }
2343
2344 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2345         const char *x;
2346
2347         x = format_timestamp(buf, l, t);
2348         if (x)
2349                 return x;
2350         return " --- ";
2351 }
2352
2353 void journal_file_print_header(JournalFile *f) {
2354         char a[33], b[33], c[33], d[33];
2355         char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2356         struct stat st;
2357         char bytes[FORMAT_BYTES_MAX];
2358
2359         assert(f);
2360
2361         printf("File Path: %s\n"
2362                "File ID: %s\n"
2363                "Machine ID: %s\n"
2364                "Boot ID: %s\n"
2365                "Sequential Number ID: %s\n"
2366                "State: %s\n"
2367                "Compatible Flags:%s%s\n"
2368                "Incompatible Flags:%s%s\n"
2369                "Header size: %"PRIu64"\n"
2370                "Arena size: %"PRIu64"\n"
2371                "Data Hash Table Size: %"PRIu64"\n"
2372                "Field Hash Table Size: %"PRIu64"\n"
2373                "Rotate Suggested: %s\n"
2374                "Head Sequential Number: %"PRIu64"\n"
2375                "Tail Sequential Number: %"PRIu64"\n"
2376                "Head Realtime Timestamp: %s\n"
2377                "Tail Realtime Timestamp: %s\n"
2378                "Tail Monotonic Timestamp: %s\n"
2379                "Objects: %"PRIu64"\n"
2380                "Entry Objects: %"PRIu64"\n",
2381                f->path,
2382                sd_id128_to_string(f->header->file_id, a),
2383                sd_id128_to_string(f->header->machine_id, b),
2384                sd_id128_to_string(f->header->boot_id, c),
2385                sd_id128_to_string(f->header->seqnum_id, d),
2386                f->header->state == STATE_OFFLINE ? "OFFLINE" :
2387                f->header->state == STATE_ONLINE ? "ONLINE" :
2388                f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2389                JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2390                (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2391                JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2392                (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2393                le64toh(f->header->header_size),
2394                le64toh(f->header->arena_size),
2395                le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2396                le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2397                yes_no(journal_file_rotate_suggested(f, 0)),
2398                le64toh(f->header->head_entry_seqnum),
2399                le64toh(f->header->tail_entry_seqnum),
2400                format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2401                format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2402                format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2403                le64toh(f->header->n_objects),
2404                le64toh(f->header->n_entries));
2405
2406         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2407                 printf("Data Objects: %"PRIu64"\n"
2408                        "Data Hash Table Fill: %.1f%%\n",
2409                        le64toh(f->header->n_data),
2410                        100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2411
2412         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2413                 printf("Field Objects: %"PRIu64"\n"
2414                        "Field Hash Table Fill: %.1f%%\n",
2415                        le64toh(f->header->n_fields),
2416                        100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2417
2418         if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2419                 printf("Tag Objects: %"PRIu64"\n",
2420                        le64toh(f->header->n_tags));
2421         if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2422                 printf("Entry Array Objects: %"PRIu64"\n",
2423                        le64toh(f->header->n_entry_arrays));
2424
2425         if (fstat(f->fd, &st) >= 0)
2426                 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
2427 }
2428
2429 int journal_file_open(
2430                 const char *fname,
2431                 int flags,
2432                 mode_t mode,
2433                 bool compress,
2434                 bool seal,
2435                 JournalMetrics *metrics,
2436                 MMapCache *mmap_cache,
2437                 JournalFile *template,
2438                 JournalFile **ret) {
2439
2440         JournalFile *f;
2441         int r;
2442         bool newly_created = false;
2443
2444         assert(fname);
2445         assert(ret);
2446
2447         if ((flags & O_ACCMODE) != O_RDONLY &&
2448             (flags & O_ACCMODE) != O_RDWR)
2449                 return -EINVAL;
2450
2451         if (!endswith(fname, ".journal") &&
2452             !endswith(fname, ".journal~"))
2453                 return -EINVAL;
2454
2455         f = new0(JournalFile, 1);
2456         if (!f)
2457                 return -ENOMEM;
2458
2459         f->fd = -1;
2460         f->mode = mode;
2461
2462         f->flags = flags;
2463         f->prot = prot_from_flags(flags);
2464         f->writable = (flags & O_ACCMODE) != O_RDONLY;
2465 #ifdef HAVE_XZ
2466         f->compress = compress;
2467 #endif
2468 #ifdef HAVE_GCRYPT
2469         f->seal = seal;
2470 #endif
2471
2472         if (mmap_cache)
2473                 f->mmap = mmap_cache_ref(mmap_cache);
2474         else {
2475                 f->mmap = mmap_cache_new();
2476                 if (!f->mmap) {
2477                         r = -ENOMEM;
2478                         goto fail;
2479                 }
2480         }
2481
2482         f->path = strdup(fname);
2483         if (!f->path) {
2484                 r = -ENOMEM;
2485                 goto fail;
2486         }
2487
2488         f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2489         if (!f->chain_cache) {
2490                 r = -ENOMEM;
2491                 goto fail;
2492         }
2493
2494         f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2495         if (f->fd < 0) {
2496                 r = -errno;
2497                 goto fail;
2498         }
2499
2500         if (fstat(f->fd, &f->last_stat) < 0) {
2501                 r = -errno;
2502                 goto fail;
2503         }
2504
2505         if (f->last_stat.st_size == 0 && f->writable) {
2506 #ifdef HAVE_XATTR
2507                 uint64_t crtime;
2508
2509                 /* Let's attach the creation time to the journal file,
2510                  * so that the vacuuming code knows the age of this
2511                  * file even if the file might end up corrupted one
2512                  * day... Ideally we'd just use the creation time many
2513                  * file systems maintain for each file, but there is
2514                  * currently no usable API to query this, hence let's
2515                  * emulate this via extended attributes. If extended
2516                  * attributes are not supported we'll just skip this,
2517                  * and rely solely on mtime/atime/ctime of the file.*/
2518
2519                 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2520                 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2521 #endif
2522
2523 #ifdef HAVE_GCRYPT
2524                 /* Try to load the FSPRG state, and if we can't, then
2525                  * just don't do sealing */
2526                 if (f->seal) {
2527                         r = journal_file_fss_load(f);
2528                         if (r < 0)
2529                                 f->seal = false;
2530                 }
2531 #endif
2532
2533                 r = journal_file_init_header(f, template);
2534                 if (r < 0)
2535                         goto fail;
2536
2537                 if (fstat(f->fd, &f->last_stat) < 0) {
2538                         r = -errno;
2539                         goto fail;
2540                 }
2541
2542                 newly_created = true;
2543         }
2544
2545         if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2546                 r = -EIO;
2547                 goto fail;
2548         }
2549
2550         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2551         if (f->header == MAP_FAILED) {
2552                 f->header = NULL;
2553                 r = -errno;
2554                 goto fail;
2555         }
2556
2557         if (!newly_created) {
2558                 r = journal_file_verify_header(f);
2559                 if (r < 0)
2560                         goto fail;
2561         }
2562
2563 #ifdef HAVE_GCRYPT
2564         if (!newly_created && f->writable) {
2565                 r = journal_file_fss_load(f);
2566                 if (r < 0)
2567                         goto fail;
2568         }
2569 #endif
2570
2571         if (f->writable) {
2572                 if (metrics) {
2573                         journal_default_metrics(metrics, f->fd);
2574                         f->metrics = *metrics;
2575                 } else if (template)
2576                         f->metrics = template->metrics;
2577
2578                 r = journal_file_refresh_header(f);
2579                 if (r < 0)
2580                         goto fail;
2581         }
2582
2583 #ifdef HAVE_GCRYPT
2584         r = journal_file_hmac_setup(f);
2585         if (r < 0)
2586                 goto fail;
2587 #endif
2588
2589         if (newly_created) {
2590                 r = journal_file_setup_field_hash_table(f);
2591                 if (r < 0)
2592                         goto fail;
2593
2594                 r = journal_file_setup_data_hash_table(f);
2595                 if (r < 0)
2596                         goto fail;
2597
2598 #ifdef HAVE_GCRYPT
2599                 r = journal_file_append_first_tag(f);
2600                 if (r < 0)
2601                         goto fail;
2602 #endif
2603         }
2604
2605         r = journal_file_map_field_hash_table(f);
2606         if (r < 0)
2607                 goto fail;
2608
2609         r = journal_file_map_data_hash_table(f);
2610         if (r < 0)
2611                 goto fail;
2612
2613         *ret = f;
2614         return 0;
2615
2616 fail:
2617         journal_file_close(f);
2618
2619         return r;
2620 }
2621
2622 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2623         _cleanup_free_ char *p = NULL;
2624         size_t l;
2625         JournalFile *old_file, *new_file = NULL;
2626         int r;
2627
2628         assert(f);
2629         assert(*f);
2630
2631         old_file = *f;
2632
2633         if (!old_file->writable)
2634                 return -EINVAL;
2635
2636         if (!endswith(old_file->path, ".journal"))
2637                 return -EINVAL;
2638
2639         l = strlen(old_file->path);
2640         r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2641                      (int) l - 8, old_file->path,
2642                      SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2643                      le64toh((*f)->header->head_entry_seqnum),
2644                      le64toh((*f)->header->head_entry_realtime));
2645         if (r < 0)
2646                 return -ENOMEM;
2647
2648         r = rename(old_file->path, p);
2649         if (r < 0)
2650                 return -errno;
2651
2652         old_file->header->state = STATE_ARCHIVED;
2653
2654         r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2655         journal_file_close(old_file);
2656
2657         *f = new_file;
2658         return r;
2659 }
2660
2661 int journal_file_open_reliably(
2662                 const char *fname,
2663                 int flags,
2664                 mode_t mode,
2665                 bool compress,
2666                 bool seal,
2667                 JournalMetrics *metrics,
2668                 MMapCache *mmap_cache,
2669                 JournalFile *template,
2670                 JournalFile **ret) {
2671
2672         int r;
2673         size_t l;
2674         _cleanup_free_ char *p = NULL;
2675
2676         r = journal_file_open(fname, flags, mode, compress, seal,
2677                               metrics, mmap_cache, template, ret);
2678         if (r != -EBADMSG && /* corrupted */
2679             r != -ENODATA && /* truncated */
2680             r != -EHOSTDOWN && /* other machine */
2681             r != -EPROTONOSUPPORT && /* incompatible feature */
2682             r != -EBUSY && /* unclean shutdown */
2683             r != -ESHUTDOWN /* already archived */)
2684                 return r;
2685
2686         if ((flags & O_ACCMODE) == O_RDONLY)
2687                 return r;
2688
2689         if (!(flags & O_CREAT))
2690                 return r;
2691
2692         if (!endswith(fname, ".journal"))
2693                 return r;
2694
2695         /* The file is corrupted. Rotate it away and try it again (but only once) */
2696
2697         l = strlen(fname);
2698         if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2699                      (int) l - 8, fname,
2700                      (unsigned long long) now(CLOCK_REALTIME),
2701                      random_ull()) < 0)
2702                 return -ENOMEM;
2703
2704         r = rename(fname, p);
2705         if (r < 0)
2706                 return -errno;
2707
2708         log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2709
2710         return journal_file_open(fname, flags, mode, compress, seal,
2711                                  metrics, mmap_cache, template, ret);
2712 }
2713
2714 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2715         uint64_t i, n;
2716         uint64_t q, xor_hash = 0;
2717         int r;
2718         EntryItem *items;
2719         dual_timestamp ts;
2720
2721         assert(from);
2722         assert(to);
2723         assert(o);
2724         assert(p);
2725
2726         if (!to->writable)
2727                 return -EPERM;
2728
2729         ts.monotonic = le64toh(o->entry.monotonic);
2730         ts.realtime = le64toh(o->entry.realtime);
2731
2732         n = journal_file_entry_n_items(o);
2733         items = alloca(sizeof(EntryItem) * n);
2734
2735         for (i = 0; i < n; i++) {
2736                 uint64_t l, h;
2737                 le64_t le_hash;
2738                 size_t t;
2739                 void *data;
2740                 Object *u;
2741
2742                 q = le64toh(o->entry.items[i].object_offset);
2743                 le_hash = o->entry.items[i].hash;
2744
2745                 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2746                 if (r < 0)
2747                         return r;
2748
2749                 if (le_hash != o->data.hash)
2750                         return -EBADMSG;
2751
2752                 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2753                 t = (size_t) l;
2754
2755                 /* We hit the limit on 32bit machines */
2756                 if ((uint64_t) t != l)
2757                         return -E2BIG;
2758
2759                 if (o->object.flags & OBJECT_COMPRESSED) {
2760 #ifdef HAVE_XZ
2761                         uint64_t rsize;
2762
2763                         if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0))
2764                                 return -EBADMSG;
2765
2766                         data = from->compress_buffer;
2767                         l = rsize;
2768 #else
2769                         return -EPROTONOSUPPORT;
2770 #endif
2771                 } else
2772                         data = o->data.payload;
2773
2774                 r = journal_file_append_data(to, data, l, &u, &h);
2775                 if (r < 0)
2776                         return r;
2777
2778                 xor_hash ^= le64toh(u->data.hash);
2779                 items[i].object_offset = htole64(h);
2780                 items[i].hash = u->data.hash;
2781
2782                 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2783                 if (r < 0)
2784                         return r;
2785         }
2786
2787         return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2788 }
2789
2790 void journal_default_metrics(JournalMetrics *m, int fd) {
2791         uint64_t fs_size = 0;
2792         struct statvfs ss;
2793         char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2794
2795         assert(m);
2796         assert(fd >= 0);
2797
2798         if (fstatvfs(fd, &ss) >= 0)
2799                 fs_size = ss.f_frsize * ss.f_blocks;
2800
2801         if (m->max_use == (uint64_t) -1) {
2802
2803                 if (fs_size > 0) {
2804                         m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2805
2806                         if (m->max_use > DEFAULT_MAX_USE_UPPER)
2807                                 m->max_use = DEFAULT_MAX_USE_UPPER;
2808
2809                         if (m->max_use < DEFAULT_MAX_USE_LOWER)
2810                                 m->max_use = DEFAULT_MAX_USE_LOWER;
2811                 } else
2812                         m->max_use = DEFAULT_MAX_USE_LOWER;
2813         } else {
2814                 m->max_use = PAGE_ALIGN(m->max_use);
2815
2816                 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2817                         m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2818         }
2819
2820         if (m->max_size == (uint64_t) -1) {
2821                 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2822
2823                 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2824                         m->max_size = DEFAULT_MAX_SIZE_UPPER;
2825         } else
2826                 m->max_size = PAGE_ALIGN(m->max_size);
2827
2828         if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2829                 m->max_size = JOURNAL_FILE_SIZE_MIN;
2830
2831         if (m->max_size*2 > m->max_use)
2832                 m->max_use = m->max_size*2;
2833
2834         if (m->min_size == (uint64_t) -1)
2835                 m->min_size = JOURNAL_FILE_SIZE_MIN;
2836         else {
2837                 m->min_size = PAGE_ALIGN(m->min_size);
2838
2839                 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2840                         m->min_size = JOURNAL_FILE_SIZE_MIN;
2841
2842                 if (m->min_size > m->max_size)
2843                         m->max_size = m->min_size;
2844         }
2845
2846         if (m->keep_free == (uint64_t) -1) {
2847
2848                 if (fs_size > 0) {
2849                         m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2850
2851                         if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2852                                 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2853
2854                 } else
2855                         m->keep_free = DEFAULT_KEEP_FREE;
2856         }
2857
2858         log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2859                   format_bytes(a, sizeof(a), m->max_use),
2860                   format_bytes(b, sizeof(b), m->max_size),
2861                   format_bytes(c, sizeof(c), m->min_size),
2862                   format_bytes(d, sizeof(d), m->keep_free));
2863 }
2864
2865 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2866         assert(f);
2867         assert(from || to);
2868
2869         if (from) {
2870                 if (f->header->head_entry_realtime == 0)
2871                         return -ENOENT;
2872
2873                 *from = le64toh(f->header->head_entry_realtime);
2874         }
2875
2876         if (to) {
2877                 if (f->header->tail_entry_realtime == 0)
2878                         return -ENOENT;
2879
2880                 *to = le64toh(f->header->tail_entry_realtime);
2881         }
2882
2883         return 1;
2884 }
2885
2886 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2887         Object *o;
2888         uint64_t p;
2889         int r;
2890
2891         assert(f);
2892         assert(from || to);
2893
2894         r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2895         if (r <= 0)
2896                 return r;
2897
2898         if (le64toh(o->data.n_entries) <= 0)
2899                 return 0;
2900
2901         if (from) {
2902                 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2903                 if (r < 0)
2904                         return r;
2905
2906                 *from = le64toh(o->entry.monotonic);
2907         }
2908
2909         if (to) {
2910                 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2911                 if (r < 0)
2912                         return r;
2913
2914                 r = generic_array_get_plus_one(f,
2915                                                le64toh(o->data.entry_offset),
2916                                                le64toh(o->data.entry_array_offset),
2917                                                le64toh(o->data.n_entries)-1,
2918                                                &o, NULL);
2919                 if (r <= 0)
2920                         return r;
2921
2922                 *to = le64toh(o->entry.monotonic);
2923         }
2924
2925         return 1;
2926 }
2927
2928 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2929         assert(f);
2930
2931         /* If we gained new header fields we gained new features,
2932          * hence suggest a rotation */
2933         if (le64toh(f->header->header_size) < sizeof(Header)) {
2934                 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2935                 return true;
2936         }
2937
2938         /* Let's check if the hash tables grew over a certain fill
2939          * level (75%, borrowing this value from Java's hash table
2940          * implementation), and if so suggest a rotation. To calculate
2941          * the fill level we need the n_data field, which only exists
2942          * in newer versions. */
2943
2944         if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2945                 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2946                         log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2947                                   f->path,
2948                                   100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2949                                   le64toh(f->header->n_data),
2950                                   le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2951                                   (unsigned long long) f->last_stat.st_size,
2952                                   f->last_stat.st_size / le64toh(f->header->n_data));
2953                         return true;
2954                 }
2955
2956         if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2957                 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2958                         log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2959                                   f->path,
2960                                   100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2961                                   le64toh(f->header->n_fields),
2962                                   le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2963                         return true;
2964                 }
2965
2966         /* Are the data objects properly indexed by field objects? */
2967         if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2968             JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2969             le64toh(f->header->n_data) > 0 &&
2970             le64toh(f->header->n_fields) == 0)
2971                 return true;
2972
2973         if (max_file_usec > 0) {
2974                 usec_t t, h;
2975
2976                 h = le64toh(f->header->head_entry_realtime);
2977                 t = now(CLOCK_REALTIME);
2978
2979                 if (h > 0 && t > h + max_file_usec)
2980                         return true;
2981         }
2982
2983         return false;
2984 }