1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
30 #include "journal-def.h"
31 #include "journal-file.h"
// Default byte sizes of the data and field hash tables, expressed as a
// count of HashItem slots (2047 and 333).
36 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
37 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
// Payload size in bytes from which journal_file_append_data considers
// compressing a data object (compared with >=).
39 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
41 /* This is the minimum journal file size */
42 #define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL) /* 64 KiB */
44 /* These are the lower and upper bounds if we deduce the max_use value
45 * from the file system size */
46 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
47 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
49 /* This is the upper bound if we deduce max_size from max_use */
50 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
52 /* This is the upper bound if we deduce the keep_free value from the file system size */
54 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
56 /* This is the keep_free value when we can't determine the system size */
58 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
60 /* n_data was the first entry we added after the initial file format design */
61 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
// Rounds x up to the next multiple of 8.
63 #define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
// True when the header_size stored in h is large enough to contain the
// given Header field in full.
65 #define JOURNAL_HEADER_CONTAINS(h, field) \
66 (le64toh((h)->header_size) >= offsetof(Header, field) + sizeof((h)->field))
// Forward declarations of helpers that are defined further down in this file.
68 static int journal_file_maybe_append_tag(JournalFile *f, uint64_t realtime);
69 static int journal_file_hmac_put_object(JournalFile *f, int type, uint64_t p);
// Shuts down a JournalFile: appends a final tag, hands the fd to
// mmap_cache_close_fd, flips a writable header from STATE_ONLINE to
// STATE_OFFLINE, then unmaps the header, closes the fd, drops the mmap
// cache reference, frees the compression buffer, unmaps the FSPRG header
// and closes the HMAC handle.
// NOTE(review): the guards around several of these calls are not visible
// in this extract - confirm against the full file.
71 void journal_file_close(JournalFile *f) {
74 /* Write the final tag */
76 journal_file_append_tag(f);
78 /* Sync everything to disk, before we mark the file offline */
79 if (f->mmap && f->fd >= 0)
80 mmap_cache_close_fd(f->mmap, f->fd);
82 if (f->writable && f->fd >= 0)
86 /* Mark the file offline. Don't override the archived state if it already is set */
87 if (f->writable && f->header->state == STATE_ONLINE)
88 f->header->state = STATE_OFFLINE;
// The header mapping is released with the page-aligned size of Header.
90 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
94 close_nointr_nofail(f->fd);
99 mmap_cache_unref(f->mmap);
102 free(f->compress_buffer);
107 munmap(f->fsprg_header, PAGE_ALIGN(f->fsprg_size));
110 gcry_md_close(f->hmac);
// Builds a fresh Header in the local h and writes it to offset 0 of f->fd.
// Fills in the signature, the 8-byte-aligned header size, the compressed
// and authenticated flag words (little endian) and a random file_id.
// seqnum_id and tail_seqnum are copied from template, or seqnum_id falls
// back to the new file_id; presumably selected by whether template is set
// (the condition is not visible in this extract).
116 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
124 memcpy(h.signature, HEADER_SIGNATURE, 8);
125 h.header_size = htole64(ALIGN64(sizeof(h)));
127 h.incompatible_flags =
128 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
131 htole32(f->authenticate ? HEADER_COMPATIBLE_AUTHENTICATED : 0);
133 r = sd_id128_randomize(&h.file_id);
138 h.seqnum_id = template->header->seqnum_id;
139 h.tail_seqnum = template->header->tail_seqnum;
141 h.seqnum_id = h.file_id;
// The whole struct is written in one pwrite at file offset 0.
143 k = pwrite(f->fd, &h, sizeof(h), 0);
// Refreshes the mapped header for a new writer session: stores the machine
// id, fetches the current boot id, marks the tail monotonic timestamp as
// valid when the stored boot id equals the current one, records the boot
// id, sets the state to STATE_ONLINE and msyncs the header page.
153 static int journal_file_refresh_header(JournalFile *f) {
159 r = sd_id128_get_machine(&f->header->machine_id);
163 r = sd_id128_get_boot(&boot_id);
// Must compare against the old boot_id before it is overwritten below.
167 if (sd_id128_equal(boot_id, f->header->boot_id))
168 f->tail_entry_monotonic_valid = true;
170 f->header->boot_id = boot_id;
172 f->header->state = STATE_ONLINE;
174 /* Sync the online state to disk */
175 msync(f->header, PAGE_ALIGN(sizeof(Header)), MS_SYNC);
// Validates the mapped header before the file is used.
// Checks the signature, rejects unknown incompatible flags, rejects unknown
// compatible flags (per the inner comment only when open for writing),
// requires header_size to be at least HEADER_SIZE_MIN and requires
// header_size + arena_size to fit into the size recorded in last_stat.
// It then compares the machine id, inspects the state field and finally
// derives f->compress and f->authenticate from the flag words.
// Unknown flags yield -EPROTONOSUPPORT; other error codes are not visible
// in this extract.
181 static int journal_file_verify_header(JournalFile *f) {
184 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
187 /* In both read and write mode we refuse to open files with
188 * incompatible flags we don't know */
// NOTE(review): two alternative flag checks follow; presumably they are
// selected by a preprocessor conditional that this extract dropped.
190 if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
191 return -EPROTONOSUPPORT;
193 if (f->header->incompatible_flags != 0)
194 return -EPROTONOSUPPORT;
197 /* When open for writing we refuse to open files with
198 * compatible flags, too */
201 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_AUTHENTICATED) != 0)
202 return -EPROTONOSUPPORT;
204 if (f->header->compatible_flags != 0)
205 return -EPROTONOSUPPORT;
209 /* The first addition was n_data, so check that we are at least this large */
210 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
// The file on disk must be at least as large as header plus arena.
213 if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
218 sd_id128_t machine_id;
221 r = sd_id128_get_machine(&machine_id);
225 if (!sd_id128_equal(machine_id, f->header->machine_id))
228 state = f->header->state;
230 if (state == STATE_ONLINE) {
231 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
233 } else if (state == STATE_ARCHIVED)
235 else if (state != STATE_OFFLINE) {
236 log_debug("Journal file %s has unknown state %u.", f->path, state);
// Cache the feature bits of the file on the JournalFile object.
241 f->compress = !!(le32toh(f->header->incompatible_flags) & HEADER_INCOMPATIBLE_COMPRESSED);
242 f->authenticate = !!(le32toh(f->header->compatible_flags) & HEADER_COMPATIBLE_AUTHENTICATED);
// Makes sure the file is large enough to hold size bytes at offset.
// The current size is taken as header_size + arena_size; the wanted size
// is the page-aligned end of the range, but never less than header_size.
// Growth is checked against metrics.max_size and, above metrics.min_size,
// against the free space reported by fstatvfs minus metrics.keep_free.
// The new area is reserved with posix_fallocate, last_stat is refreshed
// and arena_size in the header is updated.
// NOTE(review): the block comments inside this function lost their closing
// lines in this extract.
247 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
248 uint64_t old_size, new_size;
253 /* We assume that this file is not sparse, and we know that
254 * for sure, since we always call posix_fallocate()
258 le64toh(f->header->header_size) +
259 le64toh(f->header->arena_size);
261 new_size = PAGE_ALIGN(offset + size);
262 if (new_size < le64toh(f->header->header_size))
263 new_size = le64toh(f->header->header_size);
// Nothing to do when the file already covers the requested range.
265 if (new_size <= old_size)
268 if (f->metrics.max_size > 0 &&
269 new_size > f->metrics.max_size)
272 if (new_size > f->metrics.min_size &&
273 f->metrics.keep_free > 0) {
276 if (fstatvfs(f->fd, &svfs) >= 0) {
279 available = svfs.f_bfree * svfs.f_bsize;
281 if (available >= f->metrics.keep_free)
282 available -= f->metrics.keep_free;
286 if (new_size - old_size > available)
291 /* Note that the glibc fallocate() fallback is very
292 inefficient, hence we try to minimize the allocation area
// Only the delta between old and new size is allocated.
294 r = posix_fallocate(f->fd, old_size, new_size - old_size);
298 if (fstat(f->fd, &f->last_stat) < 0)
301 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
306 static int journal_file_move_to(JournalFile *f, int context, uint64_t offset, uint64_t size, void **ret) {
310 /* Avoid SIGBUS on invalid accesses */
311 if (offset + size > (uint64_t) f->last_stat.st_size) {
312 /* Hmm, out of range? Let's refresh the fstat() data
313 * first, before we trust that check. */
315 if (fstat(f->fd, &f->last_stat) < 0 ||
316 offset + size > (uint64_t) f->last_stat.st_size)
317 return -EADDRNOTAVAIL;
320 return mmap_cache_get(f->mmap, f->fd, f->prot, context, offset, size, ret);
// Recomputes the payload hash of an object and compares it with the hash
// stored in the object. Only uncompressed OBJECT_DATA and OBJECT_FIELD
// objects are hashed here; the payload length is the object size minus the
// offset of the payload member.
// NOTE(review): the handling of other object types and the final
// comparison are not visible in this extract.
323 static bool verify_hash(Object *o) {
328 if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
329 h1 = le64toh(o->data.hash);
330 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
331 } else if (o->object.type == OBJECT_FIELD) {
332 h1 = le64toh(o->field.hash);
333 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
340 static uint64_t minimum_header_size(Object *o) {
342 static uint64_t table[] = {
343 [OBJECT_DATA] = sizeof(DataObject),
344 [OBJECT_FIELD] = sizeof(FieldObject),
345 [OBJECT_ENTRY] = sizeof(EntryObject),
346 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
347 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
348 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
349 [OBJECT_TAG] = sizeof(TagObject),
352 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
353 return sizeof(ObjectHeader);
355 return table[o->object.type];
// Maps the object at offset and validates its header.
// First only sizeof(ObjectHeader) bytes are mapped to read the size and
// type. The size must cover the object header and the minimum size for
// the type, the type must be above OBJECT_UNUSED and, when type is >= 0,
// equal to the requested type. Objects larger than the bare header are
// then mapped again with their full size.
// NOTE(review): the error codes returned by the checks are not visible in
// this extract.
358 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
368 /* One context for each type, plus one catch-all for the rest */
369 context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
371 r = journal_file_move_to(f, context, offset, sizeof(ObjectHeader), &t);
376 s = le64toh(o->object.size);
378 if (s < sizeof(ObjectHeader))
381 if (o->object.type <= OBJECT_UNUSED)
384 if (s < minimum_header_size(o))
// A negative type acts as a wildcard and skips the type comparison.
387 if (type >= 0 && o->object.type != type)
390 if (s > sizeof(ObjectHeader)) {
391 r = journal_file_move_to(f, o->object.type, offset, s, &t);
// Picks the sequence number for the next entry: tail_seqnum + 1 from the
// header, written back as the new tail_seqnum and, while head_seqnum is
// still zero, as head_seqnum as well.
// NOTE(review): the code that merges the external seqnum counter described
// by the inner comment is not visible in this extract.
405 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
410 r = le64toh(f->header->tail_seqnum) + 1;
413 /* If an external seqnum counter was passed, we update
414 * both the local and the external one, and set it to
415 * the maximum of both */
423 f->header->tail_seqnum = htole64(r);
425 if (f->header->head_seqnum == 0)
426 f->header->head_seqnum = htole64(r);
// Appends a new object of the given type and size to the end of the file.
// The start position is derived from tail_object_offset: either directly
// after the header or after the 8-byte-aligned end of the current tail
// object (presumably depending on whether a tail object exists; the
// condition is not visible in this extract). The file is grown with
// journal_file_allocate, the new range is mapped, type and size are
// stored in the object header and tail_object_offset and n_objects in the
// file header are updated.
431 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
438 assert(type > 0 && type < _OBJECT_TYPE_MAX);
439 assert(size >= sizeof(ObjectHeader));
443 p = le64toh(f->header->tail_object_offset);
445 p = le64toh(f->header->header_size);
447 r = journal_file_move_to_object(f, -1, p, &tail);
451 p += ALIGN64(le64toh(tail->object.size));
454 r = journal_file_allocate(f, p, size);
458 r = journal_file_move_to(f, type, p, size, &t);
465 o->object.type = type;
466 o->object.size = htole64(size);
468 f->header->tail_object_offset = htole64(p);
469 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
// Creates the data hash table object of a new file.
// The table size in bytes is derived from metrics.max_size and is never
// smaller than DEFAULT_DATA_HASH_TABLE_SIZE. The table is appended as an
// OBJECT_DATA_HASH_TABLE object, its items are zeroed and the offset of
// the items array plus the table size are stored in the header.
477 static int journal_file_setup_data_hash_table(JournalFile *f) {
484 /* We estimate that we need 1 hash table entry per 768 of
485 journal file and we want to make sure we never get beyond
486 75% fill level. Calculate the hash table size for the
487 maximum file size based on these metrics. */
489 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
490 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
491 s = DEFAULT_DATA_HASH_TABLE_SIZE;
493 log_info("Reserving %llu entries in hash table.", (unsigned long long) (s / sizeof(HashItem)));
495 r = journal_file_append_object(f,
496 OBJECT_DATA_HASH_TABLE,
497 offsetof(Object, hash_table.items) + s,
502 memset(o->hash_table.items, 0, s);
// The header points at the items array, not at the object header.
504 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
505 f->header->data_hash_table_size = htole64(s);
// Creates the field hash table object of a new file with the fixed size
// DEFAULT_FIELD_HASH_TABLE_SIZE: appends an OBJECT_FIELD_HASH_TABLE
// object, zeroes its items and stores the offset of the items array and
// the table size in the header.
510 static int journal_file_setup_field_hash_table(JournalFile *f) {
517 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
518 r = journal_file_append_object(f,
519 OBJECT_FIELD_HASH_TABLE,
520 offsetof(Object, hash_table.items) + s,
525 memset(o->hash_table.items, 0, s);
527 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
528 f->header->field_hash_table_size = htole64(s);
// Maps the data hash table whose offset and size are read from the header
// and caches the resulting pointer in f->data_hash_table. The mapping uses
// the OBJECT_DATA_HASH_TABLE context of journal_file_move_to.
533 static int journal_file_map_data_hash_table(JournalFile *f) {
540 p = le64toh(f->header->data_hash_table_offset);
541 s = le64toh(f->header->data_hash_table_size);
543 r = journal_file_move_to(f,
544 OBJECT_DATA_HASH_TABLE,
550 f->data_hash_table = t;
// Maps the field hash table whose offset and size are read from the header
// and caches the resulting pointer in f->field_hash_table. The mapping
// uses the OBJECT_FIELD_HASH_TABLE context of journal_file_move_to.
554 static int journal_file_map_field_hash_table(JournalFile *f) {
561 p = le64toh(f->header->field_hash_table_offset);
562 s = le64toh(f->header->field_hash_table_size);
564 r = journal_file_move_to(f,
565 OBJECT_FIELD_HASH_TABLE,
571 f->field_hash_table = t;
// Hooks a freshly written data object into the data hash table.
// The link fields of the object are reset, the bucket is chosen as
// hash modulo the number of HashItem slots, and the object offset is
// stored either as the bucket head or as next_hash_offset of the previous
// tail object (presumably depending on whether the bucket was empty; the
// condition is not visible in this extract). The bucket tail is always
// set to the new object and n_data is incremented when the header is
// large enough to contain it.
// NOTE(review): one block comment in this function lost its closing line
// in this extract.
575 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
582 assert(o->object.type == OBJECT_DATA);
584 /* This might alter the window we are looking at */
586 o->data.next_hash_offset = o->data.next_field_offset = 0;
587 o->data.entry_offset = o->data.entry_array_offset = 0;
588 o->data.n_entries = 0;
590 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
591 p = le64toh(f->data_hash_table[h].tail_hash_offset);
593 /* Only entry in the hash table is easy */
594 f->data_hash_table[h].head_hash_offset = htole64(offset);
596 /* Move back to the previous data object, to patch in
599 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
603 o->data.next_hash_offset = htole64(offset);
606 f->data_hash_table[h].tail_hash_offset = htole64(offset);
608 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
609 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
// Looks up a data object with the given payload and precomputed hash.
// The bucket is hash modulo the number of HashItem slots; the chain is
// followed via next_hash_offset. Objects with a different stored hash are
// skipped. Compressed objects are uncompressed into f->compress_buffer
// before the payload is compared; uncompressed ones are compared directly
// after checking that the object size equals the expected size.
// -EPROTONOSUPPORT is returned on one path, presumably when compression
// support is compiled out (the condition is not visible in this extract).
614 int journal_file_find_data_object_with_hash(
616 const void *data, uint64_t size, uint64_t hash,
617 Object **ret, uint64_t *offset) {
619 uint64_t p, osize, h;
623 assert(data || size == 0);
625 osize = offsetof(Object, data.payload) + size;
// An empty hash table cannot contain the object.
627 if (f->header->data_hash_table_size == 0)
630 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
631 p = le64toh(f->data_hash_table[h].head_hash_offset);
636 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
640 if (le64toh(o->data.hash) != hash)
643 if (o->object.flags & OBJECT_COMPRESSED) {
647 l = le64toh(o->object.size);
648 if (l <= offsetof(Object, data.payload))
651 l -= offsetof(Object, data.payload);
653 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
657 memcmp(f->compress_buffer, data, size) == 0) {
668 return -EPROTONOSUPPORT;
671 } else if (le64toh(o->object.size) == osize &&
672 memcmp(o->data.payload, data, size) == 0) {
684 p = le64toh(o->data.next_hash_offset);
// Convenience wrapper: hashes the payload with hash64 and delegates the
// lookup to journal_file_find_data_object_with_hash.
690 int journal_file_find_data_object(
692 const void *data, uint64_t size,
693 Object **ret, uint64_t *offset) {
698 assert(data || size == 0);
700 hash = hash64(data, size);
702 return journal_file_find_data_object_with_hash(f,
// Appends a data object for the given payload.
// The payload is hashed and looked up first with
// journal_file_find_data_object_with_hash (the handling of a hit is not
// visible in this extract). Otherwise a new OBJECT_DATA object is
// appended, the hash is stored, payloads of at least
// COMPRESSION_SIZE_THRESHOLD bytes are passed to compress_blob and on
// success the object size is shrunk and OBJECT_COMPRESSED is set.
// Uncompressed payloads are copied verbatim. The object is then linked
// into the hash table, fed to the HMAC and mapped again.
707 static int journal_file_append_data(
709 const void *data, uint64_t size,
710 Object **ret, uint64_t *offset) {
716 bool compressed = false;
719 assert(data || size == 0);
721 hash = hash64(data, size);
723 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
737 osize = offsetof(Object, data.payload) + size;
738 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
742 o->data.hash = htole64(hash);
746 size >= COMPRESSION_SIZE_THRESHOLD) {
749 compressed = compress_blob(data, size, o->data.payload, &rsize);
752 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
753 o->object.flags |= OBJECT_COMPRESSED;
// NOTE(review): size and rsize are cast to unsigned long for logging,
// which may be narrower than 64 bit on some platforms.
755 log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
760 if (!compressed && size > 0)
761 memcpy(o->data.payload, data, size);
763 r = journal_file_link_data(f, o, p, hash);
767 r = journal_file_hmac_put_object(f, OBJECT_DATA, p);
771 /* The linking might have altered the window, so let's
772 * refresh our pointer */
773 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
// Returns how many EntryItem slots an entry object holds, computed from
// the object size minus the offset of the items array. The object must be
// of type OBJECT_ENTRY.
786 uint64_t journal_file_entry_n_items(Object *o) {
788 assert(o->object.type == OBJECT_ENTRY);
790 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
// Returns how many 64 bit offsets an entry array object can hold, computed
// from the object size minus the offset of the items array. The object
// must be of type OBJECT_ENTRY_ARRAY.
793 static uint64_t journal_file_entry_array_n_items(Object *o) {
795 assert(o->object.type == OBJECT_ENTRY_ARRAY);
797 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
// Stores the entry offset p at index *idx of a chained entry array.
// The chain is walked via next_entry_array_offset; when the index falls
// into an existing array the slot is written and *idx is incremented.
// Otherwise a new OBJECT_ENTRY_ARRAY object is appended, fed to the HMAC,
// filled with p and linked from the previous array at offset ap.
// *idx is kept in little endian form.
// NOTE(review): the loop structure and the sizing of the new array are
// not visible in this extract.
800 static int link_entry_into_array(JournalFile *f,
805 uint64_t n = 0, ap = 0, q, i, a, hidx;
814 i = hidx = le64toh(*idx);
817 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
821 n = journal_file_entry_array_n_items(o);
823 o->entry_array.items[i] = htole64(p);
824 *idx = htole64(hidx + 1);
830 a = le64toh(o->entry_array.next_entry_array_offset);
841 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
842 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
847 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, q);
851 o->entry_array.items[i] = htole64(p);
856 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
860 o->entry_array.next_entry_array_offset = htole64(q);
863 *idx = htole64(hidx + 1);
// Variant of link_entry_into_array for lists that keep one extra slot
// outside the array chain: the array is addressed with *idx - 1 and *idx
// itself is incremented afterwards.
// NOTE(review): the handling of the extra slot is not visible in this
// extract.
868 static int link_entry_into_array_plus_one(JournalFile *f,
887 i = htole64(le64toh(*idx) - 1);
888 r = link_entry_into_array(f, first, &i, p);
893 *idx = htole64(le64toh(*idx) + 1);
// Links an entry into the per-data entry list of its i-th item: reads the
// data object offset from the entry item, maps that data object and adds
// the entry via link_entry_into_array_plus_one using entry_offset and
// entry_array_offset of the data object.
897 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
904 p = le64toh(o->entry.items[i].object_offset);
908 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
912 return link_entry_into_array_plus_one(f,
913 &o->data.entry_offset,
914 &o->data.entry_array_offset,
// Makes a written entry object reachable: after a full memory barrier the
// entry is added to the global entry array of the header, the head and
// tail realtime and the tail monotonic timestamps in the header are
// updated, tail_entry_monotonic_valid is set and every item of the entry
// is linked to its data object.
919 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
926 assert(o->object.type == OBJECT_ENTRY);
928 __sync_synchronize();
930 /* Link up the entry itself */
931 r = link_entry_into_array(f,
932 &f->header->entry_array_offset,
933 &f->header->n_entries,
938 /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
// The head timestamp is only set once, for the first entry of the file.
940 if (f->header->head_entry_realtime == 0)
941 f->header->head_entry_realtime = o->entry.realtime;
943 f->header->tail_entry_realtime = o->entry.realtime;
944 f->header->tail_entry_monotonic = o->entry.monotonic;
946 f->tail_entry_monotonic_valid = true;
948 /* Link up the items */
949 n = journal_file_entry_n_items(o);
950 for (i = 0; i < n; i++) {
951 r = journal_file_link_entry_item(f, o, offset, i);
// Writes an entry object for already appended data items.
// The object is sized for n_items EntryItem slots, appended as
// OBJECT_ENTRY and filled with a fresh sequence number, the items, both
// timestamps of ts, the xor hash and the boot id from the header. It is
// then fed to the HMAC and linked with journal_file_link_entry.
959 static int journal_file_append_entry_internal(
961 const dual_timestamp *ts,
963 const EntryItem items[], unsigned n_items,
965 Object **ret, uint64_t *offset) {
972 assert(items || n_items == 0);
975 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
977 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
981 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
982 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
983 o->entry.realtime = htole64(ts->realtime);
984 o->entry.monotonic = htole64(ts->monotonic);
985 o->entry.xor_hash = htole64(xor_hash);
986 o->entry.boot_id = f->header->boot_id;
988 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, np);
992 r = journal_file_link_entry(f, o, np);
1005 void journal_file_post_change(JournalFile *f) {
1008 /* inotify() does not receive IN_MODIFY events from file
1009 * accesses done via mmap(). After each access we hence
1010 * trigger IN_MODIFY by truncating the journal file to its
1011 * current size which triggers IN_MODIFY. */
1013 __sync_synchronize();
1015 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1016 log_error("Failed to to truncate file to its own size: %m");
// Public entry point for appending a log entry built from n_iovec fields.
// A local timestamp is taken with dual_timestamp_get (presumably only when
// ts is not supplied; the condition is not visible in this extract).
// Entries whose monotonic time is older than the tail entry are rejected
// while tail_entry_monotonic_valid is set. A tag is appended if needed,
// every iovec is stored as a data object, the hashes are xor-ed together,
// the entry object is written and journal_file_post_change is called.
1019 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1023 uint64_t xor_hash = 0;
1024 struct dual_timestamp _ts;
1027 assert(iovec || n_iovec == 0);
1033 dual_timestamp_get(&_ts);
1037 if (f->tail_entry_monotonic_valid &&
1038 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1041 r = journal_file_maybe_append_tag(f, ts->realtime);
1045 /* alloca() can't take 0, hence let's allocate at least one */
// NOTE(review): the item array lives on the stack and scales with n_iovec.
1046 items = alloca(sizeof(EntryItem) * MAX(1, n_iovec));
1048 for (i = 0; i < n_iovec; i++) {
1052 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1056 xor_hash ^= le64toh(o->data.hash);
1057 items[i].object_offset = htole64(p);
1058 items[i].hash = o->data.hash;
1061 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1063 journal_file_post_change(f);
// Fetches the entry stored at a given index of a chained entry array.
// The chain is walked array by array via next_entry_array_offset; once the
// array holding the index is reached the entry offset is read from its
// items. A zero array offset or zero entry offset ends the lookup without
// a result, otherwise the entry object is mapped.
// NOTE(review): the loop structure and the return values are not visible
// in this extract.
1068 static int generic_array_get(JournalFile *f,
1071 Object **ret, uint64_t *offset) {
1083 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1087 n = journal_file_entry_array_n_items(o);
1089 p = le64toh(o->entry_array.items[i]);
1094 a = le64toh(o->entry_array.next_entry_array_offset);
1097 if (a <= 0 || p <= 0)
1100 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
// Variant of generic_array_get for lists with one extra entry stored
// outside the array chain: the extra entry is mapped directly, all other
// indexes are forwarded to generic_array_get shifted down by one.
// NOTE(review): the index test selecting between the two paths is not
// visible in this extract.
1113 static int generic_array_get_plus_one(JournalFile *f,
1117 Object **ret, uint64_t *offset) {
1126 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1139 return generic_array_get(f, first, i-1, ret, offset);
// Searches a chained entry array for needle using the test_object
// callback. For every array in the chain the last usable item is tested
// first; when the needle lies within the array a binary search between
// left and right narrows the position, otherwise the walk continues with
// the next array. A TEST_FOUND result is mapped to TEST_RIGHT for
// DIRECTION_DOWN and to TEST_LEFT for DIRECTION_UP. For DIRECTION_UP the
// match may need to step back by one item, tracked in subtract_one.
// On success the entry object is mapped and *idx receives the overall
// index t + i, minus one when subtract_one is set.
// NOTE(review): large parts of the loop body are not visible in this
// extract; treat this summary as partial.
1148 static int generic_array_bisect(JournalFile *f,
1152 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1153 direction_t direction,
1158 uint64_t a, p, t = 0, i = 0, last_p = 0;
1159 bool subtract_one = false;
1160 Object *o, *array = NULL;
1164 assert(test_object);
1168 uint64_t left, right, k, lp;
1170 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1174 k = journal_file_entry_array_n_items(array);
1180 lp = p = le64toh(array->entry_array.items[i]);
1184 r = test_object(f, p, needle);
1188 if (r == TEST_FOUND)
1189 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1191 if (r == TEST_RIGHT) {
1195 if (left == right) {
1196 if (direction == DIRECTION_UP)
1197 subtract_one = true;
1203 assert(left < right);
1205 i = (left + right) / 2;
1206 p = le64toh(array->entry_array.items[i]);
1210 r = test_object(f, p, needle);
1214 if (r == TEST_FOUND)
1215 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1217 if (r == TEST_RIGHT)
1225 if (direction == DIRECTION_UP) {
1227 subtract_one = true;
1238 a = le64toh(array->entry_array.next_entry_array_offset);
// Stepping back from the very first item of the first array is impossible.
1244 if (subtract_one && t == 0 && i == 0)
1247 if (subtract_one && i == 0)
1249 else if (subtract_one)
1250 p = le64toh(array->entry_array.items[i-1]);
1252 p = le64toh(array->entry_array.items[i]);
1254 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1265 *idx = t + i + (subtract_one ? -1 : 0);
// Variant of generic_array_bisect for lists with one extra entry stored
// outside the array chain. The extra entry is tested first; depending on
// the result and the direction either the extra entry itself is returned
// or the search continues in the array with n-1 items. For DIRECTION_UP
// the extra entry is remembered via step_back and used when the array
// search finds nothing.
// NOTE(review): two block comments in this function lost their closing
// lines in this extract.
1270 static int generic_array_bisect_plus_one(JournalFile *f,
1275 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1276 direction_t direction,
1282 bool step_back = false;
1286 assert(test_object);
1291 /* This bisects the array in object 'first', but first checks
1293 r = test_object(f, extra, needle);
1297 if (r == TEST_FOUND)
1298 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1300 /* if we are looking with DIRECTION_UP then we need to first
1301 see if in the actual array there is a matching entry, and
1302 return the last one of that. But if there isn't any we need
1303 to return this one. Hence remember this, and return it
1306 step_back = direction == DIRECTION_UP;
1308 if (r == TEST_RIGHT) {
1309 if (direction == DIRECTION_DOWN)
1315 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1317 if (r == 0 && step_back)
1326 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
// Bisection callback that compares the object offset p itself with needle.
// NOTE(review): only the p < needle branch is visible in this extract; the
// returned TEST_ values should be confirmed against the full file.
1342 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1348 else if (p < needle)
// Seeks in the global entry array of the header (entry_array_offset,
// n_entries) with generic_array_bisect.
// NOTE(review): the needle and callback arguments are not visible in this
// extract; presumably the search is by file offset.
1354 int journal_file_move_to_entry_by_offset(
1357 direction_t direction,
1361 return generic_array_bisect(f,
1362 le64toh(f->header->entry_array_offset),
1363 le64toh(f->header->n_entries),
// Bisection callback that maps the entry at p and compares its sequence
// number with needle (equal, smaller, otherwise).
// NOTE(review): the returned TEST_ values are not visible in this extract.
1371 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1378 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1382 if (le64toh(o->entry.seqnum) == needle)
1384 else if (le64toh(o->entry.seqnum) < needle)
// Seeks in the global entry array of the header (entry_array_offset,
// n_entries) with generic_array_bisect.
// NOTE(review): the needle and callback arguments are not visible in this
// extract; presumably the search is by sequence number.
1390 int journal_file_move_to_entry_by_seqnum(
1393 direction_t direction,
1397 return generic_array_bisect(f,
1398 le64toh(f->header->entry_array_offset),
1399 le64toh(f->header->n_entries),
// Bisection callback that maps the entry at p and compares its realtime
// timestamp with needle (equal, smaller, otherwise).
// NOTE(review): the returned TEST_ values are not visible in this extract.
1406 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1413 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1417 if (le64toh(o->entry.realtime) == needle)
1419 else if (le64toh(o->entry.realtime) < needle)
// Seeks in the global entry array of the header (entry_array_offset,
// n_entries) by realtime timestamp, using generic_array_bisect with the
// test_object_realtime callback.
1425 int journal_file_move_to_entry_by_realtime(
1428 direction_t direction,
1432 return generic_array_bisect(f,
1433 le64toh(f->header->entry_array_offset),
1434 le64toh(f->header->n_entries),
1436 test_object_realtime,
// Bisection callback that maps the entry at p and compares its monotonic
// timestamp with needle (equal, smaller, otherwise).
// NOTE(review): the returned TEST_ values are not visible in this extract.
1441 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1448 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1452 if (le64toh(o->entry.monotonic) == needle)
1454 else if (le64toh(o->entry.monotonic) < needle)
// Seeks by monotonic timestamp within one boot. The boot id is formatted
// into a _BOOT_ID= field string (9 prefix bytes, 32 id characters and a
// terminator), the matching data object is looked up and its entry list
// is bisected with the test_object_monotonic callback.
1460 int journal_file_move_to_entry_by_monotonic(
1464 direction_t direction,
1468 char t[9+32+1] = "_BOOT_ID=";
1474 sd_id128_to_string(boot_id, t + 9);
1475 r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1481 return generic_array_bisect_plus_one(f,
1482 le64toh(o->data.entry_offset),
1483 le64toh(o->data.entry_array_offset),
1484 le64toh(o->data.n_entries),
1486 test_object_monotonic,
// Moves to the entry following or preceding the entry o at offset p in
// the global entry array, or to the first or last entry when no current
// entry is given (index 0 for DIRECTION_DOWN, n - 1 otherwise).
// With a current entry its index is located with generic_array_bisect and
// then adjusted by direction; the resulting index is fetched with
// generic_array_get.
// NOTE(review): the index adjustment and bounds checks are not visible in
// this extract.
1491 int journal_file_next_entry(
1493 Object *o, uint64_t p,
1494 direction_t direction,
1495 Object **ret, uint64_t *offset) {
1501 assert(p > 0 || !o);
1503 n = le64toh(f->header->n_entries);
1508 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1510 if (o->object.type != OBJECT_ENTRY)
1513 r = generic_array_bisect(f,
1514 le64toh(f->header->entry_array_offset),
1515 le64toh(f->header->n_entries),
1524 if (direction == DIRECTION_DOWN) {
1537 /* And jump to it */
1538 return generic_array_get(f,
1539 le64toh(f->header->entry_array_offset),
// Moves a signed number of entries away from the entry o at offset p in
// the global entry array. The index of the current entry is located with
// generic_array_bisect, the skip is applied to it (negative skips are
// subtracted after a range check against the index), and the target is
// fetched with generic_array_get.
// NOTE(review): the clamping against n_entries is not visible in this
// extract.
1544 int journal_file_skip_entry(
1546 Object *o, uint64_t p,
1548 Object **ret, uint64_t *offset) {
1557 if (o->object.type != OBJECT_ENTRY)
1560 r = generic_array_bisect(f,
1561 le64toh(f->header->entry_array_offset),
1562 le64toh(f->header->n_entries),
1571 /* Calculate new index */
1573 if ((uint64_t) -skip >= i)
1576 i = i - (uint64_t) -skip;
1578 i += (uint64_t) skip;
1580 n = le64toh(f->header->n_entries);
1587 return generic_array_get(f,
1588 le64toh(f->header->entry_array_offset),
// Like journal_file_next_entry, but restricted to the entries that
// reference the data object at data_offset. The data object is mapped, its
// n_entries gives the list length, and without a current entry the first
// or last index is chosen by direction. With a current entry its index is
// located with generic_array_bisect_plus_one and the target is fetched
// with generic_array_get_plus_one.
// NOTE(review): the index adjustment and bounds checks are not visible in
// this extract.
1593 int journal_file_next_entry_for_data(
1595 Object *o, uint64_t p,
1596 uint64_t data_offset,
1597 direction_t direction,
1598 Object **ret, uint64_t *offset) {
1605 assert(p > 0 || !o);
1607 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1611 n = le64toh(d->data.n_entries);
1616 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1618 if (o->object.type != OBJECT_ENTRY)
1621 r = generic_array_bisect_plus_one(f,
1622 le64toh(d->data.entry_offset),
1623 le64toh(d->data.entry_array_offset),
1624 le64toh(d->data.n_entries),
1634 if (direction == DIRECTION_DOWN) {
1648 return generic_array_get_plus_one(f,
1649 le64toh(d->data.entry_offset),
1650 le64toh(d->data.entry_array_offset),
// Seeks within the entry list of the data object at data_offset using
// generic_array_bisect_plus_one.
// NOTE(review): the needle and callback arguments are not visible in this
// extract; presumably the search is by file offset.
1655 int journal_file_move_to_entry_by_offset_for_data(
1657 uint64_t data_offset,
1659 direction_t direction,
1660 Object **ret, uint64_t *offset) {
1667 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1671 return generic_array_bisect_plus_one(f,
1672 le64toh(d->data.entry_offset),
1673 le64toh(d->data.entry_array_offset),
1674 le64toh(d->data.n_entries),
// Seeks by monotonic timestamp within one boot, restricted to entries that
// reference the data object at data_offset.
// The _BOOT_ID= data object is looked up and bisected by monotonic time
// first. Afterwards the entry list of the requested data object and the
// entry list of the boot id object are bisected in turn until, per the
// inner comment, an entry is found that exists in both.
// NOTE(review): the loop and its termination condition are not visible in
// this extract.
1681 int journal_file_move_to_entry_by_monotonic_for_data(
1683 uint64_t data_offset,
1686 direction_t direction,
1687 Object **ret, uint64_t *offset) {
1689 char t[9+32+1] = "_BOOT_ID=";
1696 /* First, seek by time */
1697 sd_id128_to_string(boot_id, t + 9);
1698 r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
1704 r = generic_array_bisect_plus_one(f,
1705 le64toh(o->data.entry_offset),
1706 le64toh(o->data.entry_array_offset),
1707 le64toh(o->data.n_entries),
1709 test_object_monotonic,
1715 /* And now, continue seeking until we find an entry that
1716 * exists in both bisection arrays */
1722 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1726 r = generic_array_bisect_plus_one(f,
1727 le64toh(d->data.entry_offset),
1728 le64toh(d->data.entry_array_offset),
1729 le64toh(d->data.n_entries),
// The boot id object is mapped again by its offset b before each use.
1737 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
1741 r = generic_array_bisect_plus_one(f,
1742 le64toh(o->data.entry_offset),
1743 le64toh(o->data.entry_array_offset),
1744 le64toh(o->data.n_entries),
// Seeks within the entry list of the data object at data_offset using
// generic_array_bisect_plus_one.
// NOTE(review): the needle and callback arguments are not visible in this
// extract; presumably the search is by sequence number.
1768 int journal_file_move_to_entry_by_seqnum_for_data(
1770 uint64_t data_offset,
1772 direction_t direction,
1773 Object **ret, uint64_t *offset) {
1780 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1784 return generic_array_bisect_plus_one(f,
1785 le64toh(d->data.entry_offset),
1786 le64toh(d->data.entry_array_offset),
1787 le64toh(d->data.n_entries),
// Seeks by realtime timestamp within the entry list of the data object at
// data_offset, using generic_array_bisect_plus_one with the
// test_object_realtime callback.
1794 int journal_file_move_to_entry_by_realtime_for_data(
1796 uint64_t data_offset,
1798 direction_t direction,
1799 Object **ret, uint64_t *offset) {
1806 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1810 return generic_array_bisect_plus_one(f,
1811 le64toh(d->data.entry_offset),
1812 le64toh(d->data.entry_array_offset),
1813 le64toh(d->data.n_entries),
1815 test_object_realtime,
// Returns a pointer to the FSPRG state blob, which starts header_size
// bytes into the mapped FSPRG header. The result depends on
// f->authenticate and on header_size + state_size fitting into
// f->fsprg_size.
// NOTE(review): the values returned on the two guard paths are not visible
// in this extract; callers in this file pass the result on unchecked.
1820 static void *fsprg_state(JournalFile *f) {
1824 if (!f->authenticate)
1827 a = le64toh(f->fsprg_header->header_size);
1828 b = le64toh(f->fsprg_header->state_size);
1830 if (a + b > f->fsprg_size)
1833 return (uint8_t*) f->fsprg_header + a;
// Computes the sequence number for the next tag object as n_tags + 1 and
// stores it back in the header as the new n_tags.
1836 static uint64_t journal_file_tag_seqnum(JournalFile *f) {
1841 r = le64toh(f->header->n_tags) + 1;
1842 f->header->n_tags = htole64(r);
// Closes the current HMAC cycle by writing a tag object.
// Guarded by f->authenticate and f->hmac_running. An OBJECT_TAG object is
// appended, given the next tag sequence number and fed to the HMAC; the
// resulting digest is copied into the tag and hmac_running is cleared.
// NOTE(review): the debug message ends in a newline, unlike the other log
// calls in this file.
1847 int journal_file_append_tag(JournalFile *f) {
1854 if (!f->authenticate)
1857 if (!f->hmac_running)
1860 log_debug("Writing tag for epoch %llu\n", (unsigned long long) FSPRG_GetEpoch(fsprg_state(f)));
1864 r = journal_file_append_object(f, OBJECT_TAG, sizeof(struct TagObject), &o, &p);
1868 o->tag.seqnum = htole64(journal_file_tag_seqnum(f));
1870 /* Add the tag object itself, so that we can protect its
1871 * header. This will exclude the actual hash value in it */
1872 r = journal_file_hmac_put_object(f, OBJECT_TAG, p);
1876 /* Get the HMAC tag and store it in the object */
1877 memcpy(o->tag.tag, gcry_md_read(f->hmac, 0), TAG_LENGTH);
1878 f->hmac_running = false;
// Starts a new HMAC cycle if none is running.
// Guarded by f->authenticate and f->hmac_running. The HMAC handle is
// reset, a 256 bit key is derived from the FSPRG state and set on the
// handle, and hmac_running is set.
// NOTE(review): the key buffer stays on the stack after use; no wipe is
// visible in this extract.
1883 static int journal_file_hmac_start(JournalFile *f) {
1884 uint8_t key[256 / 8]; /* Let's pass 256 bit from FSPRG to HMAC */
1888 if (!f->authenticate)
1891 if (f->hmac_running)
1894 /* Prepare HMAC for next cycle */
1895 gcry_md_reset(f->hmac);
1896 FSPRG_GetKey(fsprg_state(f), key, sizeof(key), 0);
1897 gcry_md_setkey(f->hmac, key, sizeof(key));
1899 f->hmac_running = true;
// Converts a realtime timestamp into an FSPRG epoch number: the time
// elapsed since fsprg_start_usec divided by fsprg_interval_usec.
// A zero start or interval and a realtime before the start are treated
// as special cases.
// NOTE(review): the return codes of the special cases and the store into
// epoch are not visible in this extract.
1904 static int journal_file_get_epoch(JournalFile *f, uint64_t realtime, uint64_t *epoch) {
1909 assert(f->authenticate);
1911 if (le64toh(f->fsprg_header->fsprg_start_usec) == 0 ||
1912 le64toh(f->fsprg_header->fsprg_interval_usec) == 0)
1915 if (realtime < le64toh(f->fsprg_header->fsprg_start_usec))
1918 t = realtime - le64toh(f->fsprg_header->fsprg_start_usec);
1919 t = t / le64toh(f->fsprg_header->fsprg_interval_usec);
// Reports whether the FSPRG key has to be evolved for the given realtime:
// computes the goal epoch with journal_file_get_epoch and returns whether
// it differs from the current epoch of the FSPRG state. Guarded by
// f->authenticate.
1925 static int journal_file_need_evolve(JournalFile *f, uint64_t realtime) {
1926 uint64_t goal, epoch;
1930 if (!f->authenticate)
1933 r = journal_file_get_epoch(f, realtime, &goal);
1937 epoch = FSPRG_GetEpoch(fsprg_state(f));
1941 return epoch != goal;
// Evolves the FSPRG key towards the epoch that matches realtime. The goal
// epoch comes from journal_file_get_epoch, the current one from the FSPRG
// state; FSPRG_Evolve advances the state and the epoch is read again.
// Guarded by f->authenticate.
// NOTE(review): the loop around the evolve step and its termination test
// are not visible in this extract.
1944 static int journal_file_evolve(JournalFile *f, uint64_t realtime) {
1945 uint64_t goal, epoch;
1950 if (!f->authenticate)
1953 r = journal_file_get_epoch(f, realtime, &goal);
1957 epoch = FSPRG_GetEpoch(fsprg_state(f));
1959 log_debug("Evolving FSPRG key from epoch %llu to %llu.", (unsigned long long) epoch, (unsigned long long) goal);
1967 FSPRG_Evolve(fsprg_state(f));
1968 epoch = FSPRG_GetEpoch(fsprg_state(f));
// Called before an entry is appended: checks with journal_file_need_evolve
// whether the epoch for realtime differs from the current one and, in the
// visible sequence, appends a tag, evolves the key and starts a new HMAC
// cycle. Guarded by f->authenticate.
// NOTE(review): the checks on the intermediate results are not visible in
// this extract.
1972 static int journal_file_maybe_append_tag(JournalFile *f, uint64_t realtime) {
1977 if (!f->authenticate)
1980 r = journal_file_need_evolve(f, realtime);
1984 r = journal_file_append_tag(f);
1988 r = journal_file_evolve(f, realtime);
1992 r = journal_file_hmac_start(f);
// Feeds the immutable parts of the object at offset p into the running
// HMAC. The object header up to its payload member is always included.
// Per type, further ranges are added: hash and payload for data objects,
// everything from seqnum onwards for entry objects, nothing for hash
// tables and entry arrays, and only the seqnum for tag objects.
// Guarded by f->authenticate; starts an HMAC cycle when needed.
// NOTE(review): the case labels for the data, entry and tag branches are
// not visible in this extract and are inferred from the members used.
1999 static int journal_file_hmac_put_object(JournalFile *f, int type, uint64_t p) {
2005 if (!f->authenticate)
2008 r = journal_file_hmac_start(f);
2012 r = journal_file_move_to_object(f, type, p, &o);
2016 gcry_md_write(f->hmac, o, offsetof(ObjectHeader, payload));
2018 switch (o->object.type) {
2021 /* All but: hash and payload are mutable */
2022 gcry_md_write(f->hmac, &o->data.hash, sizeof(o->data.hash));
2023 gcry_md_write(f->hmac, o->data.payload, le64toh(o->object.size) - offsetof(DataObject, payload));
2028 gcry_md_write(f->hmac, &o->entry.seqnum, le64toh(o->object.size) - offsetof(EntryObject, seqnum));
2031 case OBJECT_FIELD_HASH_TABLE:
2032 case OBJECT_DATA_HASH_TABLE:
2033 case OBJECT_ENTRY_ARRAY:
2034 /* Nothing: everything is mutable */
2038 /* All but the tag itself */
2039 gcry_md_write(f->hmac, &o->tag.seqnum, sizeof(o->tag.seqnum));
// Feeds the immutable parts of the file header into the running HMAC.
// Five contiguous member ranges are written, each ending right before a
// member that the inner comment lists as mutable. Guarded by
// f->authenticate; starts an HMAC cycle when needed.
2048 static int journal_file_hmac_put_header(JournalFile *f) {
2053 if (!f->authenticate)
2056 r = journal_file_hmac_start(f);
2060 /* All but state+reserved, boot_id, arena_size,
2061 * tail_object_offset, n_objects, n_entries, tail_seqnum,
2062 * head_entry_realtime, tail_entry_realtime,
2063 * tail_entry_monotonic, n_data, n_fields, header_tag */
// Each length is the distance between two member offsets of Header.
2065 gcry_md_write(f->hmac, f->header->signature, offsetof(Header, state) - offsetof(Header, signature));
2066 gcry_md_write(f->hmac, &f->header->file_id, offsetof(Header, boot_id) - offsetof(Header, file_id));
2067 gcry_md_write(f->hmac, &f->header->seqnum_id, offsetof(Header, arena_size) - offsetof(Header, seqnum_id));
2068 gcry_md_write(f->hmac, &f->header->data_hash_table_offset, offsetof(Header, tail_object_offset) - offsetof(Header, data_hash_table_offset));
2069 gcry_md_write(f->hmac, &f->header->head_seqnum, offsetof(Header, head_entry_realtime) - offsetof(Header, head_seqnum));
/* Loads the FSPRG key state of this machine and maps it into
 * f->fsprg_header.
 *
 * The state file is /var/log/journal/<machine-id>/fsprg. Its header is
 * first mapped read-only and validated (signature, incompatible flags,
 * header size, state size, total file size, machine ID, start and
 * interval values). Only then is the whole file (header plus state,
 * f->fsprg_size bytes) mapped read-write.
 *
 * NOTE(review): the error codes assigned on most failed checks and the
 * goto/return statements are not visible in this excerpt; only
 * -EPROTONOSUPPORT for unknown incompatible flags can be seen. */
2074 static int journal_file_load_fsprg(JournalFile *f) {
2078 FSPRGHeader *m = NULL;
2083 if (!f->authenticate)
2086 r = sd_id128_get_machine(&machine);
2090 if (asprintf(&p, "/var/log/journal/" SD_ID128_FORMAT_STR "/fsprg",
2091 SD_ID128_FORMAT_VAL(machine)) < 0)
2094 fd = open(p, O_RDWR|O_CLOEXEC|O_NOCTTY, 0600);
2096 log_error("Failed to open %s: %m", p);
2101 if (fstat(fd, &st) < 0) {
/* The file must at least hold a complete header */
2106 if (st.st_size < (off_t) sizeof(FSPRGHeader)) {
/* Temporary read-only view of the header, used for validation only */
2111 m = mmap(NULL, PAGE_ALIGN(sizeof(FSPRGHeader)), PROT_READ, MAP_SHARED, fd, 0);
2112 if (m == MAP_FAILED) {
2118 if (memcmp(m->signature, FSPRG_HEADER_SIGNATURE, 8) != 0) {
/* Any incompatible flag is one we do not understand */
2123 if (m->incompatible_flags != 0) {
2124 r = -EPROTONOSUPPORT;
2128 if (le64toh(m->header_size) < sizeof(FSPRGHeader)) {
/* The stored state size has to match what the security parameter implies */
2133 if (le64toh(m->state_size) != FSPRG_stateinbytes(m->secpar)) {
2138 f->fsprg_size = le64toh(m->header_size) + le64toh(m->state_size);
/* Header and state must fit into the file */
2139 if ((uint64_t) st.st_size < f->fsprg_size) {
/* The key material must belong to this machine */
2144 if (!sd_id128_equal(machine, m->machine_id)) {
/* le64toh() yields an unsigned value, so "<= 0" is effectively "== 0" */
2149 if (le64toh(m->fsprg_start_usec) <= 0 ||
2150 le64toh(m->fsprg_interval_usec) <= 0) {
/* The real, writable mapping that the JournalFile keeps */
2155 f->fsprg_header = mmap(NULL, PAGE_ALIGN(f->fsprg_size), PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
2156 if (f->fsprg_header == MAP_FAILED) {
/* Do not leave MAP_FAILED behind in the structure */
2157 f->fsprg_header = NULL;
/* Drop the temporary validation mapping and the file descriptor */
2166 munmap(m, PAGE_ALIGN(sizeof(FSPRGHeader)));
2169 close_nointr_nofail(fd);
/* Opens the libgcrypt message digest handle f->hmac as SHA256 in HMAC
 * mode.
 *
 * NOTE(review): the statement guarded by the authenticate check and the
 * evaluation of the gcry error `e` are not visible in this excerpt. */
2175 static int journal_file_setup_hmac(JournalFile *f) {
2178 if (!f->authenticate)
2181 e = gcry_md_open(&f->hmac, GCRY_MD_SHA256, GCRY_MD_FLAG_HMAC);
/* Writes the first tag of a file. The tag covers the header, the field
 * hash table object and the data hash table object, in that order.
 *
 * The header stores the offsets of the hash table *items*; the enclosing
 * object starts offsetof(Object, hash_table.items) bytes earlier, hence
 * the subtraction, which is guarded against underflow.
 *
 * NOTE(review): what happens when the underflow guard triggers, and the
 * checks on r, are not visible in this excerpt. */
2188 static int journal_file_append_first_tag(JournalFile *f) {
2192 if (!f->authenticate)
2195 log_debug("Calculating first tag...");
2197 r = journal_file_hmac_put_header(f);
/* Step back from the items array to the start of the field hash table object */
2201 p = le64toh(f->header->field_hash_table_offset);
2202 if (p < offsetof(Object, hash_table.items))
2204 p -= offsetof(Object, hash_table.items);
2206 r = journal_file_hmac_put_object(f, OBJECT_FIELD_HASH_TABLE, p);
/* Same for the data hash table object */
2210 p = le64toh(f->header->data_hash_table_offset);
2211 if (p < offsetof(Object, hash_table.items))
2213 p -= offsetof(Object, hash_table.items);
2215 r = journal_file_hmac_put_object(f, OBJECT_DATA_HASH_TABLE, p);
2219 r = journal_file_append_tag(f);
/* Debugging aid: prints the header and then one line per object to
 * stdout.
 *
 * Starts at the offset right after the header (header_size) and advances
 * by the 64-bit aligned size of each object. The visible stop condition
 * is reaching the header's tail_object_offset.
 *
 * NOTE(review): the loop construct, several `case` labels and the error
 * path leading to the "File corrupt" message are not visible in this
 * excerpt. */
2226 void journal_file_dump(JournalFile *f) {
2233 journal_file_print_header(f);
/* The first object follows the header directly */
2235 p = le64toh(f->header->header_size);
/* Type -1 is passed here so that objects of any type can be looked at */
2237 r = journal_file_move_to_object(f, -1, p, &o);
2241 switch (o->object.type) {
2244 printf("Type: OBJECT_UNUSED\n");
2248 printf("Type: OBJECT_DATA\n");
/* Entries are printed with seqnum, monotonic and realtime timestamps */
2252 printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
2253 (unsigned long long) le64toh(o->entry.seqnum),
2254 (unsigned long long) le64toh(o->entry.monotonic),
2255 (unsigned long long) le64toh(o->entry.realtime));
2258 case OBJECT_FIELD_HASH_TABLE:
2259 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2262 case OBJECT_DATA_HASH_TABLE:
2263 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2266 case OBJECT_ENTRY_ARRAY:
2267 printf("Type: OBJECT_ENTRY_ARRAY\n");
2271 printf("Type: OBJECT_TAG %llu\n",
2272 (unsigned long long) le64toh(o->tag.seqnum));
2276 if (o->object.flags & OBJECT_COMPRESSED)
2277 printf("Flags: COMPRESSED\n");
/* The tail object is the last one in the file */
2279 if (p == le64toh(f->header->tail_object_offset))
/* Objects are laid out back to back on 64-bit boundaries */
2282 p = p + ALIGN64(le64toh(o->object.size));
2287 log_error("File corrupt");
2290 void journal_file_print_header(JournalFile *f) {
2291 char a[33], b[33], c[33];
2292 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
2296 printf("File Path: %s\n"
2300 "Sequential Number ID: %s\n"
2302 "Compatible Flags:%s%s\n"
2303 "Incompatible Flags:%s%s\n"
2304 "Header size: %llu\n"
2305 "Arena size: %llu\n"
2306 "Data Hash Table Size: %llu\n"
2307 "Field Hash Table Size: %llu\n"
2309 "Entry Objects: %llu\n"
2310 "Rotate Suggested: %s\n"
2311 "Head Sequential Number: %llu\n"
2312 "Tail Sequential Number: %llu\n"
2313 "Head Realtime Timestamp: %s\n"
2314 "Tail Realtime Timestamp: %s\n",
2316 sd_id128_to_string(f->header->file_id, a),
2317 sd_id128_to_string(f->header->machine_id, b),
2318 sd_id128_to_string(f->header->boot_id, c),
2319 sd_id128_to_string(f->header->seqnum_id, c),
2320 f->header->state == STATE_OFFLINE ? "offline" :
2321 f->header->state == STATE_ONLINE ? "online" :
2322 f->header->state == STATE_ARCHIVED ? "archived" : "unknown",
2323 (f->header->compatible_flags & HEADER_COMPATIBLE_AUTHENTICATED) ? " AUTHENTICATED" : "",
2324 (f->header->compatible_flags & ~HEADER_COMPATIBLE_AUTHENTICATED) ? " ???" : "",
2325 (f->header->incompatible_flags & HEADER_INCOMPATIBLE_COMPRESSED) ? " COMPRESSED" : "",
2326 (f->header->incompatible_flags & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2327 (unsigned long long) le64toh(f->header->header_size),
2328 (unsigned long long) le64toh(f->header->arena_size),
2329 (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2330 (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2331 (unsigned long long) le64toh(f->header->n_objects),
2332 (unsigned long long) le64toh(f->header->n_entries),
2333 yes_no(journal_file_rotate_suggested(f)),
2334 (unsigned long long) le64toh(f->header->head_seqnum),
2335 (unsigned long long) le64toh(f->header->tail_seqnum),
2336 format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2337 format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)));
2339 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2340 printf("Data Objects: %llu\n"
2341 "Data Hash Table Fill: %.1f%%\n",
2342 (unsigned long long) le64toh(f->header->n_data),
2343 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2345 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2346 printf("Field Objects: %llu\n"
2347 "Field Hash Table Fill: %.1f%%\n",
2348 (unsigned long long) le64toh(f->header->n_fields),
2349 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
/* Opens (and, for an empty writable file, initializes) a journal file and
 * returns the new JournalFile object in *ret.
 *
 * Visible sequence:
 *   1. Check the access mode (O_RDONLY or O_RDWR) and that the name ends
 *      in ".journal".
 *   2. Allocate the JournalFile, take over the settings, reference the
 *      passed mmap cache or create a new one, open and stat the file.
 *   3. An empty, writable file counts as newly created: the FSPRG state is
 *      loaded (authentication gets turned off if that does not work), the
 *      header is written, and the file is stat'ed again.
 *   4. Map the header; existing files get their header verified, and, if
 *      writable, their FSPRG state loaded.
 *   5. Settle the metrics, refresh the header, set up the HMAC.
 *   6. New files get their hash tables and first tag; afterwards both hash
 *      tables are mapped.
 *
 * NOTE(review): several parameter lines (presumably fname, flags, mode,
 * compress, authenticate -- these names are used in the body), the error
 * codes and the conditions around some calls are not visible in this
 * excerpt; journal_file_close(f) is presumably the failure path. */
2352 int journal_file_open(
2358 JournalMetrics *metrics,
2359 MMapCache *mmap_cache,
2360 JournalFile *template,
2361 JournalFile **ret) {
2365 bool newly_created = false;
/* Only plain read-only and read-write access pass this check */
2369 if ((flags & O_ACCMODE) != O_RDONLY &&
2370 (flags & O_ACCMODE) != O_RDWR)
2373 if (!endswith(fname, ".journal"))
2376 f = new0(JournalFile, 1);
2384 f->prot = prot_from_flags(flags);
2385 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2386 f->compress = compress;
2387 f->authenticate = authenticate;
/* Share the cache handed in by the caller ... */
2390 f->mmap = mmap_cache_ref(mmap_cache);
/* ... or set up one of our own: */
2392 /* One context for each type, plus the zeroth catchall
2393 * context. One fd for the file plus one for each type
2394 * (which we need during verification) */
2395 f->mmap = mmap_cache_new(_OBJECT_TYPE_MAX, 1 + _OBJECT_TYPE_MAX);
2402 f->path = strdup(fname);
2408 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2414 if (fstat(f->fd, &f->last_stat) < 0) {
/* A writable file of size zero is one we have to initialize */
2419 if (f->last_stat.st_size == 0 && f->writable) {
2420 newly_created = true;
2422 /* Try to load the FSPRG state, and if we can't, then
2423 * just don't do authentication */
2424 r = journal_file_load_fsprg(f);
2426 f->authenticate = false;
2428 r = journal_file_init_header(f, template);
/* Writing the header changed the file size, hence stat again */
2432 if (fstat(f->fd, &f->last_stat) < 0) {
/* Too short to even hold the oldest header layout */
2438 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2443 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2444 if (f->header == MAP_FAILED) {
2450 if (!newly_created) {
2451 r = journal_file_verify_header(f);
/* New files had their FSPRG state loaded further up already */
2456 if (!newly_created && f->writable) {
2457 r = journal_file_load_fsprg(f);
/* Explicit metrics get their unset values filled in; without them the
 * template's metrics are taken over */
2464 journal_default_metrics(metrics, f->fd);
2465 f->metrics = *metrics;
2466 } else if (template)
2467 f->metrics = template->metrics;
2469 r = journal_file_refresh_header(f);
2473 r = journal_file_setup_hmac(f);
2478 if (newly_created) {
2479 r = journal_file_setup_field_hash_table(f);
2483 r = journal_file_setup_data_hash_table(f);
2487 r = journal_file_append_first_tag(f);
2492 r = journal_file_map_field_hash_table(f);
2496 r = journal_file_map_data_hash_table(f);
2506 journal_file_close(f);
/* Rotates a writable journal file: the current file is renamed to an
 * archive name, marked as archived and closed, and a fresh file is opened
 * under the original path, with the old file serving as template.
 *
 * The archive name is built from the old path without its ".journal"
 * suffix (8 characters), followed by the seqnum ID (32 hex characters),
 * then "-<tail seqnum>-<tail entry realtime>.journal", both numbers as 16
 * hex digits. The buffer is sized for exactly these parts.
 *
 * NOTE(review): the separator stored at p[l-8] (the seqnum ID is written
 * one byte further on), the error handling, and the assignment of the new
 * file to *f are not visible in this excerpt. */
2511 int journal_file_rotate(JournalFile **f, bool compress, bool authenticate) {
2514 JournalFile *old_file, *new_file = NULL;
2522 if (!old_file->writable)
2525 if (!endswith(old_file->path, ".journal"))
2528 l = strlen(old_file->path);
/* separator + 32 (ID) + '-' + 16 + '-' + 16 + NUL; the 8 bytes needed for
 * the new ".journal" suffix are those of the old one, which is not copied */
2530 p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
/* Everything but the ".journal" suffix */
2534 memcpy(p, old_file->path, l - 8);
2536 sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2537 snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2538 "-%016llx-%016llx.journal",
2539 (unsigned long long) le64toh((*f)->header->tail_seqnum),
2540 (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
2542 r = rename(old_file->path, p);
/* The renamed file will not be written to anymore */
2548 old_file->header->state = STATE_ARCHIVED;
/* The new file is opened with the old one's flags, mode and mmap cache,
 * before the old one is closed */
2550 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, authenticate, NULL, old_file->mmap, old_file, &new_file);
2551 journal_file_close(old_file);
/* Like journal_file_open(), but gets an unusable file out of the way and
 * tries once more.
 *
 * Only the error codes listed below lead to a retry, and only for
 * requests that are not read-only, carry O_CREAT and name a ".journal"
 * file. In that case the file is renamed to
 * "<name>@<16 hex digits>-<16 hex digits>.journal~", a warning is logged
 * and journal_file_open() is called a second time with the same
 * arguments.
 *
 * NOTE(review): some parameter lines, the return statements of the early
 * exits and two of the asprintf() arguments are not visible in this
 * excerpt. */
2557 int journal_file_open_reliably(
2563 JournalMetrics *metrics,
2565 JournalFile *template,
2566 JournalFile **ret) {
2572 r = journal_file_open(fname, flags, mode, compress, authenticate, metrics, mmap, template, ret);
/* Any result other than these is not handled by the retry logic */
2573 if (r != -EBADMSG && /* corrupted */
2574 r != -ENODATA && /* truncated */
2575 r != -EHOSTDOWN && /* other machine */
2576 r != -EPROTONOSUPPORT && /* incompatible feature */
2577 r != -EBUSY && /* unclean shutdown */
2578 r != -ESHUTDOWN /* already archived */)
/* Renaming only makes sense if we may write and may create a new file */
2581 if ((flags & O_ACCMODE) == O_RDONLY)
2584 if (!(flags & O_CREAT))
2587 if (!endswith(fname, ".journal"))
2590 /* The file is corrupted. Rotate it away and try it again (but only once) */
2593 if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2595 (unsigned long long) now(CLOCK_REALTIME),
2599 r = rename(fname, p);
2604 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2606 return journal_file_open(fname, flags, mode, compress, authenticate, metrics, mmap, template, ret);
/* One candidate file collected by journal_directory_vacuum().
 *
 * NOTE(review): only the seqnum_id member is visible in this excerpt;
 * the code using this structure also accesses filename, usage, seqnum,
 * realtime and have_seqnum. */
2609 struct vacuum_info {
/* Seqnum ID parsed from the file name of archived files */
2614 sd_id128_t seqnum_id;
/* qsort() comparison function for struct vacuum_info.
 *
 * If both files carry a sequence number from the same seqnum ID, they
 * are compared by sequence number. Otherwise the realtime timestamp
 * decides; for equal timestamps the seqnum IDs are compared bytewise if
 * both files have one, and the file names are compared as a last resort.
 *
 * NOTE(review): the values returned by the seqnum and realtime
 * comparisons are not visible in this excerpt. */
2620 static int vacuum_compare(const void *_a, const void *_b) {
2621 const struct vacuum_info *a, *b;
/* Sequence numbers are only comparable within the same seqnum ID */
2626 if (a->have_seqnum && b->have_seqnum &&
2627 sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
2628 if (a->seqnum < b->seqnum)
2630 else if (a->seqnum > b->seqnum)
2636 if (a->realtime < b->realtime)
2638 else if (a->realtime > b->realtime)
/* Same timestamp: fall back to criteria that merely give a stable order */
2640 else if (a->have_seqnum && b->have_seqnum)
2641 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
2643 return strcmp(a->filename, b->filename);
/* Deletes old journal files from `directory` until the files considered
 * use at most `max_use` bytes and the file system has at least `min_free`
 * bytes available.
 *
 * Two kinds of regular files are considered:
 *   - archived files: "...@<seqnum id>-<seqnum>-<realtime>.journal"
 *   - corrupted files: "...@<16 hex>-<16 hex>.journal~"
 * The IDs and numbers are parsed from fixed positions counted from the
 * end of the name. The candidates are sorted with vacuum_compare() and
 * then unlinked one by one in that order; before each deletion both
 * limits are checked again.
 *
 * NOTE(review): the `continue`/`break`/`goto` statements, the error codes
 * and the assignment of have_seqnum for archived files are not visible in
 * this excerpt. */
2646 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
2649 struct vacuum_info *list = NULL;
2650 unsigned n_list = 0, n_allocated = 0, i;
2658 d = opendir(directory);
2664 struct dirent buf, *de;
2668 unsigned long long seqnum = 0, realtime;
2669 sd_id128_t seqnum_id;
2672 k = readdir_r(d, &buf, &de);
/* Symlinks are not followed; only regular files are of interest */
2681 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
2684 if (!S_ISREG(st.st_mode))
2687 q = strlen(de->d_name);
2689 if (endswith(de->d_name, ".journal")) {
2691 /* Vacuum archived files */
/* '@' + 32 (ID) + '-' + 16 + '-' + 16 + ".journal" */
2693 if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
/* The separators must sit at their fixed positions from the end */
2696 if (de->d_name[q-8-16-1] != '-' ||
2697 de->d_name[q-8-16-1-16-1] != '-' ||
2698 de->d_name[q-8-16-1-16-1-32-1] != '@')
/* Keep an unmodified copy of the name, the original is cut up below */
2701 p = strdup(de->d_name);
/* Terminate the string after the ID so that it can be parsed */
2707 de->d_name[q-8-16-1-16-1] = 0;
2708 if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
2713 if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
2720 } else if (endswith(de->d_name, ".journal~")) {
2721 unsigned long long tmp;
2723 /* Vacuum corrupted files */
/* '@' + 16 + '-' + 16 + ".journal" + '~' */
2725 if (q < 1 + 16 + 1 + 16 + 8 + 1)
2728 if (de->d_name[q-1-8-16-1] != '-' ||
2729 de->d_name[q-1-8-16-1-16-1] != '@')
2732 p = strdup(de->d_name);
/* The first number is used as timestamp, the second one is ignored */
2738 if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
2743 have_seqnum = false;
/* Grow the list by doubling, starting with 8 entries */
2747 if (n_list >= n_allocated) {
2748 struct vacuum_info *j;
2750 n_allocated = MAX(n_allocated * 2U, 8U);
2751 j = realloc(list, n_allocated * sizeof(struct vacuum_info));
2761 list[n_list].filename = p;
/* st_blocks is counted in units of 512 bytes */
2762 list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
2763 list[n_list].seqnum = seqnum;
2764 list[n_list].realtime = realtime;
2765 list[n_list].seqnum_id = seqnum_id;
2766 list[n_list].have_seqnum = have_seqnum;
2768 sum += list[n_list].usage;
2774 qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
2776 for(i = 0; i < n_list; i++) {
/* Free space is queried anew in every iteration */
2779 if (fstatvfs(dirfd(d), &ss) < 0) {
/* Both limits are satisfied */
2784 if (sum <= max_use &&
2785 (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
2788 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
2789 log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
2790 sum -= list[i].usage;
/* A file that is gone already is not worth a warning */
2791 } else if (errno != ENOENT)
2792 log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
2796 for (i = 0; i < n_list; i++)
2797 free(list[i].filename);
/* Copies the entry object `o`, located at offset `p` of `from`, into
 * `to`.
 *
 * For every item of the entry the referenced data object is looked up in
 * `from`, its payload is (if compressed) uncompressed and appended to
 * `to` with journal_file_append_data(). The offsets and hashes of the new
 * data objects are collected in a stack-allocated item array, and their
 * hashes are XORed together. Finally the entry itself is appended with
 * the timestamps of the original.
 *
 * `o` is reused for the data objects inside the loop, which is why the
 * entry is looked up again through `p` at the end of each iteration.
 *
 * NOTE(review): the return statements of the consistency checks, the
 * assignment of `t` and the preprocessor conditions around the
 * decompression code are not visible in this excerpt. */
2807 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2809 uint64_t q, xor_hash = 0;
2822 ts.monotonic = le64toh(o->entry.monotonic);
2823 ts.realtime = le64toh(o->entry.realtime);
/* Compares against the monotonic timestamp of the destination's tail entry */
2825 if (to->tail_entry_monotonic_valid &&
2826 ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2829 n = journal_file_entry_n_items(o);
2830 items = alloca(sizeof(EntryItem) * n);
2832 for (i = 0; i < n; i++) {
2839 q = le64toh(o->entry.items[i].object_offset);
2840 le_hash = o->entry.items[i].hash;
/* From here on `o` points to the data object, not to the entry */
2842 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
/* The hash stored in the entry item has to match the one of the data object */
2846 if (le_hash != o->data.hash)
2849 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2852 /* We hit the limit on 32bit machines */
2853 if ((uint64_t) t != l)
2856 if (o->object.flags & OBJECT_COMPRESSED) {
2860 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2863 data = from->compress_buffer;
2866 return -EPROTONOSUPPORT;
2869 data = o->data.payload;
2871 r = journal_file_append_data(to, data, l, &u, &h);
2875 xor_hash ^= le64toh(u->data.hash);
2876 items[i].object_offset = htole64(h);
2877 items[i].hash = u->data.hash;
/* Get the entry object back for the next iteration */
2879 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2884 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2887 void journal_default_metrics(JournalMetrics *m, int fd) {
2888 uint64_t fs_size = 0;
2890 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2895 if (fstatvfs(fd, &ss) >= 0)
2896 fs_size = ss.f_frsize * ss.f_blocks;
2898 if (m->max_use == (uint64_t) -1) {
2901 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2903 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2904 m->max_use = DEFAULT_MAX_USE_UPPER;
2906 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2907 m->max_use = DEFAULT_MAX_USE_LOWER;
2909 m->max_use = DEFAULT_MAX_USE_LOWER;
2911 m->max_use = PAGE_ALIGN(m->max_use);
2913 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2914 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2917 if (m->max_size == (uint64_t) -1) {
2918 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2920 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2921 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2923 m->max_size = PAGE_ALIGN(m->max_size);
2925 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2926 m->max_size = JOURNAL_FILE_SIZE_MIN;
2928 if (m->max_size*2 > m->max_use)
2929 m->max_use = m->max_size*2;
2931 if (m->min_size == (uint64_t) -1)
2932 m->min_size = JOURNAL_FILE_SIZE_MIN;
2934 m->min_size = PAGE_ALIGN(m->min_size);
2936 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2937 m->min_size = JOURNAL_FILE_SIZE_MIN;
2939 if (m->min_size > m->max_size)
2940 m->max_size = m->min_size;
2943 if (m->keep_free == (uint64_t) -1) {
2946 m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2948 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2949 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2952 m->keep_free = DEFAULT_KEEP_FREE;
2955 log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2956 format_bytes(a, sizeof(a), m->max_use),
2957 format_bytes(b, sizeof(b), m->max_size),
2958 format_bytes(c, sizeof(c), m->min_size),
2959 format_bytes(d, sizeof(d), m->keep_free));
/* Reports the realtime timestamps of the first and the last entry of the
 * file, taken from the header fields head_entry_realtime and
 * tail_entry_realtime, in *from and *to.
 *
 * A header value of 0 is treated specially for each of the two.
 *
 * NOTE(review): what is returned for a 0 timestamp, and the conditions
 * under which `from` and `to` are filled in, are not visible in this
 * excerpt. */
2962 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
/* 0 reads the same in any byte order, hence no le64toh() for the test */
2967 if (f->header->head_entry_realtime == 0)
2970 *from = le64toh(f->header->head_entry_realtime);
2974 if (f->header->tail_entry_realtime == 0)
2977 *to = le64toh(f->header->tail_entry_realtime);
/* Reports the monotonic timestamps of the first and the last entry of
 * the file that belong to the boot `boot_id`, in *from and *to.
 *
 * The entries of a boot are found through the data object of the field
 * "_BOOT_ID=<id>": its entry_offset leads to the first entry, and the
 * element at index n_entries-1 of its entry list leads to the last one.
 *
 * NOTE(review): the return values (including the one for a data object
 * without entries), the conditions under which `from` and `to` are
 * filled in, and the last arguments of generic_array_get_plus_one() are
 * not visible in this excerpt. */
2983 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
/* 9 bytes of prefix + 32 hex characters + NUL */
2984 char t[9+32+1] = "_BOOT_ID=";
/* The ID is written right after the prefix */
2992 sd_id128_to_string(boot_id, t + 9);
2994 r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
/* le64toh() yields an unsigned value, so "<= 0" is effectively "== 0" */
2998 if (le64toh(o->data.n_entries) <= 0)
/* First entry that references this boot ID */
3002 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3006 *from = le64toh(o->entry.monotonic);
/* `o` pointed to an entry in between, get the data object back */
3010 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* Last entry that references this boot ID */
3014 r = generic_array_get_plus_one(f,
3015 le64toh(o->data.entry_offset),
3016 le64toh(o->data.entry_array_offset),
3017 le64toh(o->data.n_entries)-1,
3022 *to = le64toh(o->entry.monotonic);
3028 bool journal_file_rotate_suggested(JournalFile *f) {
3031 /* If we gained new header fields we gained new features,
3032 * hence suggest a rotation */
3033 if (le64toh(f->header->header_size) < sizeof(Header)) {
3034 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3038 /* Let's check if the hash tables grew over a certain fill
3039 * level (75%, borrowing this value from Java's hash table
3040 * implementation), and if so suggest a rotation. To calculate
3041 * the fill level we need the n_data field, which only exists
3042 * in newer versions. */
3044 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3045 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3046 log_debug("Data hash table of %s has a fill level at %.1f (%llu of %llu items, %llu file size, %llu bytes per hash table item), suggesting rotation.",
3048 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3049 (unsigned long long) le64toh(f->header->n_data),
3050 (unsigned long long) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)),
3051 (unsigned long long) (f->last_stat.st_size),
3052 (unsigned long long) (f->last_stat.st_size / le64toh(f->header->n_data)));
3056 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3057 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3058 log_debug("Field hash table of %s has a fill level at %.1f (%llu of %llu items), suggesting rotation.",
3060 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3061 (unsigned long long) le64toh(f->header->n_fields),
3062 (unsigned long long) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)));