1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
30 #include "journal-def.h"
31 #include "journal-file.h"
/* Default byte sizes of the data and field hash tables: a fixed number
 * of items multiplied by sizeof(HashItem) */
36 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
37 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
/* journal_file_move_to() widens smaller mapping requests to this size */
39 #define DEFAULT_WINDOW_SIZE (8ULL*1024ULL*1024ULL) /* 8 MiB */
/* Data payloads of at least this many bytes are candidates for compression */
41 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
43 /* This is the minimum journal file size */
44 #define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL) /* 64 KiB */
46 /* These are the lower and upper bounds if we deduce the max_use value
47 * from the file system size */
48 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
49 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
51 /* This is the upper bound if we deduce max_size from max_use */
52 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
54 /* This is the upper bound if we deduce the keep_free value from the
56 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
58 /* This is the keep_free value when we can't determine the system
60 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MiB */
62 /* n_data was the first entry we added after the initial file format design */
63 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
/* Rounds x up to the next multiple of 8 */
65 #define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
/* True if the header size declared in the header itself is large enough
 * to cover the given field completely */
67 #define JOURNAL_HEADER_CONTAINS(h, field) \
68 (le64toh((h)->header_size) >= offsetof(Header, field) + sizeof((h)->field))
/* Forward declarations of helpers that are defined further down in this file */
70 static int journal_file_maybe_append_tag(JournalFile *f, uint64_t realtime);
71 static int journal_file_hmac_put_object(JournalFile *f, int type, uint64_t p);
/* Closes a journal file: appends a final tag, unmaps the per-type
 * windows, turns STATE_ONLINE into STATE_OFFLINE for writable files,
 * unmaps the header, closes the file descriptor and releases the
 * compression buffer, the FSPRG header mapping and the HMAC context.
 * NOTE(review): several guarding conditions and statements are not
 * visible in this excerpt. */
73 void journal_file_close(JournalFile *f) {
78 /* Write the final tag */
80 journal_file_append_tag(f);
82 /* Sync everything to disk, before we mark the file offline */
83 for (t = 0; t < _WINDOW_MAX; t++)
84 if (f->windows[t].ptr)
85 munmap(f->windows[t].ptr, f->windows[t].size);
87 if (f->writable && f->fd >= 0)
91 /* Mark the file offline. Don't override the archived state if it already is set */
92 if (f->writable && f->header->state == STATE_ONLINE)
93 f->header->state = STATE_OFFLINE;
95 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
99 close_nointr_nofail(f->fd);
104 free(f->compress_buffer);
109 munmap(f->fsprg_header, PAGE_ALIGN(f->fsprg_size));
112 gcry_md_close(f->hmac);
/* Builds a fresh Header in the local variable h and writes it to offset
 * 0 of f->fd with pwrite(). The header receives the signature, its own
 * 64-bit aligned size, the COMPRESSED and AUTHENTICATED flags derived
 * from f->compress and f->authenticate, and a random file_id. On one
 * path seqnum_id and tail_seqnum are copied from the header of template,
 * on the other seqnum_id is set to the new file_id.
 * NOTE(review): the condition selecting between these paths is not
 * visible here; presumably it is whether a template was passed. */
118 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
126 memcpy(h.signature, HEADER_SIGNATURE, 8);
127 h.header_size = htole64(ALIGN64(sizeof(h)));
129 h.incompatible_flags =
130 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
133 htole32(f->authenticate ? HEADER_COMPATIBLE_AUTHENTICATED : 0);
135 r = sd_id128_randomize(&h.file_id);
140 h.seqnum_id = template->header->seqnum_id;
141 h.tail_seqnum = template->header->tail_seqnum;
143 h.seqnum_id = h.file_id;
145 k = pwrite(f->fd, &h, sizeof(h), 0);
/* Stamps the mapped header with the current machine ID and boot ID,
 * marks the file STATE_ONLINE and msync()s the header so that the online
 * state reaches the disk. If the boot ID stored in the header already
 * equals the current one, f->tail_entry_monotonic_valid is set. */
155 static int journal_file_refresh_header(JournalFile *f) {
161 r = sd_id128_get_machine(&f->header->machine_id);
165 r = sd_id128_get_boot(&boot_id);
169 if (sd_id128_equal(boot_id, f->header->boot_id))
170 f->tail_entry_monotonic_valid = true;
172 f->header->boot_id = boot_id;
174 f->header->state = STATE_ONLINE;
176 /* Sync the online state to disk */
177 msync(f->header, PAGE_ALIGN(sizeof(Header)), MS_SYNC);
/* Validates the mapped header before the file is used. The visible
 * checks cover the signature, unknown incompatible flags, unknown
 * compatible flags, a header_size of at least HEADER_SIZE_MIN, a file
 * size of at least header_size + arena_size, the machine ID and the
 * state field. Finally f->compress and f->authenticate are derived from
 * the header flags.
 * NOTE(review): the return values of most failed checks, and the
 * conditions that select between the two variants of each flag check,
 * are not visible in this excerpt. */
183 static int journal_file_verify_header(JournalFile *f) {
186 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
189 /* In both read and write mode we refuse to open files with
190 * incompatible flags we don't know */
192 if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
193 return -EPROTONOSUPPORT;
195 if (f->header->incompatible_flags != 0)
196 return -EPROTONOSUPPORT;
199 /* When open for writing we refuse to open files with
200 * compatible flags, too */
203 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_AUTHENTICATED) != 0)
204 return -EPROTONOSUPPORT;
206 if (f->header->compatible_flags != 0)
207 return -EPROTONOSUPPORT;
211 /* The first addition was n_data, so check that we are at least this large */
212 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
215 if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
220 sd_id128_t machine_id;
223 r = sd_id128_get_machine(&machine_id);
227 if (!sd_id128_equal(machine_id, f->header->machine_id))
230 state = f->header->state;
232 if (state == STATE_ONLINE) {
233 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
235 } else if (state == STATE_ARCHIVED)
237 else if (state != STATE_OFFLINE) {
238 log_debug("Journal file %s has unknown state %u.", f->path, state);
243 f->compress = !!(le32toh(f->header->incompatible_flags) & HEADER_INCOMPATIBLE_COMPRESSED);
244 f->authenticate = !!(le32toh(f->header->compatible_flags) & HEADER_COMPATIBLE_AUTHENTICATED);
/* Makes sure the file is large enough to hold size bytes at offset.
 * The target size is PAGE_ALIGN(offset + size), but never less than the
 * header size, and is compared against old_size (visible operands:
 * header_size + arena_size). Growth is checked against
 * f->metrics.max_size and, above f->metrics.min_size, against the free
 * space reported by fstatvfs() reduced by f->metrics.keep_free. The
 * additional range is allocated with posix_fallocate(), the cached
 * fstat() data is refreshed and arena_size in the header is updated.
 * NOTE(review): the outcome of each failed check is not visible in this
 * excerpt. */
249 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
250 uint64_t old_size, new_size;
255 /* We assume that this file is not sparse, and we know that
256 * for sure, since we always call posix_fallocate()
260 le64toh(f->header->header_size) +
261 le64toh(f->header->arena_size);
263 new_size = PAGE_ALIGN(offset + size);
264 if (new_size < le64toh(f->header->header_size))
265 new_size = le64toh(f->header->header_size);
267 if (new_size <= old_size)
270 if (f->metrics.max_size > 0 &&
271 new_size > f->metrics.max_size)
274 if (new_size > f->metrics.min_size &&
275 f->metrics.keep_free > 0) {
278 if (fstatvfs(f->fd, &svfs) >= 0) {
281 available = svfs.f_bfree * svfs.f_bsize;
283 if (available >= f->metrics.keep_free)
284 available -= f->metrics.keep_free;
288 if (new_size - old_size > available)
293 /* Note that the glibc fallocate() fallback is very
294 inefficient, hence we try to minimize the allocation area
296 r = posix_fallocate(f->fd, old_size, new_size - old_size);
300 if (fstat(f->fd, &f->last_stat) < 0)
303 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
/* Maps a window of the file with mmap(). The requested offset is rounded
 * down to a page boundary, and the length is grown by the same amount
 * and then page-aligned. Returns -EADDRNOTAVAIL if the window would
 * reach beyond the page-aligned file size known from the last fstat().
 * *ret points at the originally requested offset inside the mapping. */
308 static int journal_file_map(
317 uint64_t woffset, wsize;
324 woffset = offset & ~((uint64_t) page_size() - 1ULL);
325 wsize = size + (offset - woffset);
326 wsize = PAGE_ALIGN(wsize);
328 /* Avoid SIGBUS on invalid accesses */
329 if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size))
330 return -EADDRNOTAVAIL;
332 window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
333 if (window == MAP_FAILED)
345 *ret = (uint8_t*) window + (offset - woffset);
/* Returns in *ret a pointer to size bytes at file offset offset, going
 * through the cached mmap window of type wt. Requests beyond the known
 * file size trigger a fresh fstat() and fail with -EADDRNOTAVAIL if they
 * are still out of range. If the existing window already covers the
 * range it is reused; otherwise it is unmapped and a new window is
 * created with journal_file_map(). Requests smaller than
 * DEFAULT_WINDOW_SIZE are widened to that size, and the result is
 * clamped so that it does not reach beyond the end of the file. */
350 static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
359 assert(wt < _WINDOW_MAX);
361 if (offset + size > (uint64_t) f->last_stat.st_size) {
362 /* Hmm, out of range? Let's refresh the fstat() data
363 * first, before we trust that check. */
365 if (fstat(f->fd, &f->last_stat) < 0 ||
366 offset + size > (uint64_t) f->last_stat.st_size)
367 return -EADDRNOTAVAIL;
372 if (_likely_(w->ptr &&
373 w->offset <= offset &&
374 w->offset + w->size >= offset + size)) {
376 *ret = (uint8_t*) w->ptr + (offset - w->offset);
381 if (munmap(w->ptr, w->size) < 0)
385 w->size = w->offset = 0;
388 if (size < DEFAULT_WINDOW_SIZE) {
389 /* If the default window size is larger than what was
390 * asked for extend the mapping a bit in the hope to
391 * minimize needed remappings later on. We add half
392 * the window space before and half behind the
393 * requested mapping */
395 delta = (DEFAULT_WINDOW_SIZE - size) / 2;
401 size = DEFAULT_WINDOW_SIZE;
405 if (offset + size > (uint64_t) f->last_stat.st_size)
406 size = (uint64_t) f->last_stat.st_size - offset;
409 return -EADDRNOTAVAIL;
411 r = journal_file_map(f,
413 &w->ptr, &w->offset, &w->size,
419 *ret = (uint8_t*) p + delta;
/* For an uncompressed DATA object or a FIELD object, reads the hash
 * stored in the object (h1) and recomputes hash64() over the payload
 * (h2).
 * NOTE(review): the handling of other object types and the final
 * comparison are not visible in this excerpt. */
423 static bool verify_hash(Object *o) {
428 if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
429 h1 = le64toh(o->data.hash);
430 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
431 } else if (o->object.type == OBJECT_FIELD) {
432 h1 = le64toh(o->field.hash);
433 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
/* Maps the object located at offset. At first only the ObjectHeader is
 * mapped in order to read the object size; a negative type selects
 * WINDOW_UNKNOWN for this initial mapping. Sizes smaller than the
 * header and, for a non-negative type, a mismatching object type are
 * checked for. Objects larger than their header are then mapped again
 * in full, using the window that belongs to the object's own type. */
440 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
448 assert(type < _OBJECT_TYPE_MAX);
450 r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
455 s = le64toh(o->object.size);
457 if (s < sizeof(ObjectHeader))
460 if (type >= 0 && o->object.type != type)
463 if (s > sizeof(ObjectHeader)) {
464 r = journal_file_move_to(f, o->object.type, offset, s, &t);
/* Picks the sequence number for the next entry: one more than the
 * tail_seqnum stored in the header. The result becomes the new
 * tail_seqnum and, while head_seqnum is still zero, the head_seqnum
 * too. */
478 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
483 r = le64toh(f->header->tail_seqnum) + 1;
486 /* If an external seqnum counter was passed, we update
487 * both the local and the external one, and set it to
488 * the maximum of both */
496 f->header->tail_seqnum = htole64(r);
498 if (f->header->head_seqnum == 0)
499 f->header->head_seqnum = htole64(r);
/* Appends a new object of the given type and size at the end of the
 * file. The new offset is derived from tail_object_offset: it is either
 * the position right behind the header or the 64-bit aligned end of the
 * current tail object. Space is allocated, the object is mapped, its
 * type and size are filled in, and tail_object_offset and n_objects in
 * the header are updated. size must be at least sizeof(ObjectHeader).
 * NOTE(review): the condition choosing between the two offsets is not
 * visible in this excerpt. */
504 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
511 assert(size >= sizeof(ObjectHeader));
515 p = le64toh(f->header->tail_object_offset);
517 p = le64toh(f->header->header_size);
519 r = journal_file_move_to_object(f, -1, p, &tail);
523 p += ALIGN64(le64toh(tail->object.size));
526 r = journal_file_allocate(f, p, size);
530 r = journal_file_move_to(f, type, p, size, &t);
537 o->object.type = type;
538 o->object.size = htole64(size);
540 f->header->tail_object_offset = htole64(p);
541 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
/* Appends the DATA_HASH_TABLE object. Its item array is sized from
 * f->metrics.max_size, but never below DEFAULT_DATA_HASH_TABLE_SIZE. The
 * items are zeroed, and the offset and byte size of the item array are
 * recorded in the header. */
549 static int journal_file_setup_data_hash_table(JournalFile *f) {
556 /* We estimate that we need 1 hash table entry per 768 bytes of
557 journal file and we want to make sure we never get beyond
558 75% fill level. Calculate the hash table size for the
559 maximum file size based on these metrics. */
561 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
562 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
563 s = DEFAULT_DATA_HASH_TABLE_SIZE;
565 log_info("Reserving %llu entries in hash table.", (unsigned long long) (s / sizeof(HashItem)));
567 r = journal_file_append_object(f,
568 OBJECT_DATA_HASH_TABLE,
569 offsetof(Object, hash_table.items) + s,
574 memset(o->hash_table.items, 0, s);
576 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
577 f->header->data_hash_table_size = htole64(s);
/* Appends the FIELD_HASH_TABLE object with a fixed item array of
 * DEFAULT_FIELD_HASH_TABLE_SIZE bytes, zeroes the items, and records the
 * offset and byte size of the item array in the header. */
582 static int journal_file_setup_field_hash_table(JournalFile *f) {
589 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
590 r = journal_file_append_object(f,
591 OBJECT_FIELD_HASH_TABLE,
592 offsetof(Object, hash_table.items) + s,
597 memset(o->hash_table.items, 0, s);
599 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
600 f->header->field_hash_table_size = htole64(s);
/* Maps the data hash table, whose offset and size are read from the
 * header, through the WINDOW_DATA_HASH_TABLE window and stores the
 * resulting pointer in f->data_hash_table. */
605 static int journal_file_map_data_hash_table(JournalFile *f) {
612 p = le64toh(f->header->data_hash_table_offset);
613 s = le64toh(f->header->data_hash_table_size);
615 r = journal_file_move_to(f,
616 WINDOW_DATA_HASH_TABLE,
622 f->data_hash_table = t;
/* Maps the field hash table, whose offset and size are read from the
 * header, through the WINDOW_FIELD_HASH_TABLE window and stores the
 * resulting pointer in f->field_hash_table. */
626 static int journal_file_map_field_hash_table(JournalFile *f) {
633 p = le64toh(f->header->field_hash_table_offset);
634 s = le64toh(f->header->field_hash_table_size);
636 r = journal_file_move_to(f,
637 WINDOW_FIELD_HASH_TABLE,
643 f->field_hash_table = t;
/* Links the DATA object o, located at offset, into the data hash table.
 * The chaining and entry bookkeeping fields of the object are reset and
 * the bucket is chosen as hash modulo the number of hash items. The
 * object is hooked in either as the head of the bucket or behind the
 * previous tail object, whose next_hash_offset is patched. The tail of
 * the bucket is then pointed at the new object, and n_data is
 * incremented if the header is large enough to contain that field.
 * NOTE(review): the condition distinguishing an empty bucket is not
 * visible in this excerpt. */
647 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
654 assert(o->object.type == OBJECT_DATA);
656 /* This might alter the window we are looking at */
658 o->data.next_hash_offset = o->data.next_field_offset = 0;
659 o->data.entry_offset = o->data.entry_array_offset = 0;
660 o->data.n_entries = 0;
662 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
663 p = le64toh(f->data_hash_table[h].tail_hash_offset);
665 /* Only entry in the hash table is easy */
666 f->data_hash_table[h].head_hash_offset = htole64(offset);
668 /* Move back to the previous data object, to patch in
671 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
675 o->data.next_hash_offset = htole64(offset);
678 f->data_hash_table[h].tail_hash_offset = htole64(offset);
680 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
681 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
/* Searches for a DATA object with the given payload and hash. The
 * bucket is chosen as hash modulo the number of hash items, and its
 * chain is walked from head_hash_offset along next_hash_offset. Objects
 * with a different hash are not compared further. Compressed objects
 * are uncompressed into f->compress_buffer before the payload is
 * compared; uncompressed ones are matched by object size and memcmp().
 * NOTE(review): one visible path returns -EPROTONOSUPPORT, presumably
 * for compressed objects when compression support is unavailable; the
 * results for a match, a miss or an empty table are not visible in this
 * excerpt. */
686 int journal_file_find_data_object_with_hash(
688 const void *data, uint64_t size, uint64_t hash,
689 Object **ret, uint64_t *offset) {
691 uint64_t p, osize, h;
695 assert(data || size == 0);
697 osize = offsetof(Object, data.payload) + size;
699 if (f->header->data_hash_table_size == 0)
702 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
703 p = le64toh(f->data_hash_table[h].head_hash_offset);
708 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
712 if (le64toh(o->data.hash) != hash)
715 if (o->object.flags & OBJECT_COMPRESSED) {
719 l = le64toh(o->object.size);
720 if (l <= offsetof(Object, data.payload))
723 l -= offsetof(Object, data.payload);
725 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
729 memcmp(f->compress_buffer, data, size) == 0) {
740 return -EPROTONOSUPPORT;
743 } else if (le64toh(o->object.size) == osize &&
744 memcmp(o->data.payload, data, size) == 0) {
756 p = le64toh(o->data.next_hash_offset);
/* Convenience wrapper: computes hash64() of the payload and passes the
 * lookup on to journal_file_find_data_object_with_hash(). */
762 int journal_file_find_data_object(
764 const void *data, uint64_t size,
765 Object **ret, uint64_t *offset) {
770 assert(data || size == 0);
772 hash = hash64(data, size);
774 return journal_file_find_data_object_with_hash(f,
/* Stores a data payload in the file. An existing DATA object with the
 * same payload is looked up by hash first (presumably it is reused when
 * found; that handling is not visible in this excerpt). Otherwise a new
 * OBJECT_DATA is appended and the hash is stored in it. Payloads of at
 * least COMPRESSION_SIZE_THRESHOLD bytes may be compressed straight into
 * the object, in which case the object size is reduced and
 * OBJECT_COMPRESSED is set; payloads that were not compressed are copied
 * verbatim. The object is then linked into the data hash table, fed to
 * the HMAC and mapped again. */
779 static int journal_file_append_data(
781 const void *data, uint64_t size,
782 Object **ret, uint64_t *offset) {
788 bool compressed = false;
791 assert(data || size == 0);
793 hash = hash64(data, size);
795 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
809 osize = offsetof(Object, data.payload) + size;
810 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
814 o->data.hash = htole64(hash);
818 size >= COMPRESSION_SIZE_THRESHOLD) {
821 compressed = compress_blob(data, size, o->data.payload, &rsize);
824 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
825 o->object.flags |= OBJECT_COMPRESSED;
827 log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
832 if (!compressed && size > 0)
833 memcpy(o->data.payload, data, size);
835 r = journal_file_link_data(f, o, p, hash);
839 r = journal_file_hmac_put_object(f, OBJECT_DATA, p);
843 /* The linking might have altered the window, so let's
844 * refresh our pointer */
845 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* Returns the number of EntryItem slots in an ENTRY object: the object
 * size minus the fixed part in front of entry.items, divided by
 * sizeof(EntryItem). */
858 uint64_t journal_file_entry_n_items(Object *o) {
860 assert(o->object.type == OBJECT_ENTRY);
862 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
/* Returns the number of 64-bit offset slots in an ENTRY_ARRAY object:
 * the object size minus the fixed part in front of entry_array.items,
 * divided by sizeof(uint64_t). */
865 static uint64_t journal_file_entry_array_n_items(Object *o) {
867 assert(o->object.type == OBJECT_ENTRY_ARRAY);
869 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
/* Stores the entry offset p at position *idx of a chain of ENTRY_ARRAY
 * objects and increments *idx, which is kept in little-endian byte
 * order. The chain is walked along next_entry_array_offset; if a slot
 * for the index exists in one of the arrays it is written directly.
 * Otherwise a new array object is appended at q, fed to the HMAC, the
 * entry is stored in it, and the previous array at ap is made to point
 * at q through next_entry_array_offset.
 * NOTE(review): the loop structure, the sizing of the new array and the
 * handling of an empty chain are not visible in this excerpt. */
872 static int link_entry_into_array(JournalFile *f,
877 uint64_t n = 0, ap = 0, q, i, a, hidx;
886 i = hidx = le64toh(*idx);
889 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
893 n = journal_file_entry_array_n_items(o);
895 o->entry_array.items[i] = htole64(p);
896 *idx = htole64(hidx + 1);
902 a = le64toh(o->entry_array.next_entry_array_offset);
913 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
914 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
919 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, q);
923 o->entry_array.items[i] = htole64(p);
928 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
932 o->entry_array.next_entry_array_offset = htole64(q);
935 *idx = htole64(hidx + 1);
/* Variant of link_entry_into_array() in which the array chain is
 * addressed with the index *idx - 1; *idx itself is incremented
 * afterwards.
 * NOTE(review): presumably the very first element is kept in a separate
 * field outside the array chain; that branch is not visible in this
 * excerpt. */
940 static int link_entry_into_array_plus_one(JournalFile *f,
959 i = htole64(le64toh(*idx) - 1);
960 r = link_entry_into_array(f, first, &i, p);
965 *idx = htole64(le64toh(*idx) + 1);
/* Links the entry at offset into the entry list of the DATA object that
 * item i of the entry o refers to. The data object is mapped via the
 * object_offset stored in the item, and its entry_offset and
 * entry_array_offset fields are handed to
 * link_entry_into_array_plus_one(). */
969 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
976 p = le64toh(o->entry.items[i].object_offset);
980 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
984 return link_entry_into_array_plus_one(f,
985 &o->data.entry_offset,
986 &o->data.entry_array_offset,
/* Makes a freshly written ENTRY object reachable. After a memory
 * barrier the entry is added to the global entry array referenced from
 * the header (entry_array_offset, n_entries). The realtime and monotonic
 * timestamps of the head and tail entry in the header are updated, the
 * tail monotonic timestamp is flagged as valid, and finally the entry is
 * linked into each data object that its items refer to. */
991 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
998 assert(o->object.type == OBJECT_ENTRY);
1000 __sync_synchronize();
1002 /* Link up the entry itself */
1003 r = link_entry_into_array(f,
1004 &f->header->entry_array_offset,
1005 &f->header->n_entries,
1010 /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
1012 if (f->header->head_entry_realtime == 0)
1013 f->header->head_entry_realtime = o->entry.realtime;
1015 f->header->tail_entry_realtime = o->entry.realtime;
1016 f->header->tail_entry_monotonic = o->entry.monotonic;
1018 f->tail_entry_monotonic_valid = true;
1020 /* Link up the items */
1021 n = journal_file_entry_n_items(o);
1022 for (i = 0; i < n; i++) {
1023 r = journal_file_link_entry_item(f, o, offset, i);
/* Appends an ENTRY object that holds the given, already prepared items.
 * The object is sized for n_items EntryItem slots and filled with a new
 * sequence number, the items, the realtime and monotonic timestamps
 * from ts, the xor hash and the boot ID from the header. It is then fed
 * to the HMAC and linked up with journal_file_link_entry(). */
1031 static int journal_file_append_entry_internal(
1033 const dual_timestamp *ts,
1035 const EntryItem items[], unsigned n_items,
1037 Object **ret, uint64_t *offset) {
1044 assert(items || n_items == 0);
1047 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1049 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1053 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1054 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1055 o->entry.realtime = htole64(ts->realtime);
1056 o->entry.monotonic = htole64(ts->monotonic);
1057 o->entry.xor_hash = htole64(xor_hash);
1058 o->entry.boot_id = f->header->boot_id;
1060 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, np);
1064 r = journal_file_link_entry(f, o, np);
/* Triggers an IN_MODIFY notification for the journal file after it was
 * changed through the memory mapping. A failing ftruncate() is only
 * logged. */
1077 void journal_file_post_change(JournalFile *f) {
1080 /* inotify() does not receive IN_MODIFY events from file
1081 * accesses done via mmap(). After each access we hence
1082 * trigger IN_MODIFY by truncating the journal file to its
1083 * current size which triggers IN_MODIFY. */
1085 __sync_synchronize();
1087 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1088 log_error("Failed to to truncate file to its own size: %m");
/* Appends a complete journal entry built from n_iovec data fields. The
 * monotonic timestamp is compared against the tail entry's when that
 * one is valid, and a tag is appended first if one is due. Each iovec
 * is stored via journal_file_append_data(); its offset and hash are
 * recorded in an item array allocated with alloca(), and the hashes are
 * XORed into xor_hash. The entry object itself is then written with
 * journal_file_append_entry_internal(), and journal_file_post_change()
 * is called.
 * NOTE(review): the condition under which the current time is taken
 * with dual_timestamp_get() (presumably a NULL ts) and the result of
 * the monotonic check are not visible in this excerpt. */
1091 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1095 uint64_t xor_hash = 0;
1096 struct dual_timestamp _ts;
1099 assert(iovec || n_iovec == 0);
1105 dual_timestamp_get(&_ts);
1109 if (f->tail_entry_monotonic_valid &&
1110 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1113 r = journal_file_maybe_append_tag(f, ts->realtime);
1117 /* alloca() can't take 0, hence let's allocate at least one */
1118 items = alloca(sizeof(EntryItem) * MAX(1, n_iovec));
1120 for (i = 0; i < n_iovec; i++) {
1124 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1128 xor_hash ^= le64toh(o->data.hash);
1129 items[i].object_offset = htole64(p);
1130 items[i].hash = o->data.hash;
1133 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1135 journal_file_post_change(f);
/* Fetches the entry stored at a given index of a chain of ENTRY_ARRAY
 * objects. The chain is walked along next_entry_array_offset until the
 * array holding the index is reached, the entry offset p is read from
 * it, and the ENTRY object at p is mapped.
 * NOTE(review): the loop structure and the result of the
 * "a <= 0 || p <= 0" check are not visible in this excerpt. */
1140 static int generic_array_get(JournalFile *f,
1143 Object **ret, uint64_t *offset) {
1155 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1159 n = journal_file_entry_array_n_items(o);
1161 p = le64toh(o->entry_array.items[i]);
1166 a = le64toh(o->entry_array.next_entry_array_offset);
1169 if (a <= 0 || p <= 0)
1172 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Variant of generic_array_get() for lists with one extra element in
 * front of the array chain: the extra entry is mapped directly, all
 * other indexes are shifted down by one and passed on to
 * generic_array_get().
 * NOTE(review): the condition selecting the extra element (presumably
 * index 0) is not visible in this excerpt. */
1185 static int generic_array_get_plus_one(JournalFile *f,
1189 Object **ret, uint64_t *offset) {
1198 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1211 return generic_array_get(f, first, i-1, ret, offset);
/* Bisects a chain of ENTRY_ARRAY objects for an entry matching needle,
 * as judged by the test_object() callback. A TEST_FOUND result is
 * treated as TEST_RIGHT for DIRECTION_DOWN and as TEST_LEFT for
 * DIRECTION_UP. For each array in the chain one item is tested first
 * (lp/p), presumably to decide whether the match lies within this
 * array; the array is then bisected between left and right, or the walk
 * continues along next_entry_array_offset. subtract_one records that
 * the item in front of the one found has to be used. Finally the ENTRY
 * object is mapped, and *idx receives the position t + i within the
 * whole chain, reduced by one if subtract_one is set.
 * NOTE(review): large parts of the loop are not visible in this
 * excerpt; this description covers the visible statements only. */
1220 static int generic_array_bisect(JournalFile *f,
1224 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1225 direction_t direction,
1230 uint64_t a, p, t = 0, i = 0, last_p = 0;
1231 bool subtract_one = false;
1232 Object *o, *array = NULL;
1236 assert(test_object);
1240 uint64_t left, right, k, lp;
1242 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1246 k = journal_file_entry_array_n_items(array);
1252 lp = p = le64toh(array->entry_array.items[i]);
1256 r = test_object(f, p, needle);
1260 if (r == TEST_FOUND)
1261 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1263 if (r == TEST_RIGHT) {
1267 if (left == right) {
1268 if (direction == DIRECTION_UP)
1269 subtract_one = true;
1275 assert(left < right);
1277 i = (left + right) / 2;
1278 p = le64toh(array->entry_array.items[i]);
1282 r = test_object(f, p, needle);
1286 if (r == TEST_FOUND)
1287 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1289 if (r == TEST_RIGHT)
1297 if (direction == DIRECTION_UP) {
1299 subtract_one = true;
1310 a = le64toh(array->entry_array.next_entry_array_offset);
1316 if (subtract_one && t == 0 && i == 0)
1319 if (subtract_one && i == 0)
1321 else if (subtract_one)
1322 p = le64toh(array->entry_array.items[i-1]);
1324 p = le64toh(array->entry_array.items[i]);
1326 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1337 *idx = t + i + (subtract_one ? -1 : 0);
/* Variant of generic_array_bisect() for lists with one extra element in
 * front of the array chain. The extra element is tested with
 * test_object() first, with TEST_FOUND mapped to TEST_RIGHT or
 * TEST_LEFT depending on direction. The remaining n-1 elements are
 * searched with generic_array_bisect(); if that returns 0 and step_back
 * is set (DIRECTION_UP), the extra element is mapped instead.
 * NOTE(review): the early exits and the adjustment of the returned
 * index are not visible in this excerpt. */
1342 static int generic_array_bisect_plus_one(JournalFile *f,
1347 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1348 direction_t direction,
1354 bool step_back = false;
1358 assert(test_object);
1363 /* This bisects the array in object 'first', but first checks
1365 r = test_object(f, extra, needle);
1369 if (r == TEST_FOUND)
1370 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1372 /* if we are looking with DIRECTION_UP then we need to first
1373 see if in the actual array there is a matching entry, and
1374 return the last one of that. But if there isn't any we need
1375 to return this one. Hence remember this, and return it
1378 step_back = direction == DIRECTION_UP;
1380 if (r == TEST_RIGHT) {
1381 if (direction == DIRECTION_DOWN)
1387 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1389 if (r == 0 && step_back)
1398 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
/* Bisection callback that compares the entry offset p itself with
 * needle.
 * NOTE(review): the returned TEST_* values are not visible in this
 * excerpt. */
1414 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1420 else if (p < needle)
/* Seeks to an entry by bisecting the global entry array referenced from
 * the header (entry_array_offset, n_entries).
 * NOTE(review): presumably test_object_offset() is the comparison
 * callback; the remaining arguments are not visible in this excerpt. */
1426 int journal_file_move_to_entry_by_offset(
1429 direction_t direction,
1433 return generic_array_bisect(f,
1434 le64toh(f->header->entry_array_offset),
1435 le64toh(f->header->n_entries),
/* Bisection callback that maps the ENTRY object at p and compares its
 * sequence number with needle.
 * NOTE(review): the returned TEST_* values are not visible in this
 * excerpt. */
1443 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1450 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1454 if (le64toh(o->entry.seqnum) == needle)
1456 else if (le64toh(o->entry.seqnum) < needle)
/* Seeks to an entry by bisecting the global entry array referenced from
 * the header (entry_array_offset, n_entries).
 * NOTE(review): presumably test_object_seqnum() is the comparison
 * callback; the remaining arguments are not visible in this excerpt. */
1462 int journal_file_move_to_entry_by_seqnum(
1465 direction_t direction,
1469 return generic_array_bisect(f,
1470 le64toh(f->header->entry_array_offset),
1471 le64toh(f->header->n_entries),
/* Bisection callback that maps the ENTRY object at p and compares its
 * realtime timestamp with needle.
 * NOTE(review): the returned TEST_* values are not visible in this
 * excerpt. */
1478 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1485 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1489 if (le64toh(o->entry.realtime) == needle)
1491 else if (le64toh(o->entry.realtime) < needle)
/* Seeks to an entry by realtime timestamp: bisects the global entry
 * array referenced from the header (entry_array_offset, n_entries)
 * with test_object_realtime() as the comparison callback. */
1497 int journal_file_move_to_entry_by_realtime(
1500 direction_t direction,
1504 return generic_array_bisect(f,
1505 le64toh(f->header->entry_array_offset),
1506 le64toh(f->header->n_entries),
1508 test_object_realtime,
/* Bisection callback that maps the ENTRY object at p and compares its
 * monotonic timestamp with needle.
 * NOTE(review): the returned TEST_* values are not visible in this
 * excerpt. */
1513 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1520 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1524 if (le64toh(o->entry.monotonic) == needle)
1526 else if (le64toh(o->entry.monotonic) < needle)
/* Seeks to an entry by monotonic timestamp within one boot. The string
 * "_BOOT_ID=" followed by the formatted boot ID is looked up as a DATA
 * object, and the entry list of that data object is bisected with
 * test_object_monotonic(). The buffer t has room for the 9 character
 * prefix, 32 characters of ID and the terminating NUL. */
1532 int journal_file_move_to_entry_by_monotonic(
1536 direction_t direction,
1540 char t[9+32+1] = "_BOOT_ID=";
1546 sd_id128_to_string(boot_id, t + 9);
1547 r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1553 return generic_array_bisect_plus_one(f,
1554 le64toh(o->data.entry_offset),
1555 le64toh(o->data.entry_array_offset),
1556 le64toh(o->data.n_entries),
1558 test_object_monotonic,
/* Moves to the entry following (DIRECTION_DOWN) or preceding the entry
 * o at offset p in the global entry array. One visible path starts at
 * index 0 or n - 1 depending on direction (presumably when no current
 * entry is given). Otherwise o must be an ENTRY object whose index is
 * determined by bisection and then adjusted. The resulting index is
 * fetched with generic_array_get().
 * NOTE(review): the index adjustment and the bounds checks are not
 * visible in this excerpt. */
1563 int journal_file_next_entry(
1565 Object *o, uint64_t p,
1566 direction_t direction,
1567 Object **ret, uint64_t *offset) {
1573 assert(p > 0 || !o);
1575 n = le64toh(f->header->n_entries);
1580 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1582 if (o->object.type != OBJECT_ENTRY)
1585 r = generic_array_bisect(f,
1586 le64toh(f->header->entry_array_offset),
1587 le64toh(f->header->n_entries),
1596 if (direction == DIRECTION_DOWN) {
1609 /* And jump to it */
1610 return generic_array_get(f,
1611 le64toh(f->header->entry_array_offset),
/* Moves skip entries away from the entry o at offset p in the global
 * entry array; the visible arithmetic handles a negative skip by
 * subtracting and a positive one by adding. The index of the current
 * entry is determined by bisection, the new index is computed from it,
 * and the entry at the new index is fetched with generic_array_get().
 * NOTE(review): the clamping at both ends of the array is not visible
 * in this excerpt. */
1616 int journal_file_skip_entry(
1618 Object *o, uint64_t p,
1620 Object **ret, uint64_t *offset) {
1629 if (o->object.type != OBJECT_ENTRY)
1632 r = generic_array_bisect(f,
1633 le64toh(f->header->entry_array_offset),
1634 le64toh(f->header->n_entries),
1643 /* Calculate new index */
1645 if ((uint64_t) -skip >= i)
1648 i = i - (uint64_t) -skip;
1650 i += (uint64_t) skip;
1652 n = le64toh(f->header->n_entries);
1659 return generic_array_get(f,
1660 le64toh(f->header->entry_array_offset),
/* Like journal_file_next_entry(), but restricted to the entries that
 * reference the DATA object at data_offset. That data object is mapped
 * and its entry list (entry_offset, entry_array_offset, n_entries) is
 * used in place of the global entry array: the index of the current
 * entry is determined with generic_array_bisect_plus_one(), and the
 * resulting index is fetched with generic_array_get_plus_one().
 * NOTE(review): the index adjustment and the bounds checks are not
 * visible in this excerpt. */
1665 int journal_file_next_entry_for_data(
1667 Object *o, uint64_t p,
1668 uint64_t data_offset,
1669 direction_t direction,
1670 Object **ret, uint64_t *offset) {
1677 assert(p > 0 || !o);
1679 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1683 n = le64toh(d->data.n_entries);
1688 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1690 if (o->object.type != OBJECT_ENTRY)
1693 r = generic_array_bisect_plus_one(f,
1694 le64toh(d->data.entry_offset),
1695 le64toh(d->data.entry_array_offset),
1696 le64toh(d->data.n_entries),
1706 if (direction == DIRECTION_DOWN) {
1720 return generic_array_get_plus_one(f,
1721 le64toh(d->data.entry_offset),
1722 le64toh(d->data.entry_array_offset),
/* Seeks within the entries that reference the DATA object at
 * data_offset: maps that data object and bisects its entry list
 * (entry_offset, entry_array_offset, n_entries).
 * NOTE(review): presumably test_object_offset() is the comparison
 * callback; the remaining arguments are not visible in this excerpt. */
1727 int journal_file_move_to_entry_by_offset_for_data(
1729 uint64_t data_offset,
1731 direction_t direction,
1732 Object **ret, uint64_t *offset) {
1739 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1743 return generic_array_bisect_plus_one(f,
1744 le64toh(d->data.entry_offset),
1745 le64toh(d->data.entry_array_offset),
1746 le64toh(d->data.n_entries),
/* Seeks by monotonic timestamp within one boot, restricted to the
 * entries that reference the DATA object at data_offset. First the
 * "_BOOT_ID=..." data object for the boot is looked up (its offset is
 * kept in b) and its entry list is bisected with
 * test_object_monotonic(). Then the entry lists of the data object at
 * data_offset and of the boot ID object at b are bisected in turn until
 * an entry is found that is contained in both.
 * NOTE(review): the loop control and its termination condition are not
 * visible in this excerpt. */
1753 int journal_file_move_to_entry_by_monotonic_for_data(
1755 uint64_t data_offset,
1758 direction_t direction,
1759 Object **ret, uint64_t *offset) {
1761 char t[9+32+1] = "_BOOT_ID=";
1768 /* First, seek by time */
1769 sd_id128_to_string(boot_id, t + 9);
1770 r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
1776 r = generic_array_bisect_plus_one(f,
1777 le64toh(o->data.entry_offset),
1778 le64toh(o->data.entry_array_offset),
1779 le64toh(o->data.n_entries),
1781 test_object_monotonic,
1787 /* And now, continue seeking until we find an entry that
1788 * exists in both bisection arrays */
1794 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1798 r = generic_array_bisect_plus_one(f,
1799 le64toh(d->data.entry_offset),
1800 le64toh(d->data.entry_array_offset),
1801 le64toh(d->data.n_entries),
1809 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
1813 r = generic_array_bisect_plus_one(f,
1814 le64toh(o->data.entry_offset),
1815 le64toh(o->data.entry_array_offset),
1816 le64toh(o->data.n_entries),
/* Seeks within the entries that reference the DATA object at
 * data_offset: maps that data object and bisects its entry list
 * (entry_offset, entry_array_offset, n_entries).
 * NOTE(review): presumably test_object_seqnum() is the comparison
 * callback; the remaining arguments are not visible in this excerpt. */
1840 int journal_file_move_to_entry_by_seqnum_for_data(
1842 uint64_t data_offset,
1844 direction_t direction,
1845 Object **ret, uint64_t *offset) {
1852 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1856 return generic_array_bisect_plus_one(f,
1857 le64toh(d->data.entry_offset),
1858 le64toh(d->data.entry_array_offset),
1859 le64toh(d->data.n_entries),
/* Seeks by realtime timestamp within the entries that reference the
 * DATA object at data_offset: maps that data object and bisects its
 * entry list (entry_offset, entry_array_offset, n_entries) with
 * test_object_realtime() as the comparison callback. */
1866 int journal_file_move_to_entry_by_realtime_for_data(
1868 uint64_t data_offset,
1870 direction_t direction,
1871 Object **ret, uint64_t *offset) {
1878 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1882 return generic_array_bisect_plus_one(f,
1883 le64toh(d->data.entry_offset),
1884 le64toh(d->data.entry_array_offset),
1885 le64toh(d->data.n_entries),
1887 test_object_realtime,
/* Returns a pointer to the FSPRG state, which is located header_size
 * bytes behind the start of the mapped FSPRG header. The sum of
 * header_size and state_size is checked against f->fsprg_size.
 * NOTE(review): the values returned when authentication is off or when
 * the size check fails are not visible in this excerpt. */
1892 static void *fsprg_state(JournalFile *f) {
1896 if (!f->authenticate)
1899 a = le64toh(f->fsprg_header->header_size);
1900 b = le64toh(f->fsprg_header->state_size);
1902 if (a + b > f->fsprg_size)
1905 return (uint8_t*) f->fsprg_header + a;
/* Increments the n_tags counter in the header and stores the new value
 * back (presumably the new value is also the return value; the return
 * statement is not visible in this excerpt). */
1908 static uint64_t journal_file_tag_seqnum(JournalFile *f) {
1913 r = le64toh(f->header->n_tags) + 1;
1914 f->header->n_tags = htole64(r);
/* Appends a TAG object that seals what has been fed to the HMAC since
 * the last tag. The function checks f->authenticate and f->hmac_running
 * first. The tag object receives the next tag sequence number, is fed
 * to the HMAC itself, and then the HMAC value is copied into it.
 * Afterwards f->hmac_running is cleared.
 * NOTE(review): the results of the two early checks are not visible in
 * this excerpt. */
1919 int journal_file_append_tag(JournalFile *f) {
1926 if (!f->authenticate)
1929 if (!f->hmac_running)
1932 log_debug("Writing tag for epoch %llu\n", (unsigned long long) FSPRG_GetEpoch(fsprg_state(f)));
1936 r = journal_file_append_object(f, OBJECT_TAG, sizeof(struct TagObject), &o, &p);
1940 o->tag.seqnum = htole64(journal_file_tag_seqnum(f));
1942 /* Add the tag object itself, so that we can protect its
1943 * header. This will exclude the actual hash value in it */
1944 r = journal_file_hmac_put_object(f, OBJECT_TAG, p);
1948 /* Get the HMAC tag and store it in the object */
1949 memcpy(o->tag.tag, gcry_md_read(f->hmac, 0), TAG_LENGTH);
1950 f->hmac_running = false;
/* Starts a new HMAC cycle: resets the HMAC context, keys it with 256
 * bits obtained from the current FSPRG state and sets f->hmac_running.
 * The function checks f->authenticate and f->hmac_running first.
 * NOTE(review): the results of these two checks are not visible in this
 * excerpt. */
1955 static int journal_file_hmac_start(JournalFile *f) {
1956 uint8_t key[256 / 8]; /* Let's pass 256 bit from FSPRG to HMAC */
1960 if (!f->authenticate)
1963 if (f->hmac_running)
1966 /* Prepare HMAC for next cycle */
1967 gcry_md_reset(f->hmac);
1968 FSPRG_GetKey(fsprg_state(f), key, sizeof(key), 0);
1969 gcry_md_setkey(f->hmac, key, sizeof(key));
1971 f->hmac_running = true;
/* Computes the FSPRG epoch that belongs to the timestamp realtime: the
 * time elapsed since fsprg_start_usec divided by fsprg_interval_usec.
 * A zero start or interval and a realtime before the start are checked
 * for. Must only be called when f->authenticate is set.
 * NOTE(review): the results of the failed checks and the store to
 * *epoch are not visible in this excerpt. */
1976 static int journal_file_get_epoch(JournalFile *f, uint64_t realtime, uint64_t *epoch) {
1981 assert(f->authenticate);
1983 if (le64toh(f->fsprg_header->fsprg_start_usec) == 0 ||
1984 le64toh(f->fsprg_header->fsprg_interval_usec) == 0)
1987 if (realtime < le64toh(f->fsprg_header->fsprg_start_usec))
1990 t = realtime - le64toh(f->fsprg_header->fsprg_start_usec);
1991 t = t / le64toh(f->fsprg_header->fsprg_interval_usec);
/* Tells whether the FSPRG key has to be evolved for an entry with the
 * timestamp realtime: returns whether the epoch of the current FSPRG
 * state differs from the epoch computed for realtime. */
1997 static int journal_file_need_evolve(JournalFile *f, uint64_t realtime) {
1998 uint64_t goal, epoch;
2002 if (!f->authenticate)
2005 r = journal_file_get_epoch(f, realtime, &goal);
2009 epoch = FSPRG_GetEpoch(fsprg_state(f));
2013 return epoch != goal;
/* Evolves the FSPRG state towards the epoch that belongs to realtime.
 * The goal epoch is computed, the current epoch is read, and the state
 * is advanced with FSPRG_Evolve(), re-reading the epoch afterwards.
 * NOTE(review): the loop around FSPRG_Evolve() and its termination
 * condition are not visible in this excerpt. */
2016 static int journal_file_evolve(JournalFile *f, uint64_t realtime) {
2017 uint64_t goal, epoch;
2022 if (!f->authenticate)
2025 r = journal_file_get_epoch(f, realtime, &goal);
2029 epoch = FSPRG_GetEpoch(fsprg_state(f));
2031 log_debug("Evolving FSPRG key from epoch %llu to %llu.", (unsigned long long) epoch, (unsigned long long) goal);
2039 FSPRG_Evolve(fsprg_state(f));
2040 epoch = FSPRG_GetEpoch(fsprg_state(f));
/* Called before an entry with the timestamp realtime is written. The
 * visible sequence checks whether the key needs to be evolved, appends
 * a tag, evolves the key and starts a new HMAC cycle.
 * NOTE(review): presumably the last three steps only run when
 * journal_file_need_evolve() reports that an evolution is due; the
 * checks of the intermediate results are not visible in this excerpt. */
2044 static int journal_file_maybe_append_tag(JournalFile *f, uint64_t realtime) {
2049 if (!f->authenticate)
2052 r = journal_file_need_evolve(f, realtime);
2056 r = journal_file_append_tag(f);
2060 r = journal_file_evolve(f, realtime);
2064 r = journal_file_hmac_start(f);
/* Feeds the object of the given type at offset p to the HMAC. The HMAC
 * cycle is started if needed and the object is mapped. The object
 * header up to its payload member is always written; which further
 * parts follow depends on the object type, as described by the comments
 * on the individual cases.
 * NOTE(review): the case labels for the DATA, ENTRY and TAG branches
 * are not visible in this excerpt. */
2071 static int journal_file_hmac_put_object(JournalFile *f, int type, uint64_t p) {
2077 if (!f->authenticate)
2080 r = journal_file_hmac_start(f);
2084 r = journal_file_move_to_object(f, type, p, &o);
2088 gcry_md_write(f->hmac, o, offsetof(ObjectHeader, payload));
2090 switch (o->object.type) {
2093 /* Everything except hash and payload is mutable */
2094 gcry_md_write(f->hmac, &o->data.hash, sizeof(o->data.hash));
2095 gcry_md_write(f->hmac, o->data.payload, le64toh(o->object.size) - offsetof(DataObject, payload));
2100 gcry_md_write(f->hmac, &o->entry.seqnum, le64toh(o->object.size) - offsetof(EntryObject, seqnum));
2103 case OBJECT_FIELD_HASH_TABLE:
2104 case OBJECT_DATA_HASH_TABLE:
2105 case OBJECT_ENTRY_ARRAY:
2106 /* Nothing: everything is mutable */
2110 /* All but the tag itself */
2111 gcry_md_write(f->hmac, &o->tag.seqnum, sizeof(o->tag.seqnum));
/* Feeds parts of the file header into the running HMAC. Each
 * gcry_md_write() below covers one contiguous byte range of the Header
 * struct, expressed as the offsetof() difference between the first field of
 * the range and the first field after it; the fields named in the comment
 * block below are left out. */
2120 static int journal_file_hmac_put_header(JournalFile *f) {
/* Files without authentication take an early exit. */
2125 if (!f->authenticate)
/* Make sure an HMAC computation is in progress before writing to it. */
2128 r = journal_file_hmac_start(f);
2132 /* All but state+reserved, boot_id, arena_size,
2133 * tail_object_offset, n_objects, n_entries, tail_seqnum,
2134 * head_entry_realtime, tail_entry_realtime,
2135 * tail_entry_monotonic, n_data, n_fields, header_tag */
/* Range 1: signature up to (not including) state. */
2137 gcry_md_write(f->hmac, f->header->signature, offsetof(Header, state) - offsetof(Header, signature));
/* Range 2: file_id up to (not including) boot_id. */
2138 gcry_md_write(f->hmac, &f->header->file_id, offsetof(Header, boot_id) - offsetof(Header, file_id));
/* Range 3: seqnum_id up to (not including) arena_size. */
2139 gcry_md_write(f->hmac, &f->header->seqnum_id, offsetof(Header, arena_size) - offsetof(Header, seqnum_id));
/* Range 4: data_hash_table_offset up to (not including) tail_object_offset. */
2140 gcry_md_write(f->hmac, &f->header->data_hash_table_offset, offsetof(Header, tail_object_offset) - offsetof(Header, data_hash_table_offset));
/* Range 5: head_seqnum up to (not including) head_entry_realtime. */
2141 gcry_md_write(f->hmac, &f->header->head_seqnum, offsetof(Header, head_entry_realtime) - offsetof(Header, head_seqnum));
/* Loads the per-machine FSPRG state file for `f`. The file
 * /var/log/journal/<machine-id>/fsprg is opened read-write, its fixed-size
 * header is mapped read-only and validated, and on success the whole file
 * (header plus state) is mapped read-write into f->fsprg_header, with its
 * size recorded in f->fsprg_size.
 * NOTE(review): the error codes assigned in most failure branches and the
 * cleanup labels are not visible in this excerpt. */
2146 static int journal_file_load_fsprg(JournalFile *f) {
2150 FSPRGHeader *m = NULL;
/* Files without authentication take an early exit. */
2155 if (!f->authenticate)
2158 r = sd_id128_get_machine(&machine);
/* Build the path of the state file from the local machine ID. */
2162 if (asprintf(&p, "/var/log/journal/" SD_ID128_FORMAT_STR "/fsprg",
2163 SD_ID128_FORMAT_VAL(machine)) < 0)
2166 fd = open(p, O_RDWR|O_CLOEXEC|O_NOCTTY, 0600);
2168 log_error("Failed to open %s: %m", p);
2173 if (fstat(fd, &st) < 0) {
/* The file must at least hold a complete header before it is mapped. */
2178 if (st.st_size < (off_t) sizeof(FSPRGHeader)) {
/* First mapping: header only, read-only, used for validation. */
2183 m = mmap(NULL, PAGE_ALIGN(sizeof(FSPRGHeader)), PROT_READ, MAP_SHARED, fd, 0);
2184 if (m == MAP_FAILED) {
/* Validation: 8-byte signature. */
2190 if (memcmp(m->signature, FSPRG_HEADER_SIGNATURE, 8) != 0) {
/* Validation: any incompatible flag at all is refused. */
2195 if (m->incompatible_flags != 0) {
2196 r = -EPROTONOSUPPORT;
/* Validation: the recorded header size may be larger than ours, but not smaller. */
2200 if (le64toh(m->header_size) < sizeof(FSPRGHeader)) {
/* Validation: the recorded state size must match what the security parameter implies. */
2205 if (le64toh(m->state_size) != FSPRG_stateinbytes(m->secpar)) {
/* Validation: the file must be large enough for header plus state. */
2210 f->fsprg_size = le64toh(m->header_size) + le64toh(m->state_size);
2211 if ((uint64_t) st.st_size < f->fsprg_size) {
/* Validation: the state file must belong to this machine. */
2216 if (!sd_id128_equal(machine, m->machine_id)) {
/* Validation: start time and interval must be set.
 * NOTE(review): le64toh() presumably yields an unsigned value here, in which
 * case "<= 0" is just "== 0" -- confirm and consider spelling it that way. */
2221 if (le64toh(m->fsprg_start_usec) <= 0 ||
2222 le64toh(m->fsprg_interval_usec) <= 0) {
/* Second mapping: the complete file, writable. */
2227 f->fsprg_header = mmap(NULL, PAGE_ALIGN(f->fsprg_size), PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
2228 if (f->fsprg_header == MAP_FAILED) {
/* Store NULL rather than MAP_FAILED in the file object. */
2229 f->fsprg_header = NULL;
/* Drop the temporary read-only header mapping and the file descriptor. */
2238 munmap(m, PAGE_ALIGN(sizeof(FSPRGHeader)));
2241 close_nointr_nofail(fd);
/* Allocates the libgcrypt message-digest handle f->hmac, configured as an
 * HMAC over SHA-256 (GCRY_MD_SHA256 with GCRY_MD_FLAG_HMAC).
 * NOTE(review): the handling of the gcrypt error value `e` is not visible in
 * this excerpt. */
2247 static int journal_file_setup_hmac(JournalFile *f) {
/* Files without authentication take an early exit. */
2250 if (!f->authenticate)
2253 e = gcry_md_open(&f->hmac, GCRY_MD_SHA256, GCRY_MD_FLAG_HMAC);
/* Computes and appends the first tag of a file. The HMAC input is the
 * file header, the field hash table object and the data hash table object,
 * in that order; afterwards journal_file_append_tag() is called.
 * The header stores the offsets of the hash table *items*, so each is moved
 * back by offsetof(Object, hash_table.items) to reach the start of the
 * enclosing object. */
2260 static int journal_file_append_first_tag(JournalFile *f) {
/* Files without authentication take an early exit. */
2264 if (!f->authenticate)
2267 log_debug("Calculating first tag...");
2269 r = journal_file_hmac_put_header(f);
/* Field hash table: an offset smaller than the object header size cannot be valid. */
2273 p = le64toh(f->header->field_hash_table_offset);
2274 if (p < offsetof(Object, hash_table.items))
2276 p -= offsetof(Object, hash_table.items);
2278 r = journal_file_hmac_put_object(f, OBJECT_FIELD_HASH_TABLE, p);
/* Data hash table: same treatment. */
2282 p = le64toh(f->header->data_hash_table_offset);
2283 if (p < offsetof(Object, hash_table.items))
2285 p -= offsetof(Object, hash_table.items);
2287 r = journal_file_hmac_put_object(f, OBJECT_DATA_HASH_TABLE, p);
2291 r = journal_file_append_tag(f);
/* Debugging aid: prints the header summary followed by one "Type: ..." line
 * per object to stdout. The walk starts at the offset given by the header's
 * header_size field and advances by the 64-bit-aligned size of each object
 * until the object at tail_object_offset has been printed. Failure to read
 * an object ends in the "File corrupt" error message.
 * NOTE(review): the loop construct and some case labels are elided in this
 * excerpt. */
2298 void journal_file_dump(JournalFile *f) {
2305 journal_file_print_header(f);
/* The first object is located at the offset stored in header_size. */
2307 p = le64toh(f->header->header_size);
/* Type -1: no specific object type is requested here. */
2309 r = journal_file_move_to_object(f, -1, p, &o);
2313 switch (o->object.type) {
2316 printf("Type: OBJECT_UNUSED\n");
2320 printf("Type: OBJECT_DATA\n");
/* Entries additionally show sequence number, monotonic and realtime timestamps. */
2324 printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
2325 (unsigned long long) le64toh(o->entry.seqnum),
2326 (unsigned long long) le64toh(o->entry.monotonic),
2327 (unsigned long long) le64toh(o->entry.realtime));
2330 case OBJECT_FIELD_HASH_TABLE:
2331 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2334 case OBJECT_DATA_HASH_TABLE:
2335 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2338 case OBJECT_ENTRY_ARRAY:
2339 printf("Type: OBJECT_ENTRY_ARRAY\n");
/* Tags additionally show their sequence number. */
2343 printf("Type: OBJECT_TAG %llu\n",
2344 (unsigned long long) le64toh(o->tag.seqnum));
2348 if (o->object.flags & OBJECT_COMPRESSED)
2349 printf("Flags: COMPRESSED\n");
/* The tail object is the last one in the file. */
2351 if (p == le64toh(f->header->tail_object_offset))
/* Objects are laid out back to back on 64-bit boundaries. */
2354 p = p + ALIGN64(le64toh(o->object.size));
2359 log_error("File corrupt");
2362 void journal_file_print_header(JournalFile *f) {
2363 char a[33], b[33], c[33];
2364 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
2368 printf("File Path: %s\n"
2372 "Sequential Number ID: %s\n"
2374 "Compatible Flags:%s%s\n"
2375 "Incompatible Flags:%s%s\n"
2376 "Header size: %llu\n"
2377 "Arena size: %llu\n"
2378 "Data Hash Table Size: %llu\n"
2379 "Field Hash Table Size: %llu\n"
2381 "Entry Objects: %llu\n"
2382 "Rotate Suggested: %s\n"
2383 "Head Sequential Number: %llu\n"
2384 "Tail Sequential Number: %llu\n"
2385 "Head Realtime Timestamp: %s\n"
2386 "Tail Realtime Timestamp: %s\n",
2388 sd_id128_to_string(f->header->file_id, a),
2389 sd_id128_to_string(f->header->machine_id, b),
2390 sd_id128_to_string(f->header->boot_id, c),
2391 sd_id128_to_string(f->header->seqnum_id, c),
2392 f->header->state == STATE_OFFLINE ? "offline" :
2393 f->header->state == STATE_ONLINE ? "online" :
2394 f->header->state == STATE_ARCHIVED ? "archived" : "unknown",
2395 (f->header->compatible_flags & HEADER_COMPATIBLE_AUTHENTICATED) ? " AUTHENTICATED" : "",
2396 (f->header->compatible_flags & ~HEADER_COMPATIBLE_AUTHENTICATED) ? " ???" : "",
2397 (f->header->incompatible_flags & HEADER_INCOMPATIBLE_COMPRESSED) ? " COMPRESSED" : "",
2398 (f->header->incompatible_flags & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2399 (unsigned long long) le64toh(f->header->header_size),
2400 (unsigned long long) le64toh(f->header->arena_size),
2401 (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2402 (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2403 (unsigned long long) le64toh(f->header->n_objects),
2404 (unsigned long long) le64toh(f->header->n_entries),
2405 yes_no(journal_file_rotate_suggested(f)),
2406 (unsigned long long) le64toh(f->header->head_seqnum),
2407 (unsigned long long) le64toh(f->header->tail_seqnum),
2408 format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2409 format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)));
2411 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2412 printf("Data Objects: %llu\n"
2413 "Data Hash Table Fill: %.1f%%\n",
2414 (unsigned long long) le64toh(f->header->n_data),
2415 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2417 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2418 printf("Field Objects: %llu\n"
2419 "Field Hash Table Fill: %.1f%%\n",
2420 (unsigned long long) le64toh(f->header->n_fields),
2421 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
/* Opens (and, for an empty writable file, initializes) a journal file.
 * Only O_RDONLY and O_RDWR access modes and names ending in ".journal" are
 * accepted. A writable file of size zero is treated as newly created: it
 * gets a fresh header, hash tables and a first tag. An existing file has
 * its header verified instead. On failure the partially set up object is
 * released through journal_file_close().
 * NOTE(review): the first parameters of the signature, the conditions
 * guarding some steps and the error codes are elided in this excerpt. */
2424 int journal_file_open(
2430 JournalMetrics *metrics,
2431 JournalFile *template,
2432 JournalFile **ret) {
2436 bool newly_created = false;
/* Write-only access is not supported. */
2440 if ((flags & O_ACCMODE) != O_RDONLY &&
2441 (flags & O_ACCMODE) != O_RDWR)
2444 if (!endswith(fname, ".journal"))
2447 f = new0(JournalFile, 1);
2455 f->prot = prot_from_flags(flags);
2456 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2457 f->compress = compress;
2458 f->authenticate = authenticate;
2460 f->path = strdup(fname);
2466 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2472 if (fstat(f->fd, &f->last_stat) < 0) {
/* An empty file opened for writing is one we have to initialize ourselves. */
2477 if (f->last_stat.st_size == 0 && f->writable) {
2478 newly_created = true;
2480 /* Try to load the FSPRG state, and if we can't, then
2481 * just don't do authentication */
2482 r = journal_file_load_fsprg(f);
2484 f->authenticate = false;
2486 r = journal_file_init_header(f, template);
/* Refresh the cached stat data after journal_file_init_header(). */
2490 if (fstat(f->fd, &f->last_stat) < 0) {
/* Too small to hold even the oldest supported header layout. */
2496 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2501 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2502 if (f->header == MAP_FAILED) {
/* Pre-existing files are checked before anything in them is trusted. */
2508 if (!newly_created) {
2509 r = journal_file_verify_header(f);
/* Pre-existing writable files load the FSPRG state here; new files did so above. */
2514 if (!newly_created && f->writable) {
2515 r = journal_file_load_fsprg(f);
/* Metrics: fill in defaults for the caller-supplied set and copy it, or inherit the template's. */
2522 journal_default_metrics(metrics, f->fd);
2523 f->metrics = *metrics;
2524 } else if (template)
2525 f->metrics = template->metrics;
2527 r = journal_file_refresh_header(f);
2531 r = journal_file_setup_hmac(f);
/* New files get their hash tables created and the initial tag written. */
2536 if (newly_created) {
2537 r = journal_file_setup_field_hash_table(f);
2541 r = journal_file_setup_data_hash_table(f);
2545 r = journal_file_append_first_tag(f);
2550 r = journal_file_map_field_hash_table(f);
2554 r = journal_file_map_data_hash_table(f);
/* Failure path: release everything acquired so far. */
2564 journal_file_close(f);
/* Rotates a writable journal file. The current file is renamed to an
 * archive name built from its path, marked STATE_ARCHIVED and closed, and a
 * new file is opened under the original path with the old file as template.
 * The archive name is the old path without its ".journal" suffix, one
 * separator character, the 32-character seqnum ID, and
 * "-<tail seqnum>-<tail entry realtime>.journal" (both 16 hex digits).
 * NOTE(review): the store of the separator character at p[l - 8] and the
 * hand-over of new_file to *f are elided in this excerpt; the vacuum code
 * in this file expects the separator to be '@'. */
2569 int journal_file_rotate(JournalFile **f, bool compress, bool authenticate) {
2572 JournalFile *old_file, *new_file = NULL;
2580 if (!old_file->writable)
2583 if (!endswith(old_file->path, ".journal"))
2586 l = strlen(old_file->path);
/* l - 8 path bytes + 1 separator + 32 ID + "-" + 16 + "-" + 16 + 8 (".journal") + NUL == l + 68. */
2588 p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
/* Copy the path without the 8-character ".journal" suffix. */
2592 memcpy(p, old_file->path, l - 8);
/* The ID goes right after the separator position. */
2594 sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2595 snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2596 "-%016llx-%016llx.journal",
2597 (unsigned long long) le64toh((*f)->header->tail_seqnum),
2598 (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
2600 r = rename(old_file->path, p);
2606 old_file->header->state = STATE_ARCHIVED;
/* The new file is opened while the old one is still alive, because it serves as the template. */
2608 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, authenticate, NULL, old_file, &new_file);
2609 journal_file_close(old_file);
/* Like journal_file_open(), but copes with unusable files. If the first
 * open attempt fails with one of the error codes listed below, and the
 * request is a creating, writable open of a "*.journal" name, the existing
 * file is renamed to "<name>@<16 hex>-<16 hex>.journal~" and the open is
 * retried exactly once. Any other result of the first attempt is not retried.
 * NOTE(review): the first parameters of the signature and the second value
 * used in the new name are elided in this excerpt. */
2615 int journal_file_open_reliably(
2621 JournalMetrics *metrics,
2622 JournalFile *template,
2623 JournalFile **ret) {
2629 r = journal_file_open(fname, flags, mode, compress, authenticate, metrics, template, ret);
/* Only these failures indicate a file that is worth replacing. */
2630 if (r != -EBADMSG && /* corrupted */
2631 r != -ENODATA && /* truncated */
2632 r != -EHOSTDOWN && /* other machine */
2633 r != -EPROTONOSUPPORT && /* incompatible feature */
2634 r != -EBUSY && /* unclean shutdown */
2635 r != -ESHUTDOWN /* already archived */)
/* Read-only callers cannot replace the file. */
2638 if ((flags & O_ACCMODE) == O_RDONLY)
/* Without O_CREAT the retry could not create a replacement. */
2641 if (!(flags & O_CREAT))
2644 if (!endswith(fname, ".journal"))
2647 /* The file is corrupted. Rotate it away and try it again (but only once) */
2650 if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2652 (unsigned long long) now(CLOCK_REALTIME),
2656 r = rename(fname, p);
2661 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2663 return journal_file_open(fname, flags, mode, compress, authenticate, metrics, template, ret);
/* One candidate file considered by journal_directory_vacuum(); an array of
 * these is sorted with vacuum_compare().
 * NOTE(review): most members are elided in this excerpt. */
2666 struct vacuum_info {
/* ID of the sequence-number series; seqnum values are only compared between entries whose IDs are equal. */
2671 sd_id128_t seqnum_id;
/* qsort() comparison function for struct vacuum_info. Ordering keys, in
 * order of precedence:
 *   1. seqnum, but only if both entries carry one and share a seqnum_id;
 *   2. realtime;
 *   3. the raw 16 bytes of seqnum_id, if both entries carry a seqnum;
 *   4. the file name.
 * NOTE(review): the values returned by the "<" and ">" branches are elided
 * in this excerpt; confirm the sort direction in the full file. */
2677 static int vacuum_compare(const void *_a, const void *_b) {
2678 const struct vacuum_info *a, *b;
/* Sequence numbers are only comparable within the same series. */
2683 if (a->have_seqnum && b->have_seqnum &&
2684 sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
2685 if (a->seqnum < b->seqnum)
2687 else if (a->seqnum > b->seqnum)
/* Otherwise order by wallclock time. */
2693 if (a->realtime < b->realtime)
2695 else if (a->realtime > b->realtime)
/* Tie-breakers for identical timestamps. */
2697 else if (a->have_seqnum && b->have_seqnum)
2698 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
2700 return strcmp(a->filename, b->filename);
/* Deletes old journal files from `directory` until the disk usage of the
 * candidates is at most `max_use` and the file system has at least
 * `min_free` bytes available.
 * Candidates are regular files named either
 *   ...@<32 hex seqnum id>-<16 hex seqnum>-<16 hex realtime>.journal  (archived)
 *   ...@<16 hex realtime>-<16 hex>.journal~                           (corrupted)
 * They are collected with their disk usage, sorted with vacuum_compare()
 * and unlinked in that order.
 * NOTE(review): the loop header, the `continue`/error statements and the
 * cleanup of `d` and `list` are elided in this excerpt. */
2703 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
2706 struct vacuum_info *list = NULL;
2707 unsigned n_list = 0, n_allocated = 0, i;
2715 d = opendir(directory);
2721 struct dirent buf, *de;
2725 unsigned long long seqnum = 0, realtime;
2726 sd_id128_t seqnum_id;
2729 k = readdir_r(d, &buf, &de);
/* Symlinks are not followed; only regular files are candidates. */
2738 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
2741 if (!S_ISREG(st.st_mode))
2744 q = strlen(de->d_name);
2746 if (endswith(de->d_name, ".journal")) {
2748 /* Vacuum archived files */
/* Minimum length: '@' + 32 + '-' + 16 + '-' + 16 + ".journal". */
2750 if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
/* The separators must sit at their fixed distances from the end of the name. */
2753 if (de->d_name[q-8-16-1] != '-' ||
2754 de->d_name[q-8-16-1-16-1] != '-' ||
2755 de->d_name[q-8-16-1-16-1-32-1] != '@')
/* Keep an unmodified copy of the name; d_name is cut in place below. */
2758 p = strdup(de->d_name);
/* Terminate the string right after the 32-character ID so it can be parsed on its own. */
2764 de->d_name[q-8-16-1-16-1] = 0;
2765 if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
2770 if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
2777 } else if (endswith(de->d_name, ".journal~")) {
2778 unsigned long long tmp;
2780 /* Vacuum corrupted files */
/* Minimum length: '@' + 16 + '-' + 16 + ".journal" + '~'. */
2782 if (q < 1 + 16 + 1 + 16 + 8 + 1)
2785 if (de->d_name[q-1-8-16-1] != '-' ||
2786 de->d_name[q-1-8-16-1-16-1] != '@')
2789 p = strdup(de->d_name);
/* Only the first number (realtime) is kept; the second is parsed into tmp and not used further here. */
2795 if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
2800 have_seqnum = false;
/* Grow the candidate array geometrically, starting at 8 slots. */
2804 if (n_list >= n_allocated) {
2805 struct vacuum_info *j;
2807 n_allocated = MAX(n_allocated * 2U, 8U);
2808 j = realloc(list, n_allocated * sizeof(struct vacuum_info));
2818 list[n_list].filename = p;
/* Disk usage is computed from st_blocks, counted in 512-byte units. */
2819 list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
2820 list[n_list].seqnum = seqnum;
2821 list[n_list].realtime = realtime;
2822 list[n_list].seqnum_id = seqnum_id;
2823 list[n_list].have_seqnum = have_seqnum;
2825 sum += list[n_list].usage;
2831 qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
2833 for(i = 0; i < n_list; i++) {
/* Free space is re-queried on every iteration. */
2836 if (fstatvfs(dirfd(d), &ss) < 0) {
/* Both limits are satisfied. */
2841 if (sum <= max_use &&
2842 (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
2845 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
2846 log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
2847 sum -= list[i].usage;
/* A file that is already gone (ENOENT) is not reported. */
2848 } else if (errno != ENOENT)
2849 log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
/* Release the file name copies made with strdup() above. */
2853 for (i = 0; i < n_list; i++)
2854 free(list[i].filename);
/* Copies the entry object `o`, located at offset `p` in `from`, into `to`.
 * Every data object the entry references is read from `from`
 * (decompressed if necessary) and appended to `to`; the resulting offsets
 * and hashes form the item list of the new entry, which is finally written
 * with journal_file_append_entry_internal(). `seqnum`, `ret` and `offset`
 * are passed through to that call.
 * NOTE(review): the conditional-compilation lines around the decompression
 * branch and most error returns are elided in this excerpt. */
2864 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2866 uint64_t q, xor_hash = 0;
2879 ts.monotonic = le64toh(o->entry.monotonic);
2880 ts.realtime = le64toh(o->entry.realtime);
/* Compare against the destination's last monotonic timestamp, if that one is valid. */
2882 if (to->tail_entry_monotonic_valid &&
2883 ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2886 n = journal_file_entry_n_items(o);
/* NOTE(review): stack allocation proportional to the item count, with no upper bound visible here. */
2887 items = alloca(sizeof(EntryItem) * n);
2889 for (i = 0; i < n; i++) {
2896 q = le64toh(o->entry.items[i].object_offset);
2897 le_hash = o->entry.items[i].hash;
/* From here on `o` refers to the data object, no longer to the entry. */
2899 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
/* The hash stored in the entry item must match the one in the data object. */
2903 if (le_hash != o->data.hash)
/* Payload length = object size minus everything in front of the payload. */
2906 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2909 /* We hit the limit on 32bit machines */
2910 if ((uint64_t) t != l)
2913 if (o->object.flags & OBJECT_COMPRESSED) {
/* Decompress into the source file's reusable buffer. */
2917 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2920 data = from->compress_buffer;
2923 return -EPROTONOSUPPORT;
2926 data = o->data.payload;
2928 r = journal_file_append_data(to, data, l, &u, &h);
/* Accumulate the XOR of all data hashes for the new entry. */
2932 xor_hash ^= le64toh(u->data.hash);
2933 items[i].object_offset = htole64(h);
2934 items[i].hash = u->data.hash;
/* Point `o` back at the source entry before the next iteration reads o->entry.items. */
2936 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2941 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
/* Replaces unset values in `m` by defaults and sanitizes the rest. A field
 * equal to (uint64_t) -1 counts as unset. Defaults for max_use and
 * keep_free are derived from the size of the file system `fd` lives on
 * (10% and 5% of it, clamped by the DEFAULT_* bounds); max_size defaults
 * to one eighth of max_use. The resulting values are logged.
 * NOTE(review): the braces and else-lines that tie the statements below to
 * their conditions are elided in this excerpt; the comments describe the
 * visible assignments only. */
2944 void journal_default_metrics(JournalMetrics *m, int fd) {
/* Stays 0 if the file system size cannot be determined. */
2945 uint64_t fs_size = 0;
2947 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
/* NOTE(review): unlike the free-space computation in journal_directory_vacuum(),
 * the operands are not widened to uint64_t before multiplying -- check for
 * wrap-around on platforms where both types are 32 bit. */
2952 if (fstatvfs(fd, &ss) >= 0)
2953 fs_size = ss.f_frsize * ss.f_blocks;
/* --- max_use --- */
2955 if (m->max_use == (uint64_t) -1) {
2958 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2960 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2961 m->max_use = DEFAULT_MAX_USE_UPPER;
2963 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2964 m->max_use = DEFAULT_MAX_USE_LOWER;
2966 m->max_use = DEFAULT_MAX_USE_LOWER;
/* Page-align the value and keep room for at least two minimum-size files. */
2968 m->max_use = PAGE_ALIGN(m->max_use);
2970 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2971 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
/* --- max_size --- */
2974 if (m->max_size == (uint64_t) -1) {
2975 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2977 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2978 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2980 m->max_size = PAGE_ALIGN(m->max_size);
2982 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2983 m->max_size = JOURNAL_FILE_SIZE_MIN;
/* max_use is raised if necessary so that it is at least twice max_size. */
2985 if (m->max_size*2 > m->max_use)
2986 m->max_use = m->max_size*2;
/* --- min_size --- */
2988 if (m->min_size == (uint64_t) -1)
2989 m->min_size = JOURNAL_FILE_SIZE_MIN;
2991 m->min_size = PAGE_ALIGN(m->min_size);
2993 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2994 m->min_size = JOURNAL_FILE_SIZE_MIN;
/* min_size wins over max_size when the two conflict. */
2996 if (m->min_size > m->max_size)
2997 m->max_size = m->min_size;
/* --- keep_free --- */
3000 if (m->keep_free == (uint64_t) -1) {
3003 m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
3005 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3006 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3009 m->keep_free = DEFAULT_KEEP_FREE;
3012 log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
3013 format_bytes(a, sizeof(a), m->max_use),
3014 format_bytes(b, sizeof(b), m->max_size),
3015 format_bytes(c, sizeof(c), m->min_size),
3016 format_bytes(d, sizeof(d), m->keep_free));
/* Reports the realtime timestamps of the first and last entry of the file,
 * taken from the head_entry_realtime and tail_entry_realtime header fields.
 * A stored value of zero is handled separately from a real timestamp.
 * NOTE(review): the NULL checks on `from`/`to` and the return values are
 * elided in this excerpt. */
3019 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
/* The zero test needs no byte swap: zero is the same in either byte order. */
3024 if (f->header->head_entry_realtime == 0)
3027 *from = le64toh(f->header->head_entry_realtime);
3031 if (f->header->tail_entry_realtime == 0)
3034 *to = le64toh(f->header->tail_entry_realtime);
/* Reports the monotonic timestamps of the first and last entry that belong
 * to boot `boot_id`. The entries are found through the data object for the
 * field "_BOOT_ID=<id>": its entry_offset leads to the first entry, and the
 * last one is looked up by index n_entries - 1 via
 * generic_array_get_plus_one().
 * NOTE(review): the NULL checks on `from`/`to`, the remaining arguments of
 * the array lookup and the return values are elided in this excerpt. */
3040 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
/* 9 characters of prefix + 32 characters of ID + terminating NUL. */
3041 char t[9+32+1] = "_BOOT_ID=";
3049 sd_id128_to_string(boot_id, t + 9);
3051 r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
/* A data object that no entry references yields no timestamps. */
3055 if (le64toh(o->data.n_entries) <= 0)
/* First entry referencing this boot ID. */
3059 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3063 *from = le64toh(o->entry.monotonic);
/* `o` may point at an entry by now, so fetch the data object at `p` again. */
3067 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* Last entry referencing this boot ID. */
3071 r = generic_array_get_plus_one(f,
3072 le64toh(o->data.entry_offset),
3073 le64toh(o->data.entry_array_offset),
3074 le64toh(o->data.n_entries)-1,
3079 *to = le64toh(o->entry.monotonic);
3085 bool journal_file_rotate_suggested(JournalFile *f) {
3088 /* If we gained new header fields we gained new features,
3089 * hence suggest a rotation */
3090 if (le64toh(f->header->header_size) < sizeof(Header)) {
3091 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3095 /* Let's check if the hash tables grew over a certain fill
3096 * level (75%, borrowing this value from Java's hash table
3097 * implementation), and if so suggest a rotation. To calculate
3098 * the fill level we need the n_data field, which only exists
3099 * in newer versions. */
3101 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3102 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3103 log_debug("Data hash table of %s has a fill level at %.1f (%llu of %llu items, %llu file size, %llu bytes per hash table item), suggesting rotation.",
3105 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3106 (unsigned long long) le64toh(f->header->n_data),
3107 (unsigned long long) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)),
3108 (unsigned long long) (f->last_stat.st_size),
3109 (unsigned long long) (f->last_stat.st_size / le64toh(f->header->n_data)));
3113 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3114 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3115 log_debug("Field hash table of %s has a fill level at %.1f (%llu of %llu items), suggesting rotation.",
3117 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3118 (unsigned long long) le64toh(f->header->n_fields),
3119 (unsigned long long) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)));