1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
30 #include "journal-def.h"
31 #include "journal-file.h"
/* Default byte sizes of the data and field hash tables, each expressed
 * as a fixed number of HashItem slots. */
36 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
37 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
/* Requests smaller than this are widened to this mapping size (8 MiB) */
39 #define DEFAULT_WINDOW_SIZE (8ULL*1024ULL*1024ULL)
/* Payloads of at least this many bytes are candidates for compression */
41 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
43 /* This is the minimum journal file size */
44 #define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL) /* 64 KiB */
46 /* These are the lower and upper bounds if we deduce the max_use value
47 * from the file system size */
48 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
49 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
51 /* This is the upper bound if we deduce max_size from max_use */
52 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
54 /* This is the upper bound if we deduce the keep_free value from the
56 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
58 /* This is the keep_free value when we can't determine the system
60 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
62 /* n_data was the first entry we added after the initial file format design */
63 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
/* Rounds x up to the next multiple of 8 (i.e. 64-bit alignment) */
65 #define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
/* True if the header_size recorded in header h is large enough to
 * include the complete member 'field' of Header. */
67 #define JOURNAL_HEADER_CONTAINS(h, field) \
68 (le64toh((h)->header_size) >= offsetof(Header, field) + sizeof((h)->field))
70 static int journal_file_maybe_append_tag(JournalFile *f, uint64_t realtime);
/* Releases the resources held by a JournalFile: unmaps every mapped
 * window, switches a writable file's header from STATE_ONLINE to
 * STATE_OFFLINE (any other state, e.g. archived, is left as is), unmaps
 * the header, closes the file descriptor, frees the compression buffer,
 * unmaps the FSPRG header and closes the HMAC handle. */
72 void journal_file_close(JournalFile *f) {
77 /* Sync everything to disk, before we mark the file offline */
78 for (t = 0; t < _WINDOW_MAX; t++)
79 if (f->windows[t].ptr)
80 munmap(f->windows[t].ptr, f->windows[t].size);
82 if (f->writable && f->fd >= 0)
86 /* Mark the file offline. Don't override the archived state if it already is set */
87 if (f->writable && f->header->state == STATE_ONLINE)
88 f->header->state = STATE_OFFLINE;
90 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
94 close_nointr_nofail(f->fd);
99 free(f->compress_buffer);
104 munmap(f->fsprg_header, PAGE_ALIGN(f->fsprg_size));
107 gcry_md_close(f->hmac);
/* Builds a fresh Header in a local variable and writes it to file
 * offset 0 with pwrite(). The header gets the file signature, an
 * 8-byte-aligned header_size, flag words reflecting f->compress and
 * f->authenticate, and a randomly generated file_id. seqnum_id and
 * tail_seqnum may be inherited from the header of 'template'; otherwise
 * seqnum_id is set to the new file_id. */
113 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
121 memcpy(h.signature, HEADER_SIGNATURE, 8);
122 h.header_size = htole64(ALIGN64(sizeof(h)));
124 h.incompatible_flags =
125 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
128 htole32(f->authenticate ? HEADER_COMPATIBLE_AUTHENTICATED : 0);
130 r = sd_id128_randomize(&h.file_id);
135 h.seqnum_id = template->header->seqnum_id;
136 h.tail_seqnum = template->header->tail_seqnum;
138 h.seqnum_id = h.file_id;
140 k = pwrite(f->fd, &h, sizeof(h), 0);
/* Updates the mapped header for the current session: stores the local
 * machine ID, compares the current boot ID with the one recorded in the
 * header (when they match, the tail monotonic timestamp is flagged as
 * valid), stores the current boot ID, sets the state to STATE_ONLINE
 * and synchronously msync()s the header page. */
150 static int journal_file_refresh_header(JournalFile *f) {
156 r = sd_id128_get_machine(&f->header->machine_id);
160 r = sd_id128_get_boot(&boot_id);
164 if (sd_id128_equal(boot_id, f->header->boot_id))
165 f->tail_entry_monotonic_valid = true;
167 f->header->boot_id = boot_id;
169 f->header->state = STATE_ONLINE;
171 /* Sync the online state to disk */
172 msync(f->header, PAGE_ALIGN(sizeof(Header)), MS_SYNC);
/* Sanity-checks the mapped header of a journal file:
 *  - the signature must equal HEADER_SIGNATURE;
 *  - unknown incompatible flags, and unknown compatible flags, are
 *    answered with -EPROTONOSUPPORT (two variants of each check exist,
 *    one tolerating the known flag bit and one requiring the flag word
 *    to be zero; presumably selected at build time, TODO confirm);
 *  - header_size must be at least HEADER_SIZE_MIN;
 *  - the file size from the last fstat() must cover
 *    header_size + arena_size;
 *  - the header's machine ID is compared with the local machine ID;
 *  - the state is inspected, with debug messages for an already-online
 *    file and for unknown state values.
 * It also derives f->compress and f->authenticate from the header's
 * flag bits.
 * NOTE(review): header_size + arena_size is an unchecked 64-bit
 * addition on values read from the file; confirm whether wrap-around
 * needs to be guarded against. */
178 static int journal_file_verify_header(JournalFile *f) {
181 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
184 /* In both read and write mode we refuse to open files with
185 * incompatible flags we don't know */
187 if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
188 return -EPROTONOSUPPORT;
190 if (f->header->incompatible_flags != 0)
191 return -EPROTONOSUPPORT;
194 /* When open for writing we refuse to open files with
195 * compatible flags, too */
198 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_AUTHENTICATED) != 0)
199 return -EPROTONOSUPPORT;
201 if (f->header->compatible_flags != 0)
202 return -EPROTONOSUPPORT;
206 /* The first addition was n_data, so check that we are at least this large */
207 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
210 if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
215 sd_id128_t machine_id;
218 r = sd_id128_get_machine(&machine_id);
222 if (!sd_id128_equal(machine_id, f->header->machine_id))
225 state = f->header->state;
227 if (state == STATE_ONLINE) {
228 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
230 } else if (state == STATE_ARCHIVED)
232 else if (state != STATE_OFFLINE) {
233 log_debug("Journal file %s has unknown state %u.", f->path, state);
238 f->compress = !!(le32toh(f->header->incompatible_flags) & HEADER_INCOMPATIBLE_COMPRESSED);
239 f->authenticate = !!(le32toh(f->header->compatible_flags) & HEADER_COMPATIBLE_AUTHENTICATED);
/* Grows the file so that the range [offset, offset+size) is backed by
 * allocated space. The current size is taken as header_size +
 * arena_size from the header; the target is the page-aligned end of
 * the requested range, but never less than header_size. If the target
 * does not exceed the current size nothing needs to be allocated.
 * Growth is checked against f->metrics.max_size and, for targets above
 * f->metrics.min_size, against the free space reported by fstatvfs()
 * minus f->metrics.keep_free. Only the delta between old and new size
 * is passed to posix_fallocate(); afterwards f->last_stat is refreshed
 * and arena_size in the header is updated.
 * NOTE(review): free space is computed from svfs.f_bfree, which also
 * counts blocks reserved for privileged processes; f_bavail is the
 * count available to unprivileged ones. Confirm which is intended. */
244 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
245 uint64_t old_size, new_size;
250 /* We assume that this file is not sparse, and we know that
251 * for sure, since we always call posix_fallocate()
255 le64toh(f->header->header_size) +
256 le64toh(f->header->arena_size);
258 new_size = PAGE_ALIGN(offset + size);
259 if (new_size < le64toh(f->header->header_size))
260 new_size = le64toh(f->header->header_size);
262 if (new_size <= old_size)
265 if (f->metrics.max_size > 0 &&
266 new_size > f->metrics.max_size)
269 if (new_size > f->metrics.min_size &&
270 f->metrics.keep_free > 0) {
273 if (fstatvfs(f->fd, &svfs) >= 0) {
276 available = svfs.f_bfree * svfs.f_bsize;
278 if (available >= f->metrics.keep_free)
279 available -= f->metrics.keep_free;
283 if (new_size - old_size > available)
288 /* Note that the glibc fallocate() fallback is very
289 inefficient, hence we try to minimize the allocation area
291 r = posix_fallocate(f->fd, old_size, new_size - old_size);
295 if (fstat(f->fd, &f->last_stat) < 0)
298 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
/* Maps a byte range of the journal file into memory. The requested
 * offset is rounded down to a page boundary and the length is extended
 * by the rounding slack and then page-aligned. Mappings that would
 * reach beyond the page-aligned file size (per f->last_stat) are
 * refused with -EADDRNOTAVAIL. The range is mapped MAP_SHARED with
 * f->prot, and *ret receives the address corresponding to the
 * originally requested offset inside the mapping. */
303 static int journal_file_map(
312 uint64_t woffset, wsize;
319 woffset = offset & ~((uint64_t) page_size() - 1ULL);
320 wsize = size + (offset - woffset);
321 wsize = PAGE_ALIGN(wsize);
323 /* Avoid SIGBUS on invalid accesses */
324 if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size))
325 return -EADDRNOTAVAIL;
327 window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
328 if (window == MAP_FAILED)
340 *ret = (uint8_t*) window + (offset - woffset);
/* Makes the file range [offset, offset+size) accessible through the
 * window with index 'wt' and returns a pointer to it in *ret.
 * If the range lies beyond the cached file size, the fstat() data is
 * refreshed once; if it is still out of range, -EADDRNOTAVAIL is
 * returned. When the existing window already covers the range, a
 * pointer into it is returned directly. Otherwise the old window is
 * unmapped and a new one is created through journal_file_map();
 * requests smaller than DEFAULT_WINDOW_SIZE are widened to that size,
 * and the mapped size is clamped to the end of the file. */
345 static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
354 assert(wt < _WINDOW_MAX);
356 if (offset + size > (uint64_t) f->last_stat.st_size) {
357 /* Hmm, out of range? Let's refresh the fstat() data
358 * first, before we trust that check. */
360 if (fstat(f->fd, &f->last_stat) < 0 ||
361 offset + size > (uint64_t) f->last_stat.st_size)
362 return -EADDRNOTAVAIL;
367 if (_likely_(w->ptr &&
368 w->offset <= offset &&
369 w->offset + w->size >= offset + size)) {
371 *ret = (uint8_t*) w->ptr + (offset - w->offset);
376 if (munmap(w->ptr, w->size) < 0)
380 w->size = w->offset = 0;
383 if (size < DEFAULT_WINDOW_SIZE) {
384 /* If the default window size is larger than what was
385 * asked for extend the mapping a bit in the hope to
386 * minimize needed remappings later on. We add half
387 * the window space before and half behind the
388 * requested mapping */
390 delta = (DEFAULT_WINDOW_SIZE - size) / 2;
396 size = DEFAULT_WINDOW_SIZE;
400 if (offset + size > (uint64_t) f->last_stat.st_size)
401 size = (uint64_t) f->last_stat.st_size - offset;
404 return -EADDRNOTAVAIL;
406 r = journal_file_map(f,
408 &w->ptr, &w->offset, &w->size,
414 *ret = (uint8_t*) p + delta;
/* Hash check for an object. For DATA objects that are not flagged
 * OBJECT_COMPRESSED, and for FIELD objects, the hash stored in the
 * object (h1) is read and a hash over the payload bytes is recomputed
 * with hash64() (h2); the payload length is the object size minus the
 * offset of the payload member. Presumably the result reports whether
 * h1 and h2 agree (TODO confirm). */
418 static bool verify_hash(Object *o) {
423 if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
424 h1 = le64toh(o->data.hash);
425 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
426 } else if (o->object.type == OBJECT_FIELD) {
427 h1 = le64toh(o->field.hash);
428 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
/* Maps the object located at 'offset'. First only an ObjectHeader is
 * mapped (using WINDOW_UNKNOWN as window when type is negative) so the
 * object's size can be read. Sizes smaller than an ObjectHeader are
 * checked for, as is a mismatch between the stored type and a
 * non-negative requested 'type'. If the object is larger than its
 * header, the full object is mapped again through the window that
 * belongs to the object's own type. */
435 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
443 assert(type < _OBJECT_TYPE_MAX);
445 r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
450 s = le64toh(o->object.size);
452 if (s < sizeof(ObjectHeader))
455 if (type >= 0 && o->object.type != type)
458 if (s > sizeof(ObjectHeader)) {
459 r = journal_file_move_to(f, o->object.type, offset, s, &t);
/* Picks the sequence number for the next entry: the header's
 * tail_seqnum plus one, reconciled with an optional external counter
 * as described in the comment below. The result is stored as the new
 * tail_seqnum and, if head_seqnum is still 0, as head_seqnum too. */
473 static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
478 r = le64toh(f->header->tail_seqnum) + 1;
481 /* If an external seqnum counter was passed, we update
482 * both the local and the external one, and set it to
483 * the maximum of both */
491 f->header->tail_seqnum = htole64(r);
493 if (f->header->head_seqnum == 0)
494 f->header->head_seqnum = htole64(r);
/* Appends a new object of the given type and size at the end of the
 * file. The position is derived from the header: either header_size,
 * or the offset of the current tail object advanced by that object's
 * 8-byte-aligned size. Space is ensured with journal_file_allocate(),
 * the range is mapped, the object's type and size fields are filled
 * in, and the header's tail_object_offset and n_objects are updated.
 * 'size' must be at least sizeof(ObjectHeader). */
499 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
506 assert(size >= sizeof(ObjectHeader));
510 p = le64toh(f->header->tail_object_offset);
512 p = le64toh(f->header->header_size);
514 r = journal_file_move_to_object(f, -1, p, &tail);
518 p += ALIGN64(le64toh(tail->object.size));
521 r = journal_file_allocate(f, p, size);
525 r = journal_file_move_to(f, type, p, size, &t);
532 o->object.type = type;
533 o->object.size = htole64(size);
535 f->header->tail_object_offset = htole64(p);
536 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
/* Creates the data hash table object. Its byte size is derived from
 * f->metrics.max_size (see the estimate below) but is never smaller
 * than DEFAULT_DATA_HASH_TABLE_SIZE. The table is appended as an
 * OBJECT_DATA_HASH_TABLE object, its items are zeroed, and the header
 * records the file offset of the items array and the table's size. */
544 static int journal_file_setup_data_hash_table(JournalFile *f) {
551 /* We estimate that we need 1 hash table entry per 768 bytes of
552 journal file and we want to make sure we never get beyond
553 75% fill level. Calculate the hash table size for the
554 maximum file size based on these metrics. */
556 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
557 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
558 s = DEFAULT_DATA_HASH_TABLE_SIZE;
560 log_info("Reserving %llu entries in hash table.", (unsigned long long) (s / sizeof(HashItem)));
562 r = journal_file_append_object(f,
563 OBJECT_DATA_HASH_TABLE,
564 offsetof(Object, hash_table.items) + s,
569 memset(o->hash_table.items, 0, s);
571 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
572 f->header->data_hash_table_size = htole64(s);
/* Creates the field hash table object with the fixed size
 * DEFAULT_FIELD_HASH_TABLE_SIZE: appends an OBJECT_FIELD_HASH_TABLE
 * object, zeroes its items, and records the file offset of the items
 * array and the table's byte size in the header. */
577 static int journal_file_setup_field_hash_table(JournalFile *f) {
584 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
585 r = journal_file_append_object(f,
586 OBJECT_FIELD_HASH_TABLE,
587 offsetof(Object, hash_table.items) + s,
592 memset(o->hash_table.items, 0, s);
594 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
595 f->header->field_hash_table_size = htole64(s);
/* Maps the data hash table, whose offset and size are read from the
 * header, through the WINDOW_DATA_HASH_TABLE window and caches the
 * resulting pointer in f->data_hash_table. */
600 static int journal_file_map_data_hash_table(JournalFile *f) {
607 p = le64toh(f->header->data_hash_table_offset);
608 s = le64toh(f->header->data_hash_table_size);
610 r = journal_file_move_to(f,
611 WINDOW_DATA_HASH_TABLE,
617 f->data_hash_table = t;
/* Maps the field hash table, whose offset and size are read from the
 * header, through the WINDOW_FIELD_HASH_TABLE window and caches the
 * resulting pointer in f->field_hash_table. */
621 static int journal_file_map_field_hash_table(JournalFile *f) {
628 p = le64toh(f->header->field_hash_table_offset);
629 s = le64toh(f->header->field_hash_table_size);
631 r = journal_file_move_to(f,
632 WINDOW_FIELD_HASH_TABLE,
638 f->field_hash_table = t;
/* Hooks the DATA object 'o' (located at 'offset') into the data hash
 * table. The object's chain and entry-list fields are reset, the
 * bucket is chosen as hash modulo the number of HashItem slots, and
 * the object becomes the new tail of that bucket's chain: either it is
 * recorded as the bucket's head, or the previous tail object is mapped
 * and its next_hash_offset is pointed at the new object. The header's
 * n_data counter is incremented when the header is large enough to
 * contain it. Mapping the previous tail may move the window 'o' was
 * seen through. */
642 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
649 assert(o->object.type == OBJECT_DATA);
651 /* This might alter the window we are looking at */
653 o->data.next_hash_offset = o->data.next_field_offset = 0;
654 o->data.entry_offset = o->data.entry_array_offset = 0;
655 o->data.n_entries = 0;
657 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
658 p = le64toh(f->data_hash_table[h].tail_hash_offset);
660 /* Only entry in the hash table is easy */
661 f->data_hash_table[h].head_hash_offset = htole64(offset);
663 /* Move back to the previous data object, to patch in
666 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
670 o->data.next_hash_offset = htole64(offset);
673 f->data_hash_table[h].tail_hash_offset = htole64(offset);
675 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
676 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
/* Searches the data hash table for a DATA object whose payload equals
 * the 'size' bytes at 'data', given the precomputed 'hash'. A header
 * with data_hash_table_size == 0 is checked for up front. The chain of
 * bucket (hash modulo number of slots) is walked via next_hash_offset;
 * for each object the stored hash is compared against 'hash' first.
 * Objects flagged OBJECT_COMPRESSED are decompressed into
 * f->compress_buffer and compared there; uncompressed objects match
 * when their object size equals the expected size and the payload
 * bytes are identical. -EPROTONOSUPPORT is returned on one path
 * (presumably when compressed objects cannot be handled by this
 * build, TODO confirm). */
681 int journal_file_find_data_object_with_hash(
683 const void *data, uint64_t size, uint64_t hash,
684 Object **ret, uint64_t *offset) {
686 uint64_t p, osize, h;
690 assert(data || size == 0);
692 osize = offsetof(Object, data.payload) + size;
694 if (f->header->data_hash_table_size == 0)
697 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
698 p = le64toh(f->data_hash_table[h].head_hash_offset);
703 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
707 if (le64toh(o->data.hash) != hash)
710 if (o->object.flags & OBJECT_COMPRESSED) {
714 l = le64toh(o->object.size);
715 if (l <= offsetof(Object, data.payload))
718 l -= offsetof(Object, data.payload);
720 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
724 memcmp(f->compress_buffer, data, size) == 0) {
735 return -EPROTONOSUPPORT;
738 } else if (le64toh(o->object.size) == osize &&
739 memcmp(o->data.payload, data, size) == 0) {
751 p = le64toh(o->data.next_hash_offset);
/* Convenience wrapper: computes hash64() over the given data and
 * delegates to journal_file_find_data_object_with_hash(). 'data' may
 * only be NULL when 'size' is 0. */
757 int journal_file_find_data_object(
759 const void *data, uint64_t size,
760 Object **ret, uint64_t *offset) {
765 assert(data || size == 0);
767 hash = hash64(data, size);
769 return journal_file_find_data_object_with_hash(f,
/* Stores a data blob as a DATA object. The blob is hashed and looked
 * up first via journal_file_find_data_object_with_hash(). For a new
 * blob, a DATA object sized for the uncompressed payload is appended
 * and the hash stored. Blobs of at least COMPRESSION_SIZE_THRESHOLD
 * bytes may be compressed straight into the payload, in which case
 * the object size is reduced to the compressed length and
 * OBJECT_COMPRESSED is set; otherwise the bytes are copied verbatim.
 * The object is then linked into the data hash table and mapped again,
 * since linking may have moved the window. */
774 static int journal_file_append_data(
776 const void *data, uint64_t size,
777 Object **ret, uint64_t *offset) {
783 bool compressed = false;
786 assert(data || size == 0);
788 hash = hash64(data, size);
790 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
804 osize = offsetof(Object, data.payload) + size;
805 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
809 o->data.hash = htole64(hash);
813 size >= COMPRESSION_SIZE_THRESHOLD) {
816 compressed = compress_blob(data, size, o->data.payload, &rsize);
819 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
820 o->object.flags |= OBJECT_COMPRESSED;
822 log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
827 if (!compressed && size > 0)
828 memcpy(o->data.payload, data, size);
830 r = journal_file_link_data(f, o, p, hash);
834 /* The linking might have altered the window, so let's
835 * refresh our pointer */
836 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* Returns the number of EntryItem slots in an ENTRY object, computed
 * from the object size minus the offset of the items array. The object
 * must be of type OBJECT_ENTRY. */
849 uint64_t journal_file_entry_n_items(Object *o) {
851 assert(o->object.type == OBJECT_ENTRY);
853 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
/* Returns the number of 64-bit slots in an ENTRY_ARRAY object,
 * computed from the object size minus the offset of the items array.
 * The object must be of type OBJECT_ENTRY_ARRAY. */
856 static uint64_t journal_file_entry_array_n_items(Object *o) {
858 assert(o->object.type == OBJECT_ENTRY_ARRAY);
860 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
/* Records the entry offset 'p' at logical index *idx in a chain of
 * ENTRY_ARRAY objects. The chain is followed via
 * next_entry_array_offset; if the index falls inside an existing
 * array, the slot is written and *idx is incremented. Otherwise a new
 * ENTRY_ARRAY object with room for n items is appended, 'p' is stored
 * in it, the preceding array (if any) is mapped again to point its
 * next_entry_array_offset at the new array, and *idx is incremented.
 * *idx is kept in little-endian form. */
863 static int link_entry_into_array(JournalFile *f,
868 uint64_t n = 0, ap = 0, q, i, a, hidx;
877 i = hidx = le64toh(*idx);
880 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
884 n = journal_file_entry_array_n_items(o);
886 o->entry_array.items[i] = htole64(p);
887 *idx = htole64(hidx + 1);
893 a = le64toh(o->entry_array.next_entry_array_offset);
904 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
905 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
910 o->entry_array.items[i] = htole64(p);
915 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
919 o->entry_array.next_entry_array_offset = htole64(q);
922 *idx = htole64(hidx + 1);
/* Variant of link_entry_into_array() for lists whose logical index is
 * shifted by one relative to the array chain: the array is addressed
 * with *idx - 1, and *idx itself is incremented afterwards. */
927 static int link_entry_into_array_plus_one(JournalFile *f,
946 i = htole64(le64toh(*idx) - 1);
947 r = link_entry_into_array(f, first, &i, p);
952 *idx = htole64(le64toh(*idx) + 1);
/* Links an entry into the entry list of the DATA object referenced by
 * item 'i' of entry 'o': reads the item's object_offset, maps that
 * DATA object and adds to its list through
 * link_entry_into_array_plus_one(), using the data object's
 * entry_offset and entry_array_offset fields. */
956 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
963 p = le64toh(o->entry.items[i].object_offset);
967 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
971 return link_entry_into_array_plus_one(f,
972 &o->data.entry_offset,
973 &o->data.entry_array_offset,
/* Makes a freshly written ENTRY object reachable. After a full memory
 * barrier the entry is added to the file-wide entry array (header
 * fields entry_array_offset / n_entries). The header's
 * head_entry_realtime is set if still 0, tail_entry_realtime and
 * tail_entry_monotonic are updated, and the tail monotonic timestamp
 * is flagged valid. Finally the entry is linked into the list of each
 * of its items via journal_file_link_entry_item(). */
978 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
985 assert(o->object.type == OBJECT_ENTRY);
987 __sync_synchronize();
989 /* Link up the entry itself */
990 r = link_entry_into_array(f,
991 &f->header->entry_array_offset,
992 &f->header->n_entries,
997 /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
999 if (f->header->head_entry_realtime == 0)
1000 f->header->head_entry_realtime = o->entry.realtime;
1002 f->header->tail_entry_realtime = o->entry.realtime;
1003 f->header->tail_entry_monotonic = o->entry.monotonic;
1005 f->tail_entry_monotonic_valid = true;
1007 /* Link up the items */
1008 n = journal_file_entry_n_items(o);
1009 for (i = 0; i < n; i++) {
1010 r = journal_file_link_entry_item(f, o, offset, i);
/* Appends an ENTRY object with room for n_items items and fills it in:
 * sequence number from journal_file_seqnum(), a copy of the items,
 * realtime and monotonic timestamps from 'ts', the xor_hash, and the
 * boot ID currently stored in the header. The entry is then linked via
 * journal_file_link_entry(). 'items' may only be NULL when n_items
 * is 0. */
1018 static int journal_file_append_entry_internal(
1020 const dual_timestamp *ts,
1022 const EntryItem items[], unsigned n_items,
1024 Object **ret, uint64_t *offset) {
1031 assert(items || n_items == 0);
1034 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1036 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1040 o->entry.seqnum = htole64(journal_file_seqnum(f, seqnum));
1041 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1042 o->entry.realtime = htole64(ts->realtime);
1043 o->entry.monotonic = htole64(ts->monotonic);
1044 o->entry.xor_hash = htole64(xor_hash);
1045 o->entry.boot_id = f->header->boot_id;
1047 r = journal_file_link_entry(f, o, np);
/* Notifies inotify watchers that the journal changed. Issues a full
 * memory barrier and then ftruncate()s the file to the size cached in
 * f->last_stat; a failure is only logged.
 * NOTE(review): the log message below contains a duplicated word
 * ("to to"); left untouched here because it is a runtime string. */
1060 void journal_file_post_change(JournalFile *f) {
1063 /* inotify() does not receive IN_MODIFY events from file
1064 * accesses done via mmap(). After each access we hence
1065 * trigger IN_MODIFY by truncating the journal file to its
1066 * current size which triggers IN_MODIFY. */
1068 __sync_synchronize();
1070 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1071 log_error("Failed to to truncate file to its own size: %m");
/* Appends one log entry made of n_iovec data fields. A local
 * timestamp (_ts) can be filled in with dual_timestamp_get(),
 * presumably when the caller passes no 'ts' (TODO confirm). If the
 * tail monotonic timestamp is valid, a 'ts' older than it is checked
 * for. journal_file_maybe_append_tag() is called before any data is
 * written. Each iovec element is stored through
 * journal_file_append_data(); its offset and hash are collected in an
 * items array and all hashes are XOR-ed together. The entry itself is
 * written by journal_file_append_entry_internal(), followed by
 * journal_file_post_change().
 * NOTE(review): the items array is alloca()ed with a size
 * proportional to n_iovec; confirm callers bound n_iovec so this
 * cannot exhaust the stack. */
1074 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1078 uint64_t xor_hash = 0;
1079 struct dual_timestamp _ts;
1082 assert(iovec || n_iovec == 0);
1088 dual_timestamp_get(&_ts);
1092 if (f->tail_entry_monotonic_valid &&
1093 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1096 r = journal_file_maybe_append_tag(f, ts->realtime);
1100 /* alloca() can't take 0, hence let's allocate at least one */
1101 items = alloca(sizeof(EntryItem) * MAX(1, n_iovec));
1103 for (i = 0; i < n_iovec; i++) {
1107 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1111 xor_hash ^= le64toh(o->data.hash);
1112 items[i].object_offset = htole64(p);
1113 items[i].hash = o->data.hash;
1116 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1118 journal_file_post_change(f);
/* Looks up the entry stored at a logical index in a chain of
 * ENTRY_ARRAY objects. Arrays are mapped one after another; the item
 * is read from the array that contains the index, otherwise the chain
 * is followed via next_entry_array_offset. A zero array offset or zero
 * entry offset is checked for before the ENTRY object at the found
 * offset is mapped. */
1123 static int generic_array_get(JournalFile *f,
1126 Object **ret, uint64_t *offset) {
1138 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1142 n = journal_file_entry_array_n_items(o);
1144 p = le64toh(o->entry_array.items[i]);
1149 a = le64toh(o->entry_array.next_entry_array_offset);
1152 if (a <= 0 || p <= 0)
1155 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Variant of generic_array_get() for lists that keep one entry
 * ('extra') outside the array chain: that entry is mapped directly
 * (presumably for index 0, TODO confirm), while other indexes are
 * resolved in the array chain at position i-1. */
1168 static int generic_array_get_plus_one(JournalFile *f,
1172 Object **ret, uint64_t *offset) {
1181 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1194 return generic_array_get(f, first, i-1, ret, offset);
/* Searches a chain of ENTRY_ARRAY objects for 'needle'. The callback
 * test_object() classifies an entry offset relative to the needle;
 * a TEST_FOUND result is treated as TEST_RIGHT when searching with
 * DIRECTION_DOWN and as TEST_LEFT otherwise, so the search continues
 * past exact matches in the chosen direction. Within each array one
 * item is probed first (lp) and the array is then narrowed with a
 * left/right midpoint search; otherwise the chain is followed via
 * next_entry_array_offset. With DIRECTION_UP the result may be the
 * item just before the stopping position (subtract_one). The ENTRY
 * object at the resulting offset is mapped, and *idx receives the
 * logical index t + i, reduced by one when subtract_one is set. */
1203 static int generic_array_bisect(JournalFile *f,
1207 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1208 direction_t direction,
1213 uint64_t a, p, t = 0, i = 0, last_p = 0;
1214 bool subtract_one = false;
1215 Object *o, *array = NULL;
1219 assert(test_object);
1223 uint64_t left, right, k, lp;
1225 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1229 k = journal_file_entry_array_n_items(array);
1235 lp = p = le64toh(array->entry_array.items[i]);
1239 r = test_object(f, p, needle);
1243 if (r == TEST_FOUND)
1244 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1246 if (r == TEST_RIGHT) {
1250 if (left == right) {
1251 if (direction == DIRECTION_UP)
1252 subtract_one = true;
1258 assert(left < right);
1260 i = (left + right) / 2;
1261 p = le64toh(array->entry_array.items[i]);
1265 r = test_object(f, p, needle);
1269 if (r == TEST_FOUND)
1270 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1272 if (r == TEST_RIGHT)
1280 if (direction == DIRECTION_UP) {
1282 subtract_one = true;
1293 a = le64toh(array->entry_array.next_entry_array_offset);
1299 if (subtract_one && t == 0 && i == 0)
1302 if (subtract_one && i == 0)
1304 else if (subtract_one)
1305 p = le64toh(array->entry_array.items[i-1]);
1307 p = le64toh(array->entry_array.items[i]);
1309 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1320 *idx = t + i + (subtract_one ? -1 : 0);
/* Variant of generic_array_bisect() for lists that keep one entry
 * ('extra') outside the array chain. The extra entry is tested first
 * with test_object(), with TEST_FOUND folded into TEST_RIGHT or
 * TEST_LEFT by direction as in generic_array_bisect(). The array chain
 * is then bisected with a count of n-1. With DIRECTION_UP the extra
 * entry is remembered as a fallback (step_back) and used when the
 * array search returns 0; in that case the extra ENTRY object is
 * mapped as the result. */
1325 static int generic_array_bisect_plus_one(JournalFile *f,
1330 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1331 direction_t direction,
1337 bool step_back = false;
1341 assert(test_object);
1346 /* This bisects the array in object 'first', but first checks
1348 r = test_object(f, extra, needle);
1352 if (r == TEST_FOUND)
1353 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1355 /* if we are looking with DIRECTION_UP then we need to first
1356 see if in the actual array there is a matching entry, and
1357 return the last one of that. But if there isn't any we need
1358 to return this one. Hence remember this, and return it
1361 step_back = direction == DIRECTION_UP;
1363 if (r == TEST_RIGHT) {
1364 if (direction == DIRECTION_DOWN)
1370 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1372 if (r == 0 && step_back)
1381 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
/* Comparison callback with the signature expected by
 * generic_array_bisect(): orders by the entry offset 'p' itself,
 * compared directly against 'needle'. */
1397 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1403 else if (p < needle)
/* Seeks in the file-wide entry array (entry_array_offset and
 * n_entries from the header) using generic_array_bisect(). */
1409 int journal_file_move_to_entry_by_offset(
1412 direction_t direction,
1416 return generic_array_bisect(f,
1417 le64toh(f->header->entry_array_offset),
1418 le64toh(f->header->n_entries),
/* Comparison callback for the bisection helpers: maps the ENTRY
 * object at offset 'p' and compares its sequence number with
 * 'needle'. */
1426 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1433 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1437 if (le64toh(o->entry.seqnum) == needle)
1439 else if (le64toh(o->entry.seqnum) < needle)
/* Seeks in the file-wide entry array (entry_array_offset and
 * n_entries from the header) using generic_array_bisect(). */
1445 int journal_file_move_to_entry_by_seqnum(
1448 direction_t direction,
1452 return generic_array_bisect(f,
1453 le64toh(f->header->entry_array_offset),
1454 le64toh(f->header->n_entries),
/* Comparison callback for the bisection helpers: maps the ENTRY
 * object at offset 'p' and compares its realtime timestamp with
 * 'needle'. */
1461 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1468 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1472 if (le64toh(o->entry.realtime) == needle)
1474 else if (le64toh(o->entry.realtime) < needle)
/* Seeks in the file-wide entry array by realtime timestamp: bisects
 * the array described by the header's entry_array_offset / n_entries
 * with test_object_realtime() as comparison callback. */
1480 int journal_file_move_to_entry_by_realtime(
1483 direction_t direction,
1487 return generic_array_bisect(f,
1488 le64toh(f->header->entry_array_offset),
1489 le64toh(f->header->n_entries),
1491 test_object_realtime,
/* Comparison callback for the bisection helpers: maps the ENTRY
 * object at offset 'p' and compares its monotonic timestamp with
 * 'needle'. */
1496 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1503 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1507 if (le64toh(o->entry.monotonic) == needle)
1509 else if (le64toh(o->entry.monotonic) < needle)
/* Seeks by monotonic timestamp within one boot. The string
 * "_BOOT_ID=" followed by the textual boot ID is built in a local
 * buffer and looked up as a DATA object; the entry list of that data
 * object (entry_offset, entry_array_offset, n_entries) is then
 * bisected with test_object_monotonic() as comparison callback. */
1515 int journal_file_move_to_entry_by_monotonic(
1519 direction_t direction,
1523 char t[9+32+1] = "_BOOT_ID=";
1529 sd_id128_to_string(boot_id, t + 9);
1530 r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1536 return generic_array_bisect_plus_one(f,
1537 le64toh(o->data.entry_offset),
1538 le64toh(o->data.entry_array_offset),
1539 le64toh(o->data.n_entries),
1541 test_object_monotonic,
/* Steps to a neighbouring entry in the file-wide entry array. 'p' is
 * the offset of the current entry 'o'; 'o' may only be given together
 * with a non-zero 'p'. A start index of 0 (DIRECTION_DOWN) or n-1
 * (otherwise) is chosen on one path; on the other, 'o' must be an
 * ENTRY object and the entry array is bisected to obtain an index,
 * which is then adjusted according to 'direction'. The entry at the
 * resulting index is fetched with generic_array_get(). */
1546 int journal_file_next_entry(
1548 Object *o, uint64_t p,
1549 direction_t direction,
1550 Object **ret, uint64_t *offset) {
1556 assert(p > 0 || !o);
1558 n = le64toh(f->header->n_entries);
1563 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1565 if (o->object.type != OBJECT_ENTRY)
1568 r = generic_array_bisect(f,
1569 le64toh(f->header->entry_array_offset),
1570 le64toh(f->header->n_entries),
1579 if (direction == DIRECTION_DOWN) {
1592 /* And jump to it */
1593 return generic_array_get(f,
1594 le64toh(f->header->entry_array_offset),
/* Moves a number of entries ('skip') away from the entry 'o' in the
 * file-wide entry array. 'o' must be an ENTRY object; its index is
 * found by bisecting the entry array. A negative skip moves the index
 * backwards (with a separate branch for when the step would reach or
 * pass the start), a positive skip moves it forwards. The entry at
 * the new index is fetched with generic_array_get(). */
1599 int journal_file_skip_entry(
1601 Object *o, uint64_t p,
1603 Object **ret, uint64_t *offset) {
1612 if (o->object.type != OBJECT_ENTRY)
1615 r = generic_array_bisect(f,
1616 le64toh(f->header->entry_array_offset),
1617 le64toh(f->header->n_entries),
1626 /* Calculate new index */
1628 if ((uint64_t) -skip >= i)
1631 i = i - (uint64_t) -skip;
1633 i += (uint64_t) skip;
1635 n = le64toh(f->header->n_entries);
1642 return generic_array_get(f,
1643 le64toh(f->header->entry_array_offset),
/* Like journal_file_next_entry(), but restricted to the entries that
 * reference the DATA object at 'data_offset'. The data object is
 * mapped and its n_entries read. A start index of 0 (DIRECTION_DOWN)
 * or n-1 (otherwise) is chosen on one path; on the other, 'o' must be
 * an ENTRY object and the data object's entry list is bisected to
 * obtain an index, which is adjusted according to 'direction'. The
 * entry is fetched with generic_array_get_plus_one(). */
1648 int journal_file_next_entry_for_data(
1650 Object *o, uint64_t p,
1651 uint64_t data_offset,
1652 direction_t direction,
1653 Object **ret, uint64_t *offset) {
1660 assert(p > 0 || !o);
1662 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1666 n = le64toh(d->data.n_entries);
1671 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1673 if (o->object.type != OBJECT_ENTRY)
1676 r = generic_array_bisect_plus_one(f,
1677 le64toh(d->data.entry_offset),
1678 le64toh(d->data.entry_array_offset),
1679 le64toh(d->data.n_entries),
1689 if (direction == DIRECTION_DOWN) {
1703 return generic_array_get_plus_one(f,
1704 le64toh(d->data.entry_offset),
1705 le64toh(d->data.entry_array_offset),
/* Seeks within the entry list of the DATA object at 'data_offset':
 * maps the data object and bisects its entry list (entry_offset,
 * entry_array_offset, n_entries) with
 * generic_array_bisect_plus_one(). */
1710 int journal_file_move_to_entry_by_offset_for_data(
1712 uint64_t data_offset,
1714 direction_t direction,
1715 Object **ret, uint64_t *offset) {
1722 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1726 return generic_array_bisect_plus_one(f,
1727 le64toh(d->data.entry_offset),
1728 le64toh(d->data.entry_array_offset),
1729 le64toh(d->data.n_entries),
/* Seeks by monotonic timestamp of a given boot, restricted to entries
 * that reference the DATA object at 'data_offset'. First the
 * "_BOOT_ID=<id>" data object is looked up (its offset is kept in
 * 'b') and its entry list is bisected with test_object_monotonic().
 * The search then alternates between bisecting the entry list of the
 * data object at 'data_offset' and that of the boot ID data object,
 * mapping each data object again before use, until an entry present
 * in both lists is found (see comment below). */
1736 int journal_file_move_to_entry_by_monotonic_for_data(
1738 uint64_t data_offset,
1741 direction_t direction,
1742 Object **ret, uint64_t *offset) {
1744 char t[9+32+1] = "_BOOT_ID=";
1751 /* First, seek by time */
1752 sd_id128_to_string(boot_id, t + 9);
1753 r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
1759 r = generic_array_bisect_plus_one(f,
1760 le64toh(o->data.entry_offset),
1761 le64toh(o->data.entry_array_offset),
1762 le64toh(o->data.n_entries),
1764 test_object_monotonic,
1770 /* And now, continue seeking until we find an entry that
1771 * exists in both bisection arrays */
1777 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1781 r = generic_array_bisect_plus_one(f,
1782 le64toh(d->data.entry_offset),
1783 le64toh(d->data.entry_array_offset),
1784 le64toh(d->data.n_entries),
1792 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
1796 r = generic_array_bisect_plus_one(f,
1797 le64toh(o->data.entry_offset),
1798 le64toh(o->data.entry_array_offset),
1799 le64toh(o->data.n_entries),
/* Seeks within the entry list of the DATA object at 'data_offset':
 * maps the data object and bisects its entry list (entry_offset,
 * entry_array_offset, n_entries) with
 * generic_array_bisect_plus_one(). */
1823 int journal_file_move_to_entry_by_seqnum_for_data(
1825 uint64_t data_offset,
1827 direction_t direction,
1828 Object **ret, uint64_t *offset) {
1835 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1839 return generic_array_bisect_plus_one(f,
1840 le64toh(d->data.entry_offset),
1841 le64toh(d->data.entry_array_offset),
1842 le64toh(d->data.n_entries),
/* Seeks by realtime timestamp within the entry list of the DATA
 * object at 'data_offset': maps the data object and bisects its entry
 * list (entry_offset, entry_array_offset, n_entries) with
 * test_object_realtime() as comparison callback. */
1849 int journal_file_move_to_entry_by_realtime_for_data(
1851 uint64_t data_offset,
1853 direction_t direction,
1854 Object **ret, uint64_t *offset) {
1861 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1865 return generic_array_bisect_plus_one(f,
1866 le64toh(d->data.entry_offset),
1867 le64toh(d->data.entry_array_offset),
1868 le64toh(d->data.n_entries),
1870 test_object_realtime,
/* Returns a pointer to the FSPRG state, which is located header_size
 * bytes into the mapped FSPRG file. f->authenticate is checked first,
 * and header_size + state_size is checked against f->fsprg_size
 * before the pointer is formed.
 * NOTE(review): header_size + state_size is an unchecked 64-bit
 * addition; confirm the values are validated when the file is
 * loaded. */
1875 static void *fsprg_state(JournalFile *f) {
1879 if (!f->authenticate)
1882 a = le64toh(f->fsprg_header->header_size);
1883 b = le64toh(f->fsprg_header->state_size);
1885 if (a + b > f->fsprg_size)
1888 return (uint8_t*) f->fsprg_header + a;
/* Writes the current HMAC value into the journal as a TAG object.
 * f->authenticate and f->hmac_running are checked first. The current
 * FSPRG epoch is logged, an OBJECT_TAG object is appended, TAG_LENGTH
 * bytes read from the HMAC handle are copied into it, and the HMAC is
 * marked as no longer running. */
1891 static int journal_file_append_tag(JournalFile *f) {
1898 if (!f->authenticate)
1901 if (!f->hmac_running)
1904 log_debug("Writing tag for epoch %llu\n", (unsigned long long) FSPRG_GetEpoch(fsprg_state(f)));
1908 r = journal_file_append_object(f, OBJECT_TAG, sizeof(struct TagObject), &o, &p);
1912 /* Get the HMAC tag and store it in the object */
1913 memcpy(o->tag.tag, gcry_md_read(f->hmac, 0), TAG_LENGTH);
1914 f->hmac_running = false;
/* Starts a new HMAC cycle. f->authenticate and f->hmac_running are
 * checked first. The HMAC handle is reset, a 256-bit key is derived
 * from the current FSPRG state and installed with gcry_md_setkey(),
 * and the HMAC is marked as running.
 * NOTE(review): no clearing of the stack buffer 'key' is visible
 * after gcry_md_setkey(); confirm whether the key copy should be
 * wiped. */
1919 static int journal_file_hmac_start(JournalFile *f) {
1920 uint8_t key[256 / 8]; /* Let's pass 256 bit from FSPRG to HMAC */
1924 if (!f->authenticate)
1927 if (f->hmac_running)
1930 /* Prepare HMAC for next cycle */
1931 gcry_md_reset(f->hmac);
1932 FSPRG_GetKey(fsprg_state(f), key, sizeof(key), 0);
1933 gcry_md_setkey(f->hmac, key, sizeof(key));
1935 f->hmac_running = true;
/* Converts a realtime timestamp into an FSPRG epoch number: the time
 * elapsed since fsprg_start_usec divided by fsprg_interval_usec. A
 * zero start or interval, and a timestamp before the start, are
 * checked for before the division. Requires f->authenticate. */
1940 static int journal_file_get_epoch(JournalFile *f, uint64_t realtime, uint64_t *epoch) {
1945 assert(f->authenticate);
1947 if (le64toh(f->fsprg_header->fsprg_start_usec) == 0 ||
1948 le64toh(f->fsprg_header->fsprg_interval_usec) == 0)
1951 if (realtime < le64toh(f->fsprg_header->fsprg_start_usec))
1954 t = realtime - le64toh(f->fsprg_header->fsprg_start_usec);
1955 t = t / le64toh(f->fsprg_header->fsprg_interval_usec);
/* Tells whether the FSPRG key must be evolved for the given time.
 * After the f->authenticate check, the goal epoch for 'realtime' is
 * computed and compared with the current epoch of the FSPRG state;
 * the result is non-zero when they differ. */
1961 static int journal_file_need_evolve(JournalFile *f, uint64_t realtime) {
1962 uint64_t goal, epoch;
1966 if (!f->authenticate)
1969 r = journal_file_get_epoch(f, realtime, &goal);
1973 epoch = FSPRG_GetEpoch(fsprg_state(f));
1977 return epoch != goal;
/* Advances the FSPRG state towards the epoch that corresponds to
 * 'realtime'. After the f->authenticate check, the goal epoch is
 * computed, the current epoch is read and the transition is logged;
 * FSPRG_Evolve() is then applied and the epoch read again. */
1980 static int journal_file_evolve(JournalFile *f, uint64_t realtime) {
1981 uint64_t goal, epoch;
1986 if (!f->authenticate)
1989 r = journal_file_get_epoch(f, realtime, &goal);
1993 epoch = FSPRG_GetEpoch(fsprg_state(f));
1995 log_debug("Evolving FSPRG key from epoch %llu to %llu.", (unsigned long long) epoch, (unsigned long long) goal);
2003 FSPRG_Evolve(fsprg_state(f));
2004 epoch = FSPRG_GetEpoch(fsprg_state(f));
/* Key rotation step run before an entry is appended. After the
 * f->authenticate check, journal_file_need_evolve() is consulted;
 * the sequence that follows is: append a tag for the current HMAC,
 * evolve the FSPRG key to the epoch of 'realtime', and start a new
 * HMAC cycle. */
2008 static int journal_file_maybe_append_tag(JournalFile *f, uint64_t realtime) {
2013 if (!f->authenticate)
2016 r = journal_file_need_evolve(f, realtime);
2020 r = journal_file_append_tag(f);
2024 r = journal_file_evolve(f, realtime);
2028 r = journal_file_hmac_start(f);
/* Feeds selected parts of the object at offset 'p' into the running
 * HMAC. After the f->authenticate check an HMAC cycle is started if
 * needed and the object is mapped. The object header up to its
 * payload member is always hashed. Depending on the object type the
 * following is added: for data objects, the fields from 'hash' up to
 * (not including) 'entry_array_offset' plus the payload; for entry
 * objects, everything from 'seqnum' to the end of the object; for tag
 * objects, the bytes starting at 'tag'. Hash table and entry array
 * objects contribute nothing beyond the header. */
2035 static int journal_file_hmac_put_object(JournalFile *f, int type, uint64_t p) {
2041 if (!f->authenticate)
2044 r = journal_file_hmac_start(f);
2048 r = journal_file_move_to_object(f, type, p, &o);
2052 gcry_md_write(f->hmac, o, offsetof(ObjectHeader, payload));
2054 switch (o->object.type) {
2057 /* All but: entry_array_offset, n_entries are mutable */
2058 gcry_md_write(f->hmac, &o->data.hash, offsetof(DataObject, entry_array_offset) - offsetof(DataObject, hash));
2059 gcry_md_write(f->hmac, o->data.payload, le64toh(o->object.size) - offsetof(DataObject, payload));
2064 gcry_md_write(f->hmac, &o->entry.seqnum, le64toh(o->object.size) - offsetof(EntryObject, seqnum));
2067 case OBJECT_FIELD_HASH_TABLE:
2068 case OBJECT_DATA_HASH_TABLE:
2069 case OBJECT_ENTRY_ARRAY:
2070 /* Nothing: everything is mutable */
2075 gcry_md_write(f->hmac, o->tag.tag, le64toh(o->object.size) - offsetof(TagObject, tag));
/* Feeds selected ranges of the file header into the running HMAC.
 * After the f->authenticate check an HMAC cycle is started if needed.
 * Five contiguous ranges are hashed, each running from the named
 * start member up to (not including) the named end member:
 * signature..state, file_id..boot_id, seqnum_id..arena_size,
 * data_hash_table_offset..tail_object_offset, and
 * head_seqnum..head_entry_realtime. The fields listed in the comment
 * below are left out. */
2085 static int journal_file_hmac_put_header(JournalFile *f) {
2090 if (!f->authenticate)
2093 r = journal_file_hmac_start(f);
2097 /* All but state+reserved, boot_id, arena_size,
2098 * tail_object_offset, n_objects, n_entries, tail_seqnum,
2099 * head_entry_realtime, tail_entry_realtime,
2100 * tail_entry_monotonic, n_data, n_fields, header_tag */
2102 gcry_md_write(f->hmac, f->header->signature, offsetof(Header, state) - offsetof(Header, signature));
2103 gcry_md_write(f->hmac, &f->header->file_id, offsetof(Header, boot_id) - offsetof(Header, file_id));
2104 gcry_md_write(f->hmac, &f->header->seqnum_id, offsetof(Header, arena_size) - offsetof(Header, seqnum_id));
2105 gcry_md_write(f->hmac, &f->header->data_hash_table_offset, offsetof(Header, tail_object_offset) - offsetof(Header, data_hash_table_offset));
2106 gcry_md_write(f->hmac, &f->header->head_seqnum, offsetof(Header, head_entry_realtime) - offsetof(Header, head_seqnum));
/* Loads the FSPRG key file of the local machine for an authenticated journal
 * file. The file /var/log/journal/<machine-id>/fsprg is opened read-write,
 * its fixed-size header is mapped read-only and validated, and on success the
 * whole file (header plus state, f->fsprg_size bytes) is mapped read-write
 * into f->fsprg_header.
 *
 * Validation covers: file at least as large as FSPRGHeader, signature,
 * no incompatible flags (otherwise -EPROTONOSUPPORT), header_size not smaller
 * than FSPRGHeader, state_size equal to what FSPRG_stateinbytes() reports for
 * secpar, file large enough for header plus state, machine ID equal to the
 * local one, and non-zero start and interval times. */
2111 static int journal_file_load_fsprg(JournalFile *f) {
2115 FSPRGHeader *m = NULL;
2120 if (!f->authenticate)
2123 r = sd_id128_get_machine(&machine);
2127 if (asprintf(&p, "/var/log/journal/" SD_ID128_FORMAT_STR "/fsprg",
2128 SD_ID128_FORMAT_VAL(machine)) < 0)
2131 fd = open(p, O_RDWR|O_CLOEXEC|O_NOCTTY, 0600); /* O_CREAT is not passed, so open() ignores the mode argument */
2133 log_error("Failed to open %s: %m", p);
2138 if (fstat(fd, &st) < 0) {
2143 if (st.st_size < (off_t) sizeof(FSPRGHeader)) {
/* Temporary read-only view of just the header, used for the checks below */
2148 m = mmap(NULL, PAGE_ALIGN(sizeof(FSPRGHeader)), PROT_READ, MAP_SHARED, fd, 0);
2149 if (m == MAP_FAILED) {
2155 if (memcmp(m->signature, FSPRG_HEADER_SIGNATURE, 8) != 0) {
2160 if (m->incompatible_flags != 0) {
2161 r = -EPROTONOSUPPORT;
2165 if (le64toh(m->header_size) < sizeof(FSPRGHeader)) {
/* NOTE(review): secpar is passed on without byte-order conversion while the
 * neighbouring members go through le64toh() -- confirm its on-disk type */
2170 if (le64toh(m->state_size) != FSPRG_stateinbytes(m->secpar)) {
2175 f->fsprg_size = le64toh(m->header_size) + le64toh(m->state_size);
2176 if ((uint64_t) st.st_size < f->fsprg_size) {
2181 if (!sd_id128_equal(machine, m->machine_id)) {
/* le64toh() yields an unsigned value, so "<= 0" can only ever mean "== 0" */
2186 if (le64toh(m->fsprg_start_usec) <= 0 ||
2187 le64toh(m->fsprg_interval_usec) <= 0) {
/* The real mapping: header and state, writable, shared with the file */
2192 f->fsprg_header = mmap(NULL, PAGE_ALIGN(f->fsprg_size), PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
2193 if (f->fsprg_header == MAP_FAILED) {
2194 f->fsprg_header = NULL; /* never leave MAP_FAILED behind in the structure */
/* Drop the temporary header view and the descriptor; the shared mapping in
 * f->fsprg_header does not depend on fd staying open */
2203 munmap(m, PAGE_ALIGN(sizeof(FSPRGHeader)));
2206 close_nointr_nofail(fd);
/* Creates the HMAC context used for sealing: for authenticated files a
 * libgcrypt message digest handle for SHA-256 in HMAC mode is opened into
 * f->hmac. */
2212 static int journal_file_setup_hmac(JournalFile *f) {
2215 if (!f->authenticate)
2218 e = gcry_md_open(&f->hmac, GCRY_MD_SHA256, GCRY_MD_FLAG_HMAC);
/* Computes and appends the first tag of an authenticated file. The header,
 * the field hash table object and the data hash table object are fed into
 * the HMAC in that order, then journal_file_append_tag() is called.
 *
 * The header stores the offsets of the hash table items, not of the objects
 * enclosing them, hence offsetof(Object, hash_table.items) is subtracted to
 * get back to the start of each object. Each offset is compared against that
 * constant first, which keeps the unsigned subtraction from wrapping. */
2225 static int journal_file_append_first_tag(JournalFile *f) {
2229 if (!f->authenticate)
2232 log_debug("Calculating first tag...");
2234 r = journal_file_hmac_put_header(f);
2238 p = le64toh(f->header->field_hash_table_offset);
2239 if (p < offsetof(Object, hash_table.items))
2241 p -= offsetof(Object, hash_table.items);
2243 r = journal_file_hmac_put_object(f, OBJECT_FIELD_HASH_TABLE, p);
2247 p = le64toh(f->header->data_hash_table_offset);
2248 if (p < offsetof(Object, hash_table.items))
2250 p -= offsetof(Object, hash_table.items);
2252 r = journal_file_hmac_put_object(f, OBJECT_DATA_HASH_TABLE, p);
2256 r = journal_file_append_tag(f);
/* Debugging aid: prints the header summary and then one line per object to
 * stdout. The walk starts right behind the header (header_size), maps each
 * object with journal_file_move_to_object(), prints its type (entries also
 * show seqnum, monotonic and realtime) and whether it is compressed, and
 * advances by the object size rounded up to a multiple of 8 bytes. The
 * object at tail_object_offset is the last one visited. "File corrupt" is
 * logged on the error path. */
2263 void journal_file_dump(JournalFile *f) {
2270 journal_file_print_header(f);
2272 p = le64toh(f->header->header_size);
/* NOTE(review): type -1 presumably means "accept any object type" -- confirm */
2274 r = journal_file_move_to_object(f, -1, p, &o);
2278 switch (o->object.type) {
2281 printf("Type: OBJECT_UNUSED\n");
2285 printf("Type: OBJECT_DATA\n");
2289 printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
2290 (unsigned long long) le64toh(o->entry.seqnum),
2291 (unsigned long long) le64toh(o->entry.monotonic),
2292 (unsigned long long) le64toh(o->entry.realtime));
2295 case OBJECT_FIELD_HASH_TABLE:
2296 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2299 case OBJECT_DATA_HASH_TABLE:
2300 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2303 case OBJECT_ENTRY_ARRAY:
2304 printf("Type: OBJECT_ENTRY_ARRAY\n");
2308 printf("Type: OBJECT_TAG\n");
2312 if (o->object.flags & OBJECT_COMPRESSED)
2313 printf("Flags: COMPRESSED\n");
/* The tail object is the last one in the file */
2315 if (p == le64toh(f->header->tail_object_offset))
/* Objects are laid out back to back on 64-bit boundaries */
2318 p = p + ALIGN64(le64toh(o->object.size));
2323 log_error("File corrupt");
/* Prints a human-readable summary of the journal file header to stdout:
 * path, IDs, state, compatible and incompatible flags, header and arena
 * size, hash table sizes (in items), object and entry counts, whether a
 * rotation is suggested, head and tail sequence numbers and the realtime
 * range. Data and field object statistics are appended only if the on-disk
 * header is large enough to contain n_data and n_fields respectively. */
2326 void journal_file_print_header(JournalFile *f) {
/* One buffer per ID: all four formatted IDs are arguments of the same
 * printf() call, so each needs storage of its own. Sharing a buffer between
 * two of them makes both %s conversions print whichever ID was formatted
 * last. */
2327 char a[33], b[33], c[33], d[33];
2328 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
2332 printf("File Path: %s\n"
2336 "Sequential Number ID: %s\n"
2338 "Compatible Flags:%s%s\n"
2339 "Incompatible Flags:%s%s\n"
2340 "Header size: %llu\n"
2341 "Arena size: %llu\n"
2342 "Data Hash Table Size: %llu\n"
2343 "Field Hash Table Size: %llu\n"
2345 "Entry Objects: %llu\n"
2346 "Rotate Suggested: %s\n"
2347 "Head Sequential Number: %llu\n"
2348 "Tail Sequential Number: %llu\n"
2349 "Head Realtime Timestamp: %s\n"
2350 "Tail Realtime Timestamp: %s\n",
2352 sd_id128_to_string(f->header->file_id, a),
2353 sd_id128_to_string(f->header->machine_id, b),
2354 sd_id128_to_string(f->header->boot_id, c),
2355 sd_id128_to_string(f->header->seqnum_id, d),
2356 f->header->state == STATE_OFFLINE ? "offline" :
2357 f->header->state == STATE_ONLINE ? "online" :
2358 f->header->state == STATE_ARCHIVED ? "archived" : "unknown",
2359 (f->header->compatible_flags & HEADER_COMPATIBLE_AUTHENTICATED) ? " AUTHENTICATED" : "",
2360 (f->header->compatible_flags & ~HEADER_COMPATIBLE_AUTHENTICATED) ? " ???" : "",
2361 (f->header->incompatible_flags & HEADER_INCOMPATIBLE_COMPRESSED) ? " COMPRESSED" : "",
2362 (f->header->incompatible_flags & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2363 (unsigned long long) le64toh(f->header->header_size),
2364 (unsigned long long) le64toh(f->header->arena_size),
2365 (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2366 (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2367 (unsigned long long) le64toh(f->header->n_objects),
2368 (unsigned long long) le64toh(f->header->n_entries),
2369 yes_no(journal_file_rotate_suggested(f)),
2370 (unsigned long long) le64toh(f->header->head_seqnum),
2371 (unsigned long long) le64toh(f->header->tail_seqnum),
2372 format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2373 format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)));
/* Fill level = number of objects relative to the number of hash table items
 * (table size in bytes divided by sizeof(HashItem)), as a percentage */
2375 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2376 printf("Data Objects: %llu\n"
2377 "Data Hash Table Fill: %.1f%%\n",
2378 (unsigned long long) le64toh(f->header->n_data),
2379 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2381 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2382 printf("Field Objects: %llu\n"
2383 "Field Hash Table Fill: %.1f%%\n",
2384 (unsigned long long) le64toh(f->header->n_fields),
2385 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
/* Opens a journal file and, if it is empty and opened writable, initialises
 * it as a new journal.
 *
 * Only the access modes O_RDONLY and O_RDWR are accepted and the file name
 * has to end in ".journal". compress and authenticate are stored in the new
 * JournalFile. For a newly created file the FSPRG state is loaded (with
 * authentication being switched off if that does not work) and the header is
 * initialised from template; an existing file has its header verified
 * instead. Metrics are taken from metrics (after journal_default_metrics()
 * filled in defaults) or else copied from template. New files additionally
 * get their field and data hash tables and the first tag; finally both hash
 * tables are mapped. journal_file_close() is used to dispose of the file on
 * the failure path. */
2388 int journal_file_open(
2394 JournalMetrics *metrics,
2395 JournalFile *template,
2396 JournalFile **ret) {
2400 bool newly_created = false;
2404 if ((flags & O_ACCMODE) != O_RDONLY &&
2405 (flags & O_ACCMODE) != O_RDWR)
2408 if (!endswith(fname, ".journal"))
2411 f = new0(JournalFile, 1);
2419 f->prot = prot_from_flags(flags);
2420 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2421 f->compress = compress;
2422 f->authenticate = authenticate;
2424 f->path = strdup(fname);
2430 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2436 if (fstat(f->fd, &f->last_stat) < 0) {
/* A zero-sized file that we may write to is treated as a brand new journal */
2441 if (f->last_stat.st_size == 0 && f->writable) {
2442 newly_created = true;
2444 /* Try to load the FSPRG state, and if we can't, then
2445 * just don't do authentication */
2446 r = journal_file_load_fsprg(f);
2448 f->authenticate = false;
2450 r = journal_file_init_header(f, template);
/* Refresh the cached stat data, the file size has changed by now */
2454 if (fstat(f->fd, &f->last_stat) < 0) {
/* Anything shorter than the smallest header ever written cannot be a journal */
2460 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2465 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2466 if (f->header == MAP_FAILED) {
/* Pre-existing files: check the header before trusting anything in it */
2472 if (!newly_created) {
2473 r = journal_file_verify_header(f);
2477 r = journal_file_load_fsprg(f);
/* Metrics: explicitly passed ones win (with defaults filled in), otherwise
 * inherit from the template file -- presumably the first branch is guarded
 * by a check that metrics was supplied, confirm */
2484 journal_default_metrics(metrics, f->fd);
2485 f->metrics = *metrics;
2486 } else if (template)
2487 f->metrics = template->metrics;
2489 r = journal_file_refresh_header(f);
2493 r = journal_file_setup_hmac(f);
/* New files need their hash tables created and sealed with the first tag */
2498 if (newly_created) {
2499 r = journal_file_setup_field_hash_table(f);
2503 r = journal_file_setup_data_hash_table(f);
2507 r = journal_file_append_first_tag(f);
2512 r = journal_file_map_field_hash_table(f);
2516 r = journal_file_map_data_hash_table(f);
2526 journal_file_close(f);
/* Rotates a writable journal file: the current file is renamed to an archive
 * name, marked STATE_ARCHIVED and closed, and a fresh file is opened at the
 * original path with the old file serving as template.
 *
 * The archive name is the old path without its ".journal" suffix, followed
 * by one separator character, the 32 character sequence number ID,
 * "-<tail_seqnum>-<tail_entry_realtime>" (16 hex digits each) and
 * ".journal". The buffer therefore needs
 * (l - 8) + 1 + 32 + 1 + 16 + 1 + 16 + 8 + 1 bytes, which is what the
 * allocation below adds up to. */
2531 int journal_file_rotate(JournalFile **f, bool compress, bool authenticate) {
2534 JournalFile *old_file, *new_file = NULL;
2542 if (!old_file->writable)
2545 if (!endswith(old_file->path, ".journal"))
2548 l = strlen(old_file->path);
2550 p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
/* Copy the path without the 8 characters of ".journal" */
2554 memcpy(p, old_file->path, l - 8);
/* The ID goes one byte past the copied prefix, leaving room for the separator */
2556 sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2557 snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2558 "-%016llx-%016llx.journal",
2559 (unsigned long long) le64toh((*f)->header->tail_seqnum),
2560 (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
2562 r = rename(old_file->path, p);
2568 old_file->header->state = STATE_ARCHIVED;
/* old_file->path still holds the original name, which is now free again */
2570 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, authenticate, NULL, old_file, &new_file);
2571 journal_file_close(old_file);
/* Wrapper around journal_file_open() that replaces unusable files. If the
 * first attempt ends with one of the error codes listed below, the open was
 * for writing with O_CREAT and the name ends in ".journal", the file is
 * renamed to "<name>@<16 hex digits>-<16 hex digits>.journal~" (the first
 * number being the current realtime clock), a warning is logged and
 * journal_file_open() is called a second and last time with the same
 * arguments. */
2577 int journal_file_open_reliably(
2583 JournalMetrics *metrics,
2584 JournalFile *template,
2585 JournalFile **ret) {
2591 r = journal_file_open(fname, flags, mode, compress, authenticate, metrics, template, ret);
2592 if (r != -EBADMSG && /* corrupted */
2593 r != -ENODATA && /* truncated */
2594 r != -EHOSTDOWN && /* other machine */
2595 r != -EPROTONOSUPPORT && /* incompatible feature */
2596 r != -EBUSY && /* unclean shutdown */
2597 r != -ESHUTDOWN /* already archived */)
/* Replacing the file only makes sense if we are allowed to write and create */
2600 if ((flags & O_ACCMODE) == O_RDONLY)
2603 if (!(flags & O_CREAT))
2606 if (!endswith(fname, ".journal"))
2609 /* The file is corrupted. Rotate it away and try it again (but only once) */
2612 if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2614 (unsigned long long) now(CLOCK_REALTIME),
2618 r = rename(fname, p);
2623 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2625 return journal_file_open(fname, flags, mode, compress, authenticate, metrics, template, ret);
/* One candidate file for journal_directory_vacuum(). Besides seqnum_id the
 * vacuum code fills in filename, usage (disk usage in bytes), seqnum,
 * realtime and have_seqnum (whether seqnum and seqnum_id are valid). */
2628 struct vacuum_info {
2633 sd_id128_t seqnum_id;
/* qsort() comparator for struct vacuum_info that defines the deletion order
 * used by journal_directory_vacuum(). Two entries that both carry a sequence
 * number from the same seqnum_id are ordered by seqnum. Otherwise entries
 * are ordered by realtime; equal realtimes are broken by comparing the raw
 * seqnum_id bytes if both entries have one, and by file name otherwise. */
2639 static int vacuum_compare(const void *_a, const void *_b) {
2640 const struct vacuum_info *a, *b;
/* Sequence numbers are only comparable within the same seqnum_id */
2645 if (a->have_seqnum && b->have_seqnum &&
2646 sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
2647 if (a->seqnum < b->seqnum)
2649 else if (a->seqnum > b->seqnum)
2655 if (a->realtime < b->realtime)
2657 else if (a->realtime > b->realtime)
2659 else if (a->have_seqnum && b->have_seqnum)
2660 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
2662 return strcmp(a->filename, b->filename);
/* Deletes old journal files from directory until the files considered use at
 * most max_use bytes and the file system has at least min_free bytes
 * available.
 *
 * Only regular files with one of two name patterns are considered:
 *   archived:  ...@<32 char seqnum id>-<16 hex seqnum>-<16 hex realtime>.journal
 *   corrupted: ...@<16 hex realtime>-<16 hex number>.journal~
 * For each of them the name, the disk usage (512 bytes per st_blocks unit)
 * and the numbers parsed from the name are recorded. The list is sorted with
 * vacuum_compare() and files are unlinked in that order; both limits are
 * re-evaluated before every deletion. */
2665 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
2668 struct vacuum_info *list = NULL;
2669 unsigned n_list = 0, n_allocated = 0, i;
2677 d = opendir(directory);
2683 struct dirent buf, *de;
2687 unsigned long long seqnum = 0, realtime;
2688 sd_id128_t seqnum_id;
/* NOTE(review): readdir_r() is deprecated in current glibc; readdir() is the
 * recommended replacement */
2691 k = readdir_r(d, &buf, &de);
2700 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
2703 if (!S_ISREG(st.st_mode))
2706 q = strlen(de->d_name);
2708 if (endswith(de->d_name, ".journal")) {
2710 /* Vacuum archived files */
/* Shortest possible archive name: "@" + id + "-" + seqnum + "-" + realtime + ".journal" */
2712 if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
/* The separators sit at fixed distances from the end of the name */
2715 if (de->d_name[q-8-16-1] != '-' ||
2716 de->d_name[q-8-16-1-16-1] != '-' ||
2717 de->d_name[q-8-16-1-16-1-32-1] != '@')
/* Keep a pristine copy of the name, de->d_name is modified below */
2720 p = strdup(de->d_name);
/* Cut the name after the ID so that it can be parsed as a string of its own */
2726 de->d_name[q-8-16-1-16-1] = 0;
2727 if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
2732 if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
2739 } else if (endswith(de->d_name, ".journal~")) {
2740 unsigned long long tmp;
2742 /* Vacuum corrupted files */
2744 if (q < 1 + 16 + 1 + 16 + 8 + 1)
2747 if (de->d_name[q-1-8-16-1] != '-' ||
2748 de->d_name[q-1-8-16-1-16-1] != '@')
2751 p = strdup(de->d_name);
/* Only the first number (realtime) is used, the second is parsed and dropped */
2757 if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
2762 have_seqnum = false;
/* Grow the list geometrically, starting with 8 entries */
2766 if (n_list >= n_allocated) {
2767 struct vacuum_info *j;
2769 n_allocated = MAX(n_allocated * 2U, 8U);
2770 j = realloc(list, n_allocated * sizeof(struct vacuum_info));
2780 list[n_list].filename = p;
2781 list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
2782 list[n_list].seqnum = seqnum;
2783 list[n_list].realtime = realtime;
2784 list[n_list].seqnum_id = seqnum_id;
2785 list[n_list].have_seqnum = have_seqnum;
2787 sum += list[n_list].usage;
2793 qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
2795 for(i = 0; i < n_list; i++) {
/* Query free space anew each round, the previous deletion changed it */
2798 if (fstatvfs(dirfd(d), &ss) < 0) {
/* Both limits satisfied */
2803 if (sum <= max_use &&
2804 (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
2807 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
2808 log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
2809 sum -= list[i].usage;
2810 } else if (errno != ENOENT) /* a file that is already gone is not worth a warning */
2811 log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
2815 for (i = 0; i < n_list; i++)
2816 free(list[i].filename);
/* Copies the entry object o, located at offset p of the file from, into the
 * file to. The monotonic and realtime timestamps are taken over from the
 * original entry. Every data object the entry refers to is looked up in
 * from, checked against the hash stored in the entry item, decompressed if
 * necessary and appended to to with journal_file_append_data(); the offsets
 * and hashes of the resulting objects form the item array of the new entry,
 * which journal_file_append_entry_internal() finally writes. seqnum, ret and
 * offset are handed through to that call. */
2826 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2828 uint64_t q, xor_hash = 0;
2841 ts.monotonic = le64toh(o->entry.monotonic);
2842 ts.realtime = le64toh(o->entry.realtime);
/* Compare against the newest monotonic timestamp already in the destination */
2844 if (to->tail_entry_monotonic_valid &&
2845 ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2848 n = journal_file_entry_n_items(o);
/* NOTE(review): the item count comes from file content and directly sizes a
 * stack allocation */
2849 items = alloca(sizeof(EntryItem) * n);
2851 for (i = 0; i < n; i++) {
2858 q = le64toh(o->entry.items[i].object_offset);
2859 le_hash = o->entry.items[i].hash;
/* From here on o points at the data object, no longer at the entry */
2861 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
/* Both hashes are in file byte order, so they can be compared as they are */
2865 if (le_hash != o->data.hash)
2868 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2871 /* We hit the limit on 32bit machines */
2872 if ((uint64_t) t != l)
2875 if (o->object.flags & OBJECT_COMPRESSED) {
2879 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2882 data = from->compress_buffer;
/* presumably the path taken when compression support is not available -- confirm */
2885 return -EPROTONOSUPPORT;
2888 data = o->data.payload;
2890 r = journal_file_append_data(to, data, l, &u, &h);
/* The entry-wide hash is the XOR of the hashes of all its data objects */
2894 xor_hash ^= le64toh(u->data.hash);
2895 items[i].object_offset = htole64(h);
2896 items[i].hash = u->data.hash;
/* Point o back at the entry, the item list is read through it */
2898 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2903 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
/* Completes and sanitises the size limits in m, using the file system that
 * fd lives on as reference. A member set to (uint64_t) -1 counts as "not
 * configured" and receives a default:
 *
 *   max_use   10% of the file system size, clamped to
 *             DEFAULT_MAX_USE_LOWER..DEFAULT_MAX_USE_UPPER, or
 *             DEFAULT_MAX_USE_LOWER if the size is unknown
 *   max_size  one eighth of max_use, at most DEFAULT_MAX_SIZE_UPPER
 *   min_size  JOURNAL_FILE_SIZE_MIN
 *   keep_free 5% of the file system size, at most DEFAULT_KEEP_FREE_UPPER,
 *             or DEFAULT_KEEP_FREE if the size is unknown
 *
 * Configured values are page-aligned and raised to JOURNAL_FILE_SIZE_MIN
 * (twice that for max_use) where necessary; max_use is kept at no less than
 * two times max_size and max_size at no less than min_size. The resulting
 * values are logged. */
2906 void journal_default_metrics(JournalMetrics *m, int fd) {
2907 uint64_t fs_size = 0;
2909 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
/* fs_size stays 0 ("unknown") if the file system cannot be queried. Both
 * operands are widened before multiplying: the statvfs members may be
 * narrower than 64 bit on 32-bit ABIs, in which case the product would wrap
 * for larger file systems before it is stored. */
2914 if (fstatvfs(fd, &ss) >= 0)
2915 fs_size = (uint64_t) ss.f_frsize * (uint64_t) ss.f_blocks;
2917 if (m->max_use == (uint64_t) -1) {
2920 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2922 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2923 m->max_use = DEFAULT_MAX_USE_UPPER;
2925 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2926 m->max_use = DEFAULT_MAX_USE_LOWER;
2928 m->max_use = DEFAULT_MAX_USE_LOWER;
2930 m->max_use = PAGE_ALIGN(m->max_use);
/* There must be room for at least two files of minimum size */
2932 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2933 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2936 if (m->max_size == (uint64_t) -1) {
2937 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2939 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2940 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2942 m->max_size = PAGE_ALIGN(m->max_size);
2944 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2945 m->max_size = JOURNAL_FILE_SIZE_MIN;
/* Raise max_use rather than lowering max_size when the two conflict */
2947 if (m->max_size*2 > m->max_use)
2948 m->max_use = m->max_size*2;
2950 if (m->min_size == (uint64_t) -1)
2951 m->min_size = JOURNAL_FILE_SIZE_MIN;
2953 m->min_size = PAGE_ALIGN(m->min_size);
2955 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2956 m->min_size = JOURNAL_FILE_SIZE_MIN;
2958 if (m->min_size > m->max_size)
2959 m->max_size = m->min_size;
2962 if (m->keep_free == (uint64_t) -1) {
2965 m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2967 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2968 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2971 m->keep_free = DEFAULT_KEEP_FREE;
2974 log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2975 format_bytes(a, sizeof(a), m->max_use),
2976 format_bytes(b, sizeof(b), m->max_size),
2977 format_bytes(c, sizeof(c), m->min_size),
2978 format_bytes(d, sizeof(d), m->keep_free));
/* Reports the realtime range covered by the file: *from receives the
 * header's head_entry_realtime and *to its tail_entry_realtime, both
 * converted to host byte order. A stored timestamp of 0 is handled
 * separately before the respective value is written. */
2981 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
/* Zero is zero in any byte order, hence no le64toh() for the comparison */
2986 if (f->header->head_entry_realtime == 0)
2989 *from = le64toh(f->header->head_entry_realtime);
2993 if (f->header->tail_entry_realtime == 0)
2996 *to = le64toh(f->header->tail_entry_realtime);
/* Reports the monotonic time range the file covers for one boot. The data
 * object "_BOOT_ID=<boot_id>" is looked up; *from is the monotonic timestamp
 * of the entry at the data object's entry_offset, *to the one of the entry
 * that generic_array_get_plus_one() returns for index n_entries - 1, i.e.
 * the last entry referencing that data object. */
3002 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
/* 9 characters of prefix, 32 characters of ID, terminating NUL */
3003 char t[9+32+1] = "_BOOT_ID=";
3011 sd_id128_to_string(boot_id, t + 9);
3013 r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
/* n_entries is unsigned, so this tests for "no entries at all" */
3017 if (le64toh(o->data.n_entries) <= 0)
3021 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3025 *from = le64toh(o->entry.monotonic);
/* o was redirected to an entry above, get the data object back */
3029 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3033 r = generic_array_get_plus_one(f,
3034 le64toh(o->data.entry_offset),
3035 le64toh(o->data.entry_array_offset),
3036 le64toh(o->data.n_entries)-1,
3041 *to = le64toh(o->entry.monotonic);
/* Heuristic that tells whether the file should be rotated although it is
 * not full. Three conditions are looked at, each of which logs its reason at
 * debug level: the on-disk header is smaller than the current Header
 * structure, the data hash table is filled to more than 75%, or the field
 * hash table is filled to more than 75%. The fill checks are only possible
 * if the header contains n_data and n_fields respectively. */
3047 bool journal_file_rotate_suggested(JournalFile *f) {
3050 /* If we gained new header fields we gained new features,
3051 * hence suggest a rotation */
3052 if (le64toh(f->header->header_size) < sizeof(Header)) {
3053 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3057 /* Let's check if the hash tables grew over a certain fill
3058 * level (75%, borrowing this value from Java's hash table
3059 * implementation), and if so suggest a rotation. To calculate
3060 * the fill level we need the n_data field, which only exists
3061 * in newer versions. */
3063 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
/* n / items > 3/4, written without division: n * 4 > items * 3. A true
 * condition implies n_data > 0, so the division in the log call is safe. */
3064 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3065 log_debug("Data hash table of %s has a fill level at %.1f (%llu of %llu items, %llu file size, %llu bytes per hash table item), suggesting rotation.",
3067 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3068 (unsigned long long) le64toh(f->header->n_data),
3069 (unsigned long long) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)),
3070 (unsigned long long) (f->last_stat.st_size),
3071 (unsigned long long) (f->last_stat.st_size / le64toh(f->header->n_data)));
3075 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3076 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3077 log_debug("Field hash table of %s has a fill level at %.1f (%llu of %llu items), suggesting rotation.",
3079 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3080 (unsigned long long) le64toh(f->header->n_fields),
3081 (unsigned long long) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)));