1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
29 #include <sys/xattr.h>
31 #include "journal-def.h"
32 #include "journal-file.h"
33 #include "journal-authenticate.h"
/* Default byte sizes of the data and field hash tables, each written as a
 * slot count multiplied by sizeof(HashItem). */
38 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
39 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
/* Payload size (in bytes) from which journal_file_append_data() attempts
 * to compress a data object. */
41 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
43 /* This is the minimum journal file size */
44 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
46 /* These are the lower and upper bounds if we deduce the max_use value
47 * from the file system size */
48 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
49 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
51 /* This is the upper bound if we deduce max_size from max_use */
52 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
54 /* This is the upper bound if we deduce the keep_free value from the file system size */
56 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
58 /* This is the keep_free value when we can't determine the system size */
60 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
62 /* n_data was the first entry we added after the initial file format design */
63 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
65 /* How many entries to keep in the entry array chain cache at max */
66 #define CHAIN_CACHE_MAX 20
68 /* How much to increase the journal file size at once each time we allocate something new. */
69 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
/* Switches the journal file to the online state.
 * Visible steps: test that the file descriptor is valid and the header is
 * mapped, dispatch on the current header state, and store STATE_ONLINE
 * into the header. */
71 static int journal_file_set_online(JournalFile *f) {
77 if (!(f->fd >= 0 && f->header))
80 switch(f->header->state) {
85 f->header->state = STATE_ONLINE;
/* Switches the journal file to the offline state.
 * Visible steps: test that the file descriptor is valid and the header is
 * mapped, test whether the header currently says STATE_ONLINE, and store
 * STATE_OFFLINE into the header. */
94 int journal_file_set_offline(JournalFile *f) {
100 if (!(f->fd >= 0 && f->header))
103 if (f->header->state != STATE_ONLINE)
108 f->header->state = STATE_OFFLINE;
/* Tears down a JournalFile: writes a last tag for sealed writable files,
 * drops the fd from the mmap cache, marks the file offline, unmaps the
 * header and releases the auxiliary resources referenced below. */
115 void journal_file_close(JournalFile *f) {
119 /* Write the final tag */
120 if (f->seal && f->writable)
121 journal_file_append_tag(f);
124 /* Sync everything to disk, before we mark the file offline */
125 if (f->mmap && f->fd >= 0)
126 mmap_cache_close_fd(f->mmap, f->fd);
128 journal_file_set_offline(f);
/* The header mapping covers one page-aligned Header */
131 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
137 mmap_cache_unref(f->mmap);
139 ordered_hashmap_free_free(f->chain_cache);
/* The scratch buffer for (de)compression only exists when a compressor is compiled in */
141 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
142 free(f->compress_buffer);
/* Sealing state is either a mapped file (munmap) or a heap buffer (free) */
147 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
148 else if (f->fsprg_state)
149 free(f->fsprg_state);
154 gcry_md_close(f->hmac);
/* Builds a fresh Header in a local variable and writes it to the start of
 * the file with pwrite().
 * f:        file being initialised; its compress_xz/compress_lz4/seal
 *           settings are translated into header flags.
 * template: when used, supplies seqnum_id and tail_entry_seqnum so the
 *           new file continues the template's sequence numbering;
 *           otherwise the new random file_id doubles as seqnum_id. */
160 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
167 memcpy(h.signature, HEADER_SIGNATURE, 8);
168 h.header_size = htole64(ALIGN64(sizeof(h)));
/* NOTE(review): '|=' relies on h having been zeroed beforehand (not visible
 * in this excerpt) -- confirm. */
170 h.incompatible_flags |= htole32(
171 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
172 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
174 h.compatible_flags = htole32(
175 f->seal * HEADER_COMPATIBLE_SEALED);
177 r = sd_id128_randomize(&h.file_id);
182 h.seqnum_id = template->header->seqnum_id;
183 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
185 h.seqnum_id = h.file_id;
/* Write the header at file offset 0 */
187 k = pwrite(f->fd, &h, sizeof(h), 0);
/* Updates the mapped header for the current machine and boot: stores the
 * machine ID, compares the current boot ID against the one recorded in the
 * header, stores the current boot ID and switches the file online. */
197 static int journal_file_refresh_header(JournalFile *f) {
203 r = sd_id128_get_machine(&f->header->machine_id);
207 r = sd_id128_get_boot(&boot_id);
/* Same boot as recorded in the header: the stored tail monotonic timestamp
 * is flagged as usable (checked in journal_file_append_entry()) */
211 if (sd_id128_equal(boot_id, f->header->boot_id))
212 f->tail_entry_monotonic_valid = true;
214 f->header->boot_id = boot_id;
216 journal_file_set_online(f);
218 /* Sync the online state to disk */
/* Validates the mapped header of an existing journal file and caches the
 * compression/sealing settings it declares in the JournalFile.
 * Checks visible here: signature, incompatible/compatible flag sets,
 * state range, minimum header size, header+arena size against the file
 * size, offset alignment, machine ID and file state. Unsupported flags
 * yield -EPROTONOSUPPORT. */
224 static int journal_file_verify_header(JournalFile *f) {
/* memcmp() != 0 means the signature does not match */
229 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
232 /* In both read and write mode we refuse to open files with
233 * incompatible flags we don't know */
234 flags = le32toh(f->header->incompatible_flags);
235 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
/* Bits outside HEADER_INCOMPATIBLE_ANY are entirely unknown to this code */
236 if (flags & ~HEADER_INCOMPATIBLE_ANY)
237 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
238 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
/* Bits that are known but not part of the supported set */
239 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
241 log_debug("Journal file %s uses incompatible flags %"PRIx32
242 " disabled at compilation time.", f->path, flags);
243 return -EPROTONOSUPPORT;
246 /* When open for writing we refuse to open files with
247 * compatible flags, too */
248 flags = le32toh(f->header->compatible_flags);
249 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
250 if (flags & ~HEADER_COMPATIBLE_ANY)
251 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
252 f->path, flags & ~HEADER_COMPATIBLE_ANY);
253 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
255 log_debug("Journal file %s uses compatible flags %"PRIx32
256 " disabled at compilation time.", f->path, flags);
257 return -EPROTONOSUPPORT;
/* State value must be one of the defined states */
260 if (f->header->state >= _STATE_MAX)
263 /* The first addition was n_data, so check that we are at least this large */
264 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
/* A sealed file needs a header that extends at least to n_entry_arrays */
267 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
/* Header plus arena must fit into the file size from the last fstat() */
270 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
/* The tail object must lie within header plus arena */
273 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
/* All object offsets stored in the header must pass VALID64() */
276 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
277 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
278 !VALID64(le64toh(f->header->tail_object_offset)) ||
279 !VALID64(le64toh(f->header->entry_array_offset)))
/* Compare the local machine ID with the one recorded in the header
 * (the enclosing condition is not visible in this excerpt) */
284 sd_id128_t machine_id;
287 r = sd_id128_get_machine(&machine_id);
291 if (!sd_id128_equal(machine_id, f->header->machine_id))
294 state = f->header->state;
296 if (state == STATE_ONLINE) {
297 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
299 } else if (state == STATE_ARCHIVED)
301 else if (state != STATE_OFFLINE) {
302 log_debug("Journal file %s has unknown state %u.", f->path, state);
/* Mirror the header's compression and sealing flags into the JournalFile */
307 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
308 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
310 f->seal = JOURNAL_HEADER_SEALED(f->header);
/* Makes sure the file is large enough to hold `size` bytes at `offset`.
 * The required size is page-aligned and never smaller than the header.
 * If it exceeds the current header+arena size, the function checks it
 * against metrics.max_size and against the free space left after
 * metrics.keep_free, rounds it up to a multiple of FILE_SIZE_INCREASE
 * (capped at max_size), extends the file with posix_fallocate(),
 * refreshes last_stat and updates arena_size in the header. */
315 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
316 uint64_t old_size, new_size;
321 /* We assume that this file is not sparse, and we know that
322 * for sure, since we always call posix_fallocate() ourselves */
/* Current logical size: header plus arena */
326 le64toh(f->header->header_size) +
327 le64toh(f->header->arena_size);
329 new_size = PAGE_ALIGN(offset + size);
330 if (new_size < le64toh(f->header->header_size))
331 new_size = le64toh(f->header->header_size);
/* Nothing to grow if the requested range is already covered */
333 if (new_size <= old_size)
336 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
/* Above min_size, growth must also respect the keep_free reserve */
339 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
342 if (fstatvfs(f->fd, &svfs) >= 0) {
345 available = svfs.f_bfree * svfs.f_bsize;
347 if (available >= f->metrics.keep_free)
348 available -= f->metrics.keep_free;
352 if (new_size - old_size > available)
357 /* Increase by larger blocks at once */
358 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
359 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
360 new_size = f->metrics.max_size;
362 /* Note that the glibc fallocate() fallback is very
363 inefficient, hence we try to minimize the allocation area */
/* NOTE(review): posix_fallocate() reports failure through a positive error
 * number as return value rather than through errno; the handling of r is
 * not visible in this excerpt -- confirm. */
365 r = posix_fallocate(f->fd, old_size, new_size - old_size);
369 if (fstat(f->fd, &f->last_stat) < 0)
372 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
/* Maps an object type to the context index handed to mmap_cache_get() by
 * journal_file_move_to(): types strictly between OBJECT_UNUSED and
 * _OBJECT_TYPE_MAX map to themselves, everything else maps to 0. */
377 static unsigned type_to_context(ObjectType type) {
378 /* One context for each type, plus one catch-all for the rest */
379 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
380 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
/* Obtains a mapping of `size` bytes at file `offset` through the mmap cache
 * and returns its address in *ret.
 * The range is first checked against the cached file size; if it appears
 * to be out of range the size is refreshed with fstat() and checked again,
 * returning -EADDRNOTAVAIL when the range still lies beyond the file. */
383 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
390 /* Avoid SIGBUS on invalid accesses */
/* NOTE(review): offset + size is a uint64_t sum and could wrap for very
 * large values read from a corrupt file, unless a line not visible here
 * guards against it -- confirm. */
391 if (offset + size > (uint64_t) f->last_stat.st_size) {
392 /* Hmm, out of range? Let's refresh the fstat() data
393 * first, before we trust that check. */
395 if (fstat(f->fd, &f->last_stat) < 0 ||
396 offset + size > (uint64_t) f->last_stat.st_size)
397 return -EADDRNOTAVAIL;
400 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
/* Returns the smallest size an object of o's type may have: the size of
 * the type-specific object structure from the table below, or
 * sizeof(ObjectHeader) for types that are outside the table or have no
 * table entry. */
403 static uint64_t minimum_header_size(Object *o) {
405 static const uint64_t table[] = {
406 [OBJECT_DATA] = sizeof(DataObject),
407 [OBJECT_FIELD] = sizeof(FieldObject),
408 [OBJECT_ENTRY] = sizeof(EntryObject),
409 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
410 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
411 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
412 [OBJECT_TAG] = sizeof(TagObject),
/* Unlisted slots of the designated initializer are zero, hence the <= 0 test */
415 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
416 return sizeof(ObjectHeader);
418 return table[o->object.type];
/* Maps the object located at `offset` and validates its header.
 * type:   expected object type; values <= OBJECT_UNUSED disable the type
 *         comparison.
 * offset: file offset of the object, must pass VALID64().
 * ret:    receives the mapped object (the assignment itself is not
 *         visible in this excerpt).
 * The object header is mapped first; once the size and type pass the
 * checks below, the full object is mapped if it is larger than the bare
 * header. */
421 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
430 /* Objects may only be located at multiple of 64 bit */
431 if (!VALID64(offset))
434 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
439 s = le64toh(o->object.size);
/* Size sanity: at least a bare header ... */
441 if (s < sizeof(ObjectHeader))
444 if (o->object.type <= OBJECT_UNUSED)
/* ... and at least the fixed part of the type-specific structure */
447 if (s < minimum_header_size(o))
450 if (type > OBJECT_UNUSED && o->object.type != type)
/* Re-map with the real size so the whole object is accessible */
453 if (s > sizeof(ObjectHeader)) {
454 r = journal_file_move_to(f, type, false, offset, s, &t);
/* Determines the sequence number for the next entry: starts from the
 * header's tail_entry_seqnum plus one, takes an optional external counter
 * into account (see the comment below), records the result as the new
 * tail_entry_seqnum and, if the header has no head_entry_seqnum yet, as
 * the head as well. */
465 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
470 r = le64toh(f->header->tail_entry_seqnum) + 1;
473 /* If an external seqnum counter was passed, we update
474 * both the local and the external one, and set it to
475 * the maximum of both */
483 f->header->tail_entry_seqnum = htole64(r);
/* Zero is the same in either byte order, so no conversion is needed here */
485 if (f->header->head_entry_seqnum == 0)
486 f->header->head_entry_seqnum = htole64(r);
/* Appends a new object of `type` and `size` bytes at the end of the file.
 * The file is switched online, the position after the current tail object
 * is computed, space is allocated and mapped there, the object header is
 * initialised, and tail_object_offset/n_objects in the file header are
 * updated.
 * type:   must lie strictly between OBJECT_UNUSED and _OBJECT_TYPE_MAX.
 * size:   total object size, at least sizeof(ObjectHeader).
 * ret/offset: output parameters (their assignment is not visible in this
 *         excerpt). */
491 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
498 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
499 assert(size >= sizeof(ObjectHeader));
503 r = journal_file_set_online(f);
/* New position: either right after the file header, or right after the
 * current tail object rounded up with ALIGN64() */
507 p = le64toh(f->header->tail_object_offset);
509 p = le64toh(f->header->header_size);
511 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
515 p += ALIGN64(le64toh(tail->object.size));
518 r = journal_file_allocate(f, p, size);
522 r = journal_file_move_to(f, type, false, p, size, &t);
529 o->object.type = type;
530 o->object.size = htole64(size);
532 f->header->tail_object_offset = htole64(p);
533 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
/* Creates the data hash table object for a new file. The table size is
 * derived from metrics.max_size, with DEFAULT_DATA_HASH_TABLE_SIZE as
 * lower bound; the items are zeroed and the table's item offset and byte
 * size are recorded in the file header. */
541 static int journal_file_setup_data_hash_table(JournalFile *f) {
548 /* We estimate that we need 1 hash table entry per 768 bytes of
549 journal file and we want to make sure we never get beyond
550 75% fill level. Calculate the hash table size for the
551 maximum file size based on these metrics. */
553 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
554 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
555 s = DEFAULT_DATA_HASH_TABLE_SIZE;
557 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
559 r = journal_file_append_object(f,
560 OBJECT_DATA_HASH_TABLE,
561 offsetof(Object, hash_table.items) + s,
566 memzero(o->hash_table.items, s);
/* The header stores the offset of the item array, not of the object header */
568 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
569 f->header->data_hash_table_size = htole64(s);
/* Creates the field hash table object for a new file with the fixed size
 * DEFAULT_FIELD_HASH_TABLE_SIZE, zeroes its items and records the item
 * array's offset and byte size in the file header. */
574 static int journal_file_setup_field_hash_table(JournalFile *f) {
581 /* We use a fixed size hash table for the fields as this
582 * number should grow very slowly only */
584 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
585 r = journal_file_append_object(f,
586 OBJECT_FIELD_HASH_TABLE,
587 offsetof(Object, hash_table.items) + s,
592 memzero(o->hash_table.items, s);
/* The header stores the offset of the item array, not of the object header */
594 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
595 f->header->field_hash_table_size = htole64(s);
/* Maps the data hash table described by data_hash_table_offset and
 * data_hash_table_size in the file header and stores the mapping in
 * f->data_hash_table. */
600 static int journal_file_map_data_hash_table(JournalFile *f) {
607 p = le64toh(f->header->data_hash_table_offset);
608 s = le64toh(f->header->data_hash_table_size);
610 r = journal_file_move_to(f,
611 OBJECT_DATA_HASH_TABLE,
618 f->data_hash_table = t;
/* Maps the field hash table described by field_hash_table_offset and
 * field_hash_table_size in the file header and stores the mapping in
 * f->field_hash_table. */
622 static int journal_file_map_field_hash_table(JournalFile *f) {
629 p = le64toh(f->header->field_hash_table_offset);
630 s = le64toh(f->header->field_hash_table_size);
632 r = journal_file_move_to(f,
633 OBJECT_FIELD_HASH_TABLE,
640 f->field_hash_table = t;
/* Links a freshly appended field object into the field hash table: resets
 * the object's chain pointers, selects the bucket from the hash, hooks the
 * object up as the bucket's head or behind the bucket's previous tail,
 * makes it the new tail and bumps n_fields if the header has that field. */
644 static int journal_file_link_field(
657 if (o->object.type != OBJECT_FIELD)
660 /* This might alter the window we are looking at */
662 o->field.next_hash_offset = o->field.head_data_offset = 0;
/* NOTE(review): if field_hash_table_size is smaller than sizeof(HashItem)
 * the divisor is 0 and the modulo is undefined; no guard is visible in
 * this excerpt -- confirm. */
664 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
665 p = le64toh(f->field_hash_table[h].tail_hash_offset);
667 f->field_hash_table[h].head_hash_offset = htole64(offset);
/* Otherwise patch the previous tail so that it points at the new object */
669 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
673 o->field.next_hash_offset = htole64(offset);
676 f->field_hash_table[h].tail_hash_offset = htole64(offset);
678 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
679 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
/* Links a freshly appended data object into the data hash table: resets
 * the object's chain pointers and entry bookkeeping, selects the bucket
 * from the hash, hooks the object up as the bucket's head or behind the
 * bucket's previous tail, makes it the new tail and bumps n_data if the
 * header has that field. */
684 static int journal_file_link_data(
697 if (o->object.type != OBJECT_DATA)
700 /* This might alter the window we are looking at */
702 o->data.next_hash_offset = o->data.next_field_offset = 0;
703 o->data.entry_offset = o->data.entry_array_offset = 0;
704 o->data.n_entries = 0;
/* NOTE(review): if data_hash_table_size is smaller than sizeof(HashItem)
 * the divisor is 0 and the modulo is undefined; no guard is visible in
 * this excerpt -- confirm. */
706 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
707 p = le64toh(f->data_hash_table[h].tail_hash_offset);
709 /* Only entry in the hash table is easy */
710 f->data_hash_table[h].head_hash_offset = htole64(offset);
712 /* Move back to the previous data object, to patch in the pointer */
715 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
719 o->data.next_hash_offset = htole64(offset);
722 f->data_hash_table[h].tail_hash_offset = htole64(offset);
724 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
725 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
/* Looks up a field object by name and precomputed hash.
 * field/size: field name bytes, must be non-NULL and non-empty.
 * hash:       hash of the field name, selects the bucket.
 * ret/offset: output parameters for the match (their assignment is not
 *             visible in this excerpt).
 * The bucket's chain is walked via next_hash_offset; an object matches
 * when hash, total object size and payload bytes are all equal. */
730 int journal_file_find_field_object_with_hash(
732 const void *field, uint64_t size, uint64_t hash,
733 Object **ret, uint64_t *offset) {
735 uint64_t p, osize, h;
739 assert(field && size > 0);
/* Object size a matching, uncompressed field object must have */
741 osize = offsetof(Object, field.payload) + size;
743 if (f->header->field_hash_table_size == 0)
/* NOTE(review): the == 0 test above does not cover table sizes between 1
 * and sizeof(HashItem)-1, for which the divisor below is still 0 --
 * confirm whether such headers can reach this point. */
746 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
747 p = le64toh(f->field_hash_table[h].head_hash_offset);
752 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
756 if (le64toh(o->field.hash) == hash &&
757 le64toh(o->object.size) == osize &&
758 memcmp(o->field.payload, field, size) == 0) {
/* Advance to the next object in this bucket's chain */
768 p = le64toh(o->field.next_hash_offset);
/* Convenience wrapper: hashes the field name with hash64() and delegates
 * to journal_file_find_field_object_with_hash().
 * field/size must be non-NULL and non-empty. */
774 int journal_file_find_field_object(
776 const void *field, uint64_t size,
777 Object **ret, uint64_t *offset) {
782 assert(field && size > 0);
784 hash = hash64(field, size);
786 return journal_file_find_field_object_with_hash(f,
/* Looks up a data object by payload and precomputed hash.
 * data/size:  payload bytes; data may be NULL only when size is 0.
 * hash:       hash of the payload, selects the bucket.
 * ret/offset: output parameters for the match (their assignment is not
 *             visible in this excerpt).
 * The bucket's chain is walked via next_hash_offset. Objects with a
 * different hash are not compared further. Compressed objects are
 * decompressed into f->compress_buffer before comparison; when no
 * compressor is compiled in, -EPROTONOSUPPORT is returned for them. */
791 int journal_file_find_data_object_with_hash(
793 const void *data, uint64_t size, uint64_t hash,
794 Object **ret, uint64_t *offset) {
796 uint64_t p, osize, h;
800 assert(data || size == 0);
/* Object size a matching, uncompressed data object must have */
802 osize = offsetof(Object, data.payload) + size;
804 if (f->header->data_hash_table_size == 0)
/* NOTE(review): the == 0 test above does not cover table sizes between 1
 * and sizeof(HashItem)-1, for which the divisor below is still 0 --
 * confirm whether such headers can reach this point. */
807 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
808 p = le64toh(f->data_hash_table[h].head_hash_offset);
813 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
817 if (le64toh(o->data.hash) != hash)
820 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
821 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
/* l becomes the length of the compressed payload; an object without any
 * payload bytes is rejected by the comparison below */
825 l = le64toh(o->object.size);
826 if (l <= offsetof(Object, data.payload))
829 l -= offsetof(Object, data.payload);
831 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
832 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
837 memcmp(f->compress_buffer, data, size) == 0) {
848 return -EPROTONOSUPPORT;
850 } else if (le64toh(o->object.size) == osize &&
851 memcmp(o->data.payload, data, size) == 0) {
/* Advance to the next object in this bucket's chain */
863 p = le64toh(o->data.next_hash_offset);
/* Convenience wrapper: hashes the payload with hash64() and delegates to
 * journal_file_find_data_object_with_hash().
 * data may be NULL only when size is 0. */
869 int journal_file_find_data_object(
871 const void *data, uint64_t size,
872 Object **ret, uint64_t *offset) {
877 assert(data || size == 0);
879 hash = hash64(data, size);
881 return journal_file_find_data_object_with_hash(f,
/* Returns the field object for the given field name, creating it if
 * needed. An existing object is looked up by hash first; otherwise a new
 * OBJECT_FIELD is appended, filled with hash and name, linked into the
 * field hash table, re-mapped and fed to the HMAC.
 * field/size must be non-NULL and non-empty. */
886 static int journal_file_append_field(
888 const void *field, uint64_t size,
889 Object **ret, uint64_t *offset) {
897 assert(field && size > 0);
899 hash = hash64(field, size);
901 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
915 osize = offsetof(Object, field.payload) + size;
916 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
920 o->field.hash = htole64(hash);
921 memcpy(o->field.payload, field, size);
923 r = journal_file_link_field(f, o, p, hash);
927 /* The linking might have altered the window, so let's
928 * refresh our pointer */
929 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
934 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
/* Returns the data object for the given payload, creating it if needed.
 * An existing object is looked up by hash first; otherwise a new
 * OBJECT_DATA is appended, optionally compressed, linked into the data
 * hash table and re-mapped. If the payload contains '=' after at least
 * one byte, the part before it is registered as a field object and the
 * data object is put at the head of that field's data list. Finally the
 * object is fed to the HMAC.
 * data may be NULL only when size is 0. */
948 static int journal_file_append_data(
950 const void *data, uint64_t size,
951 Object **ret, uint64_t *offset) {
956 int r, compression = 0;
960 assert(data || size == 0);
962 hash = hash64(data, size);
964 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
978 osize = offsetof(Object, data.payload) + size;
979 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
983 o->data.hash = htole64(hash);
985 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
/* NOTE(review): only compress_xz is tested here although the JournalFile
 * also tracks compress_lz4; confirm that files configured for LZ4 only
 * are meant to skip compression. */
986 if (f->compress_xz &&
987 size >= COMPRESSION_SIZE_THRESHOLD) {
990 compression = compress_blob(data, size, o->data.payload, &rsize);
/* Shrink the recorded object size to the compressed length and flag the
 * compression type in the object header */
993 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
994 o->object.flags |= compression;
996 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
997 size, rsize, object_compressed_to_string(compression));
/* Uncompressed path: copy the payload verbatim */
1002 if (!compression && size > 0)
1003 memcpy(o->data.payload, data, size);
1005 r = journal_file_link_data(f, o, p, hash);
1009 /* The linking might have altered the window, so let's
1010 * refresh our pointer */
1011 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* A field name exists only if '=' is present and not the first byte */
1018 eq = memchr(data, '=', size);
1019 if (eq && eq > data) {
1023 /* Create field object ... */
1024 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1028 /* ... and link it in. */
1029 o->data.next_field_offset = fo->field.head_data_offset;
/* NOTE(review): this stores into an on-disk field, so htole64() would be
 * the matching conversion (as used for the other stores in this file);
 * le64toh() presumably produces the same bytes -- confirm. */
1030 fo->field.head_data_offset = le64toh(p);
1034 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
/* Number of EntryItem slots in an entry object, derived from the object
 * size minus the fixed part before entry.items. Objects of any other
 * type are handled separately by the type check. */
1048 uint64_t journal_file_entry_n_items(Object *o) {
1051 if (o->object.type != OBJECT_ENTRY)
1054 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
/* Number of 64-bit offset slots in an entry array object, derived from
 * the object size minus the fixed part before entry_array.items. Objects
 * of any other type are handled separately by the type check. */
1057 uint64_t journal_file_entry_array_n_items(Object *o) {
1060 if (o->object.type != OBJECT_ENTRY_ARRAY)
1063 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
/* Number of HashItem buckets in a data or field hash table object,
 * derived from the object size minus the fixed part before
 * hash_table.items. Other object types are handled separately by the
 * type check. */
1066 uint64_t journal_file_hash_table_n_items(Object *o) {
1069 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1070 o->object.type != OBJECT_FIELD_HASH_TABLE)
1073 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
/* Stores entry offset p at logical index *idx of the entry array chain
 * whose first array offset is *first (both kept in on-disk little-endian
 * form). The chain is walked array by array; if a slot exists the offset
 * is stored there and *idx is incremented. Otherwise a new
 * OBJECT_ENTRY_ARRAY is appended, fed to the HMAC, filled, and linked
 * either as the chain's first array or behind the last array visited;
 * n_entry_arrays in the header is bumped when present. */
1076 static int link_entry_into_array(JournalFile *f,
1081 uint64_t n = 0, ap = 0, q, i, a, hidx;
1089 a = le64toh(*first);
/* i is consumed while walking the chain, hidx keeps the original index */
1090 i = hidx = le64toh(*idx);
1093 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1097 n = journal_file_entry_array_n_items(o);
1099 o->entry_array.items[i] = htole64(p);
1100 *idx = htole64(hidx + 1);
1106 a = le64toh(o->entry_array.next_entry_array_offset);
/* No free slot found: append a new array object sized for n items */
1117 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1118 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1124 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1129 o->entry_array.items[i] = htole64(p);
/* Hook the new array in: as chain head, or after the array at offset ap */
1132 *first = htole64(q);
1134 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1138 o->entry_array.next_entry_array_offset = htole64(q);
1141 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1142 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1144 *idx = htole64(hidx + 1);
/* Variant of link_entry_into_array() for lists that keep one entry in a
 * separate `extra` slot in addition to the array chain: p is stored
 * either in *extra or in the chain at index *idx - 1, and *idx (the
 * total count, on-disk little-endian) is incremented. */
1149 static int link_entry_into_array_plus_one(JournalFile *f,
1164 *extra = htole64(p);
/* The chain index is one less than the total count because of the extra slot */
1168 i = htole64(le64toh(*idx) - 1);
1169 r = link_entry_into_array(f, first, &i, p);
1174 *idx = htole64(le64toh(*idx) + 1);
/* Links the entry at `offset` into the entry list of the data object
 * referenced by item i of entry object o. The data object is mapped
 * (re-using o) and its entry_offset/entry_array_offset fields are passed
 * to link_entry_into_array_plus_one(). */
1178 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1185 p = le64toh(o->entry.items[i].object_offset);
1189 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1193 return link_entry_into_array_plus_one(f,
1194 &o->data.entry_offset,
1195 &o->data.entry_array_offset,
/* Makes a freshly written entry object reachable: links it into the
 * file's global entry array chain, updates the head/tail timestamps in
 * the header and then links it into each data object it references.
 * o must be an OBJECT_ENTRY located at `offset`. */
1200 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1208 if (o->object.type != OBJECT_ENTRY)
/* Full memory barrier before the entry is linked in */
1211 __sync_synchronize();
1213 /* Link up the entry itself */
1214 r = link_entry_into_array(f,
1215 &f->header->entry_array_offset,
1216 &f->header->n_entries,
1221 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
/* The entry's timestamps are already little-endian, so they are copied
 * without conversion */
1223 if (f->header->head_entry_realtime == 0)
1224 f->header->head_entry_realtime = o->entry.realtime;
1226 f->header->tail_entry_realtime = o->entry.realtime;
1227 f->header->tail_entry_monotonic = o->entry.monotonic;
1229 f->tail_entry_monotonic_valid = true;
1231 /* Link up the items */
1232 n = journal_file_entry_n_items(o);
1233 for (i = 0; i < n; i++) {
1234 r = journal_file_link_entry_item(f, o, offset, i);
/* Writes a new entry object made of the given, already prepared items.
 * ts:       realtime/monotonic timestamps to record.
 * items/n_items: entry items to copy into the object; items may be NULL
 *           only when n_items is 0.
 * ret/offset: output parameters (their assignment is not visible in this
 *           excerpt).
 * The object is appended, filled (sequence number, items, timestamps,
 * xor_hash, boot ID from the header), fed to the HMAC and linked in. */
1242 static int journal_file_append_entry_internal(
1244 const dual_timestamp *ts,
1246 const EntryItem items[], unsigned n_items,
1248 Object **ret, uint64_t *offset) {
1255 assert(items || n_items == 0);
1258 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1260 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1264 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1265 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1266 o->entry.realtime = htole64(ts->realtime);
1267 o->entry.monotonic = htole64(ts->monotonic);
1268 o->entry.xor_hash = htole64(xor_hash);
1269 o->entry.boot_id = f->header->boot_id;
1272 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1277 r = journal_file_link_entry(f, o, np);
/* Notifies inotify watchers that the file changed: after a memory
 * barrier the file is truncated to the size recorded in last_stat. A
 * failing ftruncate() is only logged. */
1290 void journal_file_post_change(JournalFile *f) {
1293 /* inotify() does not receive IN_MODIFY events from file
1294 * accesses done via mmap(). After each access we hence
1295 * trigger IN_MODIFY by truncating the journal file to its
1296 * current size which triggers IN_MODIFY. */
1298 __sync_synchronize();
1300 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1301 log_error_errno(errno, "Failed to truncate file to its own size: %m");
/* qsort()-style comparator for EntryItem: orders items by their
 * object_offset, compared in host byte order. */
1304 static int entry_item_cmp(const void *_a, const void *_b) {
1305 const EntryItem *a = _a, *b = _b;
1307 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1309 if (le64toh(a->object_offset) > le64toh(b->object_offset))
/* Appends a complete log entry built from an iovec array.
 * ts:      timestamps for the entry; a local timestamp taken with
 *          dual_timestamp_get() serves as substitute.
 * iovec/n_iovec: one payload per field; iovec may be NULL only when
 *          n_iovec is 0.
 * seqnum, ret, offset: passed through to
 *          journal_file_append_entry_internal().
 * Each payload becomes (or re-uses) a data object; the resulting items
 * are sorted by file offset, the entry object is written and
 * journal_file_post_change() is called. */
1314 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1318 uint64_t xor_hash = 0;
1319 struct dual_timestamp _ts;
1322 assert(iovec || n_iovec == 0);
1325 dual_timestamp_get(&_ts);
/* Detect a monotonic timestamp older than the file's last entry (only
 * meaningful while the stored tail timestamp is known to be valid) */
1329 if (f->tail_entry_monotonic_valid &&
1330 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1334 r = journal_file_maybe_append_tag(f, ts->realtime);
1339 /* alloca() can't take 0, hence let's allocate at least one */
1340 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1342 for (i = 0; i < n_iovec; i++) {
1346 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
/* xor_hash accumulates the XOR of all data hashes of this entry */
1350 xor_hash ^= le64toh(o->data.hash);
1351 items[i].object_offset = htole64(p);
1352 items[i].hash = o->data.hash;
1355 /* Order by the position on disk, in order to improve seek
1356 * times for rotating media. */
1357 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1359 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1361 journal_file_post_change(f);
/* One cached position inside an entry array chain. Items are stored in
 * f->chain_cache keyed by `first` and let generic_array_get() and
 * generic_array_bisect() resume in a later array of the chain. */
1366 typedef struct ChainCacheItem {
1367 uint64_t first; /* the array at the beginning of the chain */
1368 uint64_t array; /* the cached array */
1369 uint64_t begin; /* the first item in the cached array */
1370 uint64_t total; /* the total number of items in all arrays before this one in the chain */
1371 uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
/* Records a chain position in the chain cache hashmap h. When the map
 * already holds CHAIN_CACHE_MAX items, its first item is taken out with
 * ordered_hashmap_steal_first(); otherwise a new item is allocated and
 * inserted under key `first`. An already cached item must belong to the
 * same chain. */
1374 static void chain_cache_put(
1381 uint64_t last_index) {
1384 /* If the chain item to cache for this chain is the
1385 * first one it's not worth caching anything */
1389 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1390 ci = ordered_hashmap_steal_first(h);
1393 ci = new(ChainCacheItem, 1);
1400 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1405 assert(ci->first == first);
1410 ci->last_index = last_index;
/* Fetches the entry stored at logical index i of the entry array chain
 * starting at `first`. The chain cache is consulted first so the walk
 * can start at a later array; then arrays are followed via
 * next_entry_array_offset until the one holding index i is reached. The
 * position is cached and the entry object at the stored offset is
 * mapped. */
1413 static int generic_array_get(
1417 Object **ret, uint64_t *offset) {
1420 uint64_t p = 0, a, t = 0;
1428 /* Try the chain cache first */
1429 ci = ordered_hashmap_get(f->chain_cache, &first);
/* The cached position is only usable when i lies beyond the items that
 * precede the cached array */
1430 if (ci && i > ci->total) {
1439 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1443 k = journal_file_entry_array_n_items(o);
1445 p = le64toh(o->entry_array.items[i]);
1451 a = le64toh(o->entry_array.next_entry_array_offset);
1457 /* Let's cache this item for the next invocation */
1458 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1460 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Variant of generic_array_get() for lists that keep one entry in a
 * separate `extra` slot in addition to the array chain: the entry at
 * `extra` is mapped directly, all other indexes are forwarded to the
 * chain shifted down by one. */
1473 static int generic_array_get_plus_one(
1478 Object **ret, uint64_t *offset) {
1487 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1500 return generic_array_get(f, first, i-1, ret, offset);
/* Searches the entry array chain starting at `first` for `needle`.
 * test_object: callback classifying the entry at offset p relative to
 *              needle (TEST_FOUND, TEST_LEFT, TEST_RIGHT, or an error).
 * direction:   decides how TEST_FOUND is treated (as TEST_RIGHT for
 *              DIRECTION_DOWN, as TEST_LEFT otherwise) and whether the
 *              result steps back by one (DIRECTION_UP).
 * The chain cache may allow starting at a later array and seeds
 * last_index, which is probed with its immediate neighbours before
 * regular bisection between `left` and `right`. The final position is
 * stored in the chain cache, the matching entry object is mapped and its
 * logical index written to *idx. */
1509 static int generic_array_bisect(
1514 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1515 direction_t direction,
1520 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1521 bool subtract_one = false;
1522 Object *o, *array = NULL;
1527 assert(test_object);
1529 /* Start with the first array in the chain */
1532 ci = ordered_hashmap_get(f->chain_cache, &first);
1533 if (ci && n > ci->total) {
1534 /* Ah, we have iterated this bisection array chain
1535 * previously! Let's see if we can skip ahead in the
1536 * chain, as far as the last time. But we can't jump
1537 * backwards in the chain, so let's check that first. */
1540 r = test_object(f, ci->begin, needle);
1544 if (r == TEST_LEFT) {
1545 /* OK, what we are looking for is right of the
1546 * begin of this EntryArray, so let's jump
1547 * straight to previously cached array in the chain */
1553 last_index = ci->last_index;
1558 uint64_t left, right, k, lp;
1560 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1564 k = journal_file_entry_array_n_items(array);
1570 lp = p = le64toh(array->entry_array.items[i]);
1574 r = test_object(f, p, needle);
1578 if (r == TEST_FOUND)
1579 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1581 if (r == TEST_RIGHT) {
1585 if (last_index != (uint64_t) -1) {
1586 assert(last_index <= right);
1588 /* If we cached the last index we
1589 * looked at, let's try to not to jump
1590 * too wildly around and see if we can
1591 * limit the range to look at early to
1592 * the immediate neighbors of the last
1593 * index we looked at. */
1595 if (last_index > 0) {
1596 uint64_t x = last_index - 1;
1598 p = le64toh(array->entry_array.items[x]);
1602 r = test_object(f, p, needle);
1606 if (r == TEST_FOUND)
1607 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1609 if (r == TEST_RIGHT)
1615 if (last_index < right) {
1616 uint64_t y = last_index + 1;
1618 p = le64toh(array->entry_array.items[y]);
1622 r = test_object(f, p, needle);
1626 if (r == TEST_FOUND)
1627 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1629 if (r == TEST_RIGHT)
1637 if (left == right) {
1638 if (direction == DIRECTION_UP)
1639 subtract_one = true;
1645 assert(left < right);
/* Probe the midpoint of the remaining range */
1646 i = (left + right) / 2;
1648 p = le64toh(array->entry_array.items[i]);
1652 r = test_object(f, p, needle);
1656 if (r == TEST_FOUND)
1657 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1659 if (r == TEST_RIGHT)
1667 if (direction == DIRECTION_UP) {
1669 subtract_one = true;
/* Moving on to the next array: the cached index no longer applies */
1680 last_index = (uint64_t) -1;
1681 a = le64toh(array->entry_array.next_entry_array_offset);
1687 if (subtract_one && t == 0 && i == 0)
1690 /* Let's cache this item for the next invocation */
1691 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1693 if (subtract_one && i == 0)
1695 else if (subtract_one)
1696 p = le64toh(array->entry_array.items[i-1]);
1698 p = le64toh(array->entry_array.items[i]);
1700 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Logical index: items in earlier arrays plus the position in this one */
1711 *idx = t + i + (subtract_one ? -1 : 0);
/* Variant of generic_array_bisect() for lists that keep one entry in a
 * separate `extra` slot in addition to the array chain. The extra entry
 * is tested first; depending on that result and the direction, either
 * the extra entry is returned or the chain (holding n-1 items) is
 * bisected. With DIRECTION_UP the extra entry serves as fallback when
 * the chain search returns 0. */
1717 static int generic_array_bisect_plus_one(
1723 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1724 direction_t direction,
1730 bool step_back = false;
1734 assert(test_object);
1739 /* This bisects the array in object 'first', but first checks an extra */
1741 r = test_object(f, extra, needle);
1745 if (r == TEST_FOUND)
1746 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1748 /* if we are looking with DIRECTION_UP then we need to first
1749 see if in the actual array there is a matching entry, and
1750 return the last one of that. But if there isn't any we need
1751 to return this one. Hence remember this, and return it below. */
1754 step_back = direction == DIRECTION_UP;
1756 if (r == TEST_RIGHT) {
1757 if (direction == DIRECTION_DOWN)
1763 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1765 if (r == 0 && step_back)
1774 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
/* test_object callback comparing the candidate offset p itself with
 * needle; no object is read from the file. */
1790 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1796 else if (p < needle)
/* test_object callback comparing the sequence number of the entry at
 * offset p with needle. The entry object is mapped first. */
1802 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1809 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1813 if (le64toh(o->entry.seqnum) == needle)
1815 else if (le64toh(o->entry.seqnum) < needle)
/* Seeks to an entry by sequence number: bisects the file's global entry
 * array chain (entry_array_offset, n_entries from the header) in the
 * given direction. */
1821 int journal_file_move_to_entry_by_seqnum(
1824 direction_t direction,
1828 return generic_array_bisect(f,
1829 le64toh(f->header->entry_array_offset),
1830 le64toh(f->header->n_entries),
/* test_object callback comparing the realtime timestamp of the entry at
 * offset p with needle. The entry object is mapped first. */
1837 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1844 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1848 if (le64toh(o->entry.realtime) == needle)
1850 else if (le64toh(o->entry.realtime) < needle)
/* Seeks to an entry by realtime timestamp: bisects the file's global
 * entry array chain (entry_array_offset, n_entries from the header) with
 * test_object_realtime() in the given direction. */
1856 int journal_file_move_to_entry_by_realtime(
1859 direction_t direction,
1863 return generic_array_bisect(f,
1864 le64toh(f->header->entry_array_offset),
1865 le64toh(f->header->n_entries),
1867 test_object_realtime,
/* test_object callback comparing the monotonic timestamp of the entry at
 * offset p with needle. The entry object is mapped first. */
1872 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1879 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1883 if (le64toh(o->entry.monotonic) == needle)
1885 else if (le64toh(o->entry.monotonic) < needle)
/* Looks up the data object for the field "_BOOT_ID=<id>". The buffer
 * holds the 9-byte prefix, the 32-character ID string written by
 * sd_id128_to_string() and a terminating NUL; the lookup length excludes
 * the NUL. */
1891 static inline int find_data_object_by_boot_id(
1896 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
/* 9 == strlen("_BOOT_ID="): the ID string goes right after the prefix */
1898 sd_id128_to_string(boot_id, t + 9);
1899 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
/* Seeks within the entries of one boot: first resolves the data object for
 * boot_id via find_data_object_by_boot_id(), then bisects that data object's
 * entry list (entry_offset, entry_array_offset, n_entries) with
 * test_object_monotonic() as the ordering test. */
1902 int journal_file_move_to_entry_by_monotonic(
1906 direction_t direction,
1915 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1921 return generic_array_bisect_plus_one(f,
1922 le64toh(o->data.entry_offset),
1923 le64toh(o->data.entry_array_offset),
1924 le64toh(o->data.n_entries),
1926 test_object_monotonic,
/* Resets the cached read position of f: the location type becomes
 * LOCATION_HEAD and every cached current_* value (offset, seqnum, realtime,
 * monotonic, boot ID, xor hash) is cleared to zero. */
1931 void journal_file_reset_location(JournalFile *f) {
1932 f->location_type = LOCATION_HEAD;
1933 f->current_offset = 0;
1934 f->current_seqnum = 0;
1935 f->current_realtime = 0;
1936 f->current_monotonic = 0;
1937 zero(f->current_boot_id);
1938 f->current_xor_hash = 0;
/* Caches the position of entry object o (located at offset) in f: stores the
 * direction in last_direction, switches the location type to LOCATION_SEEK and
 * copies the entry's seqnum, realtime, monotonic, boot_id and xor_hash into
 * the current_* fields. The 64-bit integer fields are converted with
 * le64toh(); boot_id is copied as is. */
1941 void journal_file_save_location(JournalFile *f, direction_t direction, Object *o, uint64_t offset) {
1942 f->last_direction = direction;
1943 f->location_type = LOCATION_SEEK;
1944 f->current_offset = offset;
1945 f->current_seqnum = le64toh(o->entry.seqnum);
1946 f->current_realtime = le64toh(o->entry.realtime);
1947 f->current_monotonic = le64toh(o->entry.monotonic);
1948 f->current_boot_id = o->entry.boot_id;
1949 f->current_xor_hash = le64toh(o->entry.xor_hash);
/* Compares the cached positions (current_* fields) of two journal files. Both
 * files must be at LOCATION_SEEK; this is asserted.
 *
 * The checks run in this order:
 *   1. boot ID, monotonic, realtime and xor hash all equal: same entry
 *   2. same seqnum_id in both headers: order by current_seqnum
 *   3. same boot ID: order by current_monotonic
 *   4. order by current_realtime
 *   5. order by current_xor_hash
 *
 * NOTE(review): the result is presumably negative/zero/positive in the usual
 * comparator fashion -- confirm against the return statements. */
1952 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
1955 assert(af->location_type == LOCATION_SEEK);
1956 assert(bf->location_type == LOCATION_SEEK);
1958 /* If contents and timestamps match, these entries are
1959 * identical, even if the seqnum does not match */
1960 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
1961 af->current_monotonic == bf->current_monotonic &&
1962 af->current_realtime == bf->current_realtime &&
1963 af->current_xor_hash == bf->current_xor_hash)
1966 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
1968 /* If this is from the same seqnum source, compare
1970 if (af->current_seqnum < bf->current_seqnum)
1972 if (af->current_seqnum > bf->current_seqnum)
1975 /* Wow! This is weird, different data but the same
1976 * seqnums? Something is borked, but let's make the
1977 * best of it and compare by time. */
1980 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
1982 /* If the boot id matches, compare monotonic time */
1983 if (af->current_monotonic < bf->current_monotonic)
1985 if (af->current_monotonic > bf->current_monotonic)
1989 /* Otherwise, compare UTC time */
1990 if (af->current_realtime < bf->current_realtime)
1992 if (af->current_realtime > bf->current_realtime)
1995 /* Finally, compare by contents */
1996 if (af->current_xor_hash < bf->current_xor_hash)
1998 if (af->current_xor_hash > bf->current_xor_hash)
/* Steps through the file-wide entry array in the given direction.
 *
 * o/p describe the current entry (object and offset); a non-NULL o requires
 * p > 0 (asserted) and must be of type OBJECT_ENTRY. The start index is the
 * first array slot for DIRECTION_DOWN and the last one (n - 1) otherwise; with
 * a current entry the index is found by bisecting the entry array.
 * The target entry is then fetched with generic_array_get(). A fetched offset
 * that does not lie strictly past p in the requested direction is reported as
 * entry array corruption via log_debug(). */
2004 int journal_file_next_entry(
2006 Object *o, uint64_t p,
2007 direction_t direction,
2008 Object **ret, uint64_t *offset) {
2014 assert(p > 0 || !o);
2016 n = le64toh(f->header->n_entries);
2021 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2023 if (o->object.type != OBJECT_ENTRY)
2026 r = generic_array_bisect(f,
2027 le64toh(f->header->entry_array_offset),
2028 le64toh(f->header->n_entries),
2037 if (direction == DIRECTION_DOWN) {
2050 /* And jump to it */
2051 r = generic_array_get(f,
2052 le64toh(f->header->entry_array_offset),
2059 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2060 log_debug("%s: entry array corrupted at entry %"PRIu64,
/* Like journal_file_next_entry(), but iterates only over the entries that
 * reference the data object at data_offset.
 *
 * The data object is mapped first and its n_entries read. The start index is
 * 0 for DIRECTION_DOWN and n - 1 otherwise; with a current entry o (which
 * must be an OBJECT_ENTRY) the index is found by bisecting the data object's
 * entry list. The target entry is returned through
 * generic_array_get_plus_one(). */
2071 int journal_file_next_entry_for_data(
2073 Object *o, uint64_t p,
2074 uint64_t data_offset,
2075 direction_t direction,
2076 Object **ret, uint64_t *offset) {
2083 assert(p > 0 || !o);
2085 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2089 n = le64toh(d->data.n_entries);
2094 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2096 if (o->object.type != OBJECT_ENTRY)
2099 r = generic_array_bisect_plus_one(f,
2100 le64toh(d->data.entry_offset),
2101 le64toh(d->data.entry_array_offset),
2102 le64toh(d->data.n_entries),
2112 if (direction == DIRECTION_DOWN) {
2126 return generic_array_get_plus_one(f,
2127 le64toh(d->data.entry_offset),
2128 le64toh(d->data.entry_array_offset),
/* Maps the data object at data_offset and bisects its entry list
 * (entry_offset, entry_array_offset, n_entries) with
 * generic_array_bisect_plus_one(), returning that call's result. */
2133 int journal_file_move_to_entry_by_offset_for_data(
2135 uint64_t data_offset,
2137 direction_t direction,
2138 Object **ret, uint64_t *offset) {
2145 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2149 return generic_array_bisect_plus_one(f,
2150 le64toh(d->data.entry_offset),
2151 le64toh(d->data.entry_array_offset),
2152 le64toh(d->data.n_entries),
/* Seeks by monotonic time among the entries that reference the data object at
 * data_offset.
 *
 * Two entry lists are involved: the one of the "_BOOT_ID=" data object for
 * boot_id (its offset is remembered in b) and the one of the data object at
 * data_offset. After an initial bisection of the boot ID list by monotonic
 * time, the two lists are bisected in turn until an entry present in both is
 * found. Each data object is mapped again with journal_file_move_to_object()
 * before its list is bisected. */
2159 int journal_file_move_to_entry_by_monotonic_for_data(
2161 uint64_t data_offset,
2164 direction_t direction,
2165 Object **ret, uint64_t *offset) {
2173 /* First, seek by time */
2174 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2180 r = generic_array_bisect_plus_one(f,
2181 le64toh(o->data.entry_offset),
2182 le64toh(o->data.entry_array_offset),
2183 le64toh(o->data.n_entries),
2185 test_object_monotonic,
2191 /* And now, continue seeking until we find an entry that
2192 * exists in both bisection arrays */
2198 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2202 r = generic_array_bisect_plus_one(f,
2203 le64toh(d->data.entry_offset),
2204 le64toh(d->data.entry_array_offset),
2205 le64toh(d->data.n_entries),
2213 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2217 r = generic_array_bisect_plus_one(f,
2218 le64toh(o->data.entry_offset),
2219 le64toh(o->data.entry_array_offset),
2220 le64toh(o->data.n_entries),
/* Maps the data object at data_offset and bisects its entry list
 * (entry_offset, entry_array_offset, n_entries) with
 * generic_array_bisect_plus_one(), returning that call's result. */
2242 int journal_file_move_to_entry_by_seqnum_for_data(
2244 uint64_t data_offset,
2246 direction_t direction,
2247 Object **ret, uint64_t *offset) {
2254 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2258 return generic_array_bisect_plus_one(f,
2259 le64toh(d->data.entry_offset),
2260 le64toh(d->data.entry_array_offset),
2261 le64toh(d->data.n_entries),
/* Maps the data object at data_offset and bisects its entry list
 * (entry_offset, entry_array_offset, n_entries) with test_object_realtime()
 * as the ordering test, returning the result of
 * generic_array_bisect_plus_one(). */
2268 int journal_file_move_to_entry_by_realtime_for_data(
2270 uint64_t data_offset,
2272 direction_t direction,
2273 Object **ret, uint64_t *offset) {
2280 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2284 return generic_array_bisect_plus_one(f,
2285 le64toh(d->data.entry_offset),
2286 le64toh(d->data.entry_array_offset),
2287 le64toh(d->data.n_entries),
2289 test_object_realtime,
/* Debugging aid: prints the header of f followed by one line per object to
 * stdout.
 *
 * The walk starts at the offset given by the header's header_size field. Each
 * object is mapped with OBJECT_UNUSED as the type argument and its type is
 * printed; entry and tag objects also show seqnum/timestamps or seqnum/epoch.
 * Compression flags are printed when any bit of OBJECT_COMPRESSION_MASK is
 * set. The current offset is checked against the header's tail_object_offset
 * and otherwise advanced by the 64-bit-aligned object size. "File corrupt" is
 * reported through log_error(). */
2294 void journal_file_dump(JournalFile *f) {
2301 journal_file_print_header(f);
2303 p = le64toh(f->header->header_size);
2305 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2309 switch (o->object.type) {
2312 printf("Type: OBJECT_UNUSED\n");
2316 printf("Type: OBJECT_DATA\n");
2320 printf("Type: OBJECT_FIELD\n");
2324 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2325 le64toh(o->entry.seqnum),
2326 le64toh(o->entry.monotonic),
2327 le64toh(o->entry.realtime));
2330 case OBJECT_FIELD_HASH_TABLE:
2331 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2334 case OBJECT_DATA_HASH_TABLE:
2335 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2338 case OBJECT_ENTRY_ARRAY:
2339 printf("Type: OBJECT_ENTRY_ARRAY\n");
2343 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2344 le64toh(o->tag.seqnum),
2345 le64toh(o->tag.epoch));
2349 printf("Type: unknown (%u)\n", o->object.type);
2353 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2354 printf("Flags: %s\n",
2355 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2357 if (p == le64toh(f->header->tail_object_offset))
2360 p = p + ALIGN64(le64toh(o->object.size));
2365 log_error("File corrupt");
/* Wrapper around format_timestamp() that formats t into buf (of size l) and
 * yields a const char*. NOTE(review): judging by the name, it presumably
 * substitutes a printable fallback when format_timestamp() produces nothing,
 * so the result can be fed to a "%s" conversion -- confirm. */
2368 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2371 x = format_timestamp(buf, l, t);
/* Prints a human-readable summary of the header of f to stdout.
 *
 * Covered are the 128-bit IDs (formatted into 33-byte buffers), the file
 * state, compatible/incompatible flags (" ???" marks bits outside the known
 * masks), header and arena size, head/tail sequence numbers and timestamps,
 * and object counts. Hash table sizes are shown as item counts, i.e. the byte
 * size divided by sizeof(HashItem).
 *
 * Counters that may be missing from the header (n_data, n_fields, n_tags,
 * n_entry_arrays) are only printed when JOURNAL_HEADER_CONTAINS() reports
 * them. Disk usage is derived from st_blocks * 512 and printed only when
 * fstat() succeeds. */
2377 void journal_file_print_header(JournalFile *f) {
2378 char a[33], b[33], c[33], d[33];
2379 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2381 char bytes[FORMAT_BYTES_MAX];
2385 printf("File Path: %s\n"
2389 "Sequential Number ID: %s\n"
2391 "Compatible Flags:%s%s\n"
2392 "Incompatible Flags:%s%s%s\n"
2393 "Header size: %"PRIu64"\n"
2394 "Arena size: %"PRIu64"\n"
2395 "Data Hash Table Size: %"PRIu64"\n"
2396 "Field Hash Table Size: %"PRIu64"\n"
2397 "Rotate Suggested: %s\n"
2398 "Head Sequential Number: %"PRIu64"\n"
2399 "Tail Sequential Number: %"PRIu64"\n"
2400 "Head Realtime Timestamp: %s\n"
2401 "Tail Realtime Timestamp: %s\n"
2402 "Tail Monotonic Timestamp: %s\n"
2403 "Objects: %"PRIu64"\n"
2404 "Entry Objects: %"PRIu64"\n",
2406 sd_id128_to_string(f->header->file_id, a),
2407 sd_id128_to_string(f->header->machine_id, b),
2408 sd_id128_to_string(f->header->boot_id, c),
2409 sd_id128_to_string(f->header->seqnum_id, d),
2410 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2411 f->header->state == STATE_ONLINE ? "ONLINE" :
2412 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2413 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2414 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2415 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2416 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2417 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2418 le64toh(f->header->header_size),
2419 le64toh(f->header->arena_size),
2420 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2421 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2422 yes_no(journal_file_rotate_suggested(f, 0)),
2423 le64toh(f->header->head_entry_seqnum),
2424 le64toh(f->header->tail_entry_seqnum),
2425 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2426 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2427 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2428 le64toh(f->header->n_objects),
2429 le64toh(f->header->n_entries));
2431 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2432 printf("Data Objects: %"PRIu64"\n"
2433 "Data Hash Table Fill: %.1f%%\n",
2434 le64toh(f->header->n_data),
2435 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2437 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2438 printf("Field Objects: %"PRIu64"\n"
2439 "Field Hash Table Fill: %.1f%%\n",
2440 le64toh(f->header->n_fields),
2441 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2443 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2444 printf("Tag Objects: %"PRIu64"\n",
2445 le64toh(f->header->n_tags));
2446 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2447 printf("Entry Array Objects: %"PRIu64"\n",
2448 le64toh(f->header->n_entry_arrays));
2450 if (fstat(f->fd, &st) >= 0)
2451 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
/* Opens (and, for an empty writable file, initializes) the journal file
 * fname and sets up a JournalFile object for it.
 *
 * Steps taken:
 *   - The access mode in flags must be O_RDONLY or O_RDWR, and fname must end
 *     in ".journal" or ".journal~".
 *   - A JournalFile is allocated. It takes a reference on mmap_cache or
 *     creates a fresh cache, duplicates the path and gets a chain cache.
 *   - The file is opened with O_CLOEXEC added and then stat'ed.
 *   - A writable file of size 0 gets a "user.crtime_usec" xattr, has the
 *     FSPRG state loaded and its header initialized from template. It is then
 *     stat'ed again and flagged as newly created.
 *   - The file size is checked against HEADER_SIZE_MIN and the header is
 *     mapped with MAP_SHARED.
 *   - The header of a pre-existing file is verified. A pre-existing writable
 *     file additionally has its FSPRG state loaded.
 *   - Metrics are completed with journal_default_metrics() and copied into f,
 *     or taken over from template.
 *   - The header is refreshed and HMAC setup is done.
 *   - A newly created file gets its field and data hash tables and a first
 *     tag.
 *   - The field and data hash tables are mapped.
 *
 * journal_file_close() is invoked on f near the end of the function.
 * NOTE(review): presumably that is the shared error path -- confirm. */
2454 int journal_file_open(
2460 JournalMetrics *metrics,
2461 MMapCache *mmap_cache,
2462 JournalFile *template,
2463 JournalFile **ret) {
2467 bool newly_created = false;
2472 if ((flags & O_ACCMODE) != O_RDONLY &&
2473 (flags & O_ACCMODE) != O_RDWR)
2476 if (!endswith(fname, ".journal") &&
2477 !endswith(fname, ".journal~"))
2480 f = new0(JournalFile, 1);
2488 f->prot = prot_from_flags(flags);
2489 f->writable = (flags & O_ACCMODE) != O_RDONLY;
/* With both compressors available LZ4 takes precedence (#if before #elif) */
2490 #if defined(HAVE_LZ4)
2491 f->compress_lz4 = compress;
2492 #elif defined(HAVE_XZ)
2493 f->compress_xz = compress;
2500 f->mmap = mmap_cache_ref(mmap_cache);
2502 f->mmap = mmap_cache_new();
2509 f->path = strdup(fname);
2515 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2516 if (!f->chain_cache) {
2521 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2527 if (fstat(f->fd, &f->last_stat) < 0) {
2532 if (f->last_stat.st_size == 0 && f->writable) {
2535 /* Let's attach the creation time to the journal file,
2536 * so that the vacuuming code knows the age of this
2537 * file even if the file might end up corrupted one
2538 * day... Ideally we'd just use the creation time many
2539 * file systems maintain for each file, but there is
2540 * currently no usable API to query this, hence let's
2541 * emulate this via extended attributes. If extended
2542 * attributes are not supported we'll just skip this,
2543 * and rely solely on mtime/atime/ctime of the file. */
2545 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
/* Best effort: the return value of fsetxattr() is deliberately ignored */
2546 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2549 /* Try to load the FSPRG state, and if we can't, then
2550 * just don't do sealing */
2552 r = journal_file_fss_load(f);
2558 r = journal_file_init_header(f, template);
2562 if (fstat(f->fd, &f->last_stat) < 0) {
2567 newly_created = true;
2570 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2575 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2576 if (f->header == MAP_FAILED) {
/* Only a pre-existing file needs its header verified */
2582 if (!newly_created) {
2583 r = journal_file_verify_header(f);
2589 if (!newly_created && f->writable) {
2590 r = journal_file_fss_load(f);
2598 journal_default_metrics(metrics, f->fd);
2599 f->metrics = *metrics;
2600 } else if (template)
2601 f->metrics = template->metrics;
2603 r = journal_file_refresh_header(f);
2609 r = journal_file_hmac_setup(f);
2614 if (newly_created) {
2615 r = journal_file_setup_field_hash_table(f);
2619 r = journal_file_setup_data_hash_table(f);
2624 r = journal_file_append_first_tag(f);
2630 r = journal_file_map_field_hash_table(f);
2634 r = journal_file_map_data_hash_table(f);
2642 journal_file_close(f);
/* Rotates the journal file referenced by *f.
 *
 * The file must be writable and its path must end in ".journal". It is
 * renamed to "<path without .journal>@<seqnum_id>-<head seqnum>-<head
 * realtime>.journal", with both numbers printed as 16 hex digits, and its
 * header state is set to STATE_ARCHIVED. A new file is then opened at the
 * original path with the old file's flags, mode and mmap cache, using the old
 * file as template, and the old file is closed. */
2647 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2648 _cleanup_free_ char *p = NULL;
2650 JournalFile *old_file, *new_file = NULL;
2658 if (!old_file->writable)
2661 if (!endswith(old_file->path, ".journal"))
2664 l = strlen(old_file->path);
/* l - 8 cuts off the 8 characters of the ".journal" suffix */
2665 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2666 (int) l - 8, old_file->path,
2667 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2668 le64toh((*f)->header->head_entry_seqnum),
2669 le64toh((*f)->header->head_entry_realtime));
2673 r = rename(old_file->path, p);
2677 old_file->header->state = STATE_ARCHIVED;
2679 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2680 journal_file_close(old_file);
/* Like journal_file_open(), but makes one attempt to recover from an unusable
 * file.
 *
 * The first journal_file_open() call is tested against a fixed set of error
 * codes (corrupted, truncated, other machine, incompatible feature, unclean
 * shutdown, already archived). The recovery path additionally checks that the
 * access mode is not O_RDONLY, that O_CREAT is set and that fname ends in
 * ".journal". The existing file is then renamed to a name ending in
 * ".journal~", a warning is logged and journal_file_open() is called a second
 * and last time with the same arguments. */
2686 int journal_file_open_reliably(
2692 JournalMetrics *metrics,
2693 MMapCache *mmap_cache,
2694 JournalFile *template,
2695 JournalFile **ret) {
2699 _cleanup_free_ char *p = NULL;
2701 r = journal_file_open(fname, flags, mode, compress, seal,
2702 metrics, mmap_cache, template, ret);
2703 if (r != -EBADMSG && /* corrupted */
2704 r != -ENODATA && /* truncated */
2705 r != -EHOSTDOWN && /* other machine */
2706 r != -EPROTONOSUPPORT && /* incompatible feature */
2707 r != -EBUSY && /* unclean shutdown */
2708 r != -ESHUTDOWN /* already archived */)
2711 if ((flags & O_ACCMODE) == O_RDONLY)
2714 if (!(flags & O_CREAT))
2717 if (!endswith(fname, ".journal"))
2720 /* The file is corrupted. Rotate it away and try it again (but only once) */
2723 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2725 (unsigned long long) now(CLOCK_REALTIME),
2729 r = rename(fname, p);
2733 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2735 return journal_file_open(fname, flags, mode, compress, seal,
2736 metrics, mmap_cache, template, ret);
/* Copies the entry object o, located at offset p in file from, into file to.
 *
 * The entry's monotonic and realtime timestamps are carried over. For each
 * item of the entry the referenced data object is mapped from the source, its
 * hash is compared with the hash stored in the item, and the payload is
 * decompressed first if any compression flag is set. The payload is then
 * appended to the destination with journal_file_append_data(). The hash and
 * offset of the resulting destination object go into a stack-allocated item
 * array, and the hashes are XORed together. Finally the new entry is appended
 * with journal_file_append_entry_internal(), which also receives seqnum, ret
 * and offset.
 *
 * -EPROTONOSUPPORT is returned on the non-decompression side of the
 * HAVE_XZ/HAVE_LZ4 conditional. */
2739 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2741 uint64_t q, xor_hash = 0;
2754 ts.monotonic = le64toh(o->entry.monotonic);
2755 ts.realtime = le64toh(o->entry.realtime);
2757 n = journal_file_entry_n_items(o);
2758 /* alloca() can't take 0, hence let's allocate at least one */
2759 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2761 for (i = 0; i < n; i++) {
2768 q = le64toh(o->entry.items[i].object_offset);
2769 le_hash = o->entry.items[i].hash;
/* From here on o refers to the data object, no longer to the entry */
2771 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2775 if (le_hash != o->data.hash)
2778 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2781 /* We hit the limit on 32bit machines */
2782 if ((uint64_t) t != l)
2785 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2786 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2789 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2790 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2794 data = from->compress_buffer;
2797 return -EPROTONOSUPPORT;
2800 data = o->data.payload;
2802 r = journal_file_append_data(to, data, l, &u, &h);
2806 xor_hash ^= le64toh(u->data.hash);
2807 items[i].object_offset = htole64(h);
2808 items[i].hash = u->data.hash;
/* Map the entry object again so that o->entry.items[] is valid for the next item */
2810 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2815 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
/* Fills in and sanitizes the size limits in m, using the file system that fd
 * lives on as the basis for defaults. The file system size is f_frsize *
 * f_blocks from fstatvfs() and stays 0 if that call fails. A field equal to
 * (uint64_t) -1 is replaced by a derived default.
 *
 *   max_use:   10% of the file system size, clamped to
 *              [DEFAULT_MAX_USE_LOWER, DEFAULT_MAX_USE_UPPER]. It is then
 *              page-aligned and raised to at least 2 * JOURNAL_FILE_SIZE_MIN.
 *   max_size:  max_use / 8, capped at DEFAULT_MAX_SIZE_UPPER. It is then
 *              page-aligned and raised to at least JOURNAL_FILE_SIZE_MIN.
 *              max_use is bumped so that it is at least 2 * max_size.
 *   min_size:  JOURNAL_FILE_SIZE_MIN. It is then page-aligned and never below
 *              JOURNAL_FILE_SIZE_MIN. max_size is raised to min_size if it is
 *              smaller.
 *   keep_free: 15% of the file system size, capped at
 *              DEFAULT_KEEP_FREE_UPPER, with DEFAULT_KEEP_FREE as the
 *              alternative.
 *
 * The resulting values are logged at debug level. */
2818 void journal_default_metrics(JournalMetrics *m, int fd) {
2819 uint64_t fs_size = 0;
2821 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2826 if (fstatvfs(fd, &ss) >= 0)
2827 fs_size = ss.f_frsize * ss.f_blocks;
2829 if (m->max_use == (uint64_t) -1) {
2832 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2834 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2835 m->max_use = DEFAULT_MAX_USE_UPPER;
2837 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2838 m->max_use = DEFAULT_MAX_USE_LOWER;
2840 m->max_use = DEFAULT_MAX_USE_LOWER;
2842 m->max_use = PAGE_ALIGN(m->max_use);
2844 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2845 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2848 if (m->max_size == (uint64_t) -1) {
2849 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2851 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2852 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2854 m->max_size = PAGE_ALIGN(m->max_size);
2856 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2857 m->max_size = JOURNAL_FILE_SIZE_MIN;
2859 if (m->max_size*2 > m->max_use)
2860 m->max_use = m->max_size*2;
2862 if (m->min_size == (uint64_t) -1)
2863 m->min_size = JOURNAL_FILE_SIZE_MIN;
2865 m->min_size = PAGE_ALIGN(m->min_size);
2867 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2868 m->min_size = JOURNAL_FILE_SIZE_MIN;
2870 if (m->min_size > m->max_size)
2871 m->max_size = m->min_size;
2874 if (m->keep_free == (uint64_t) -1) {
2877 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2879 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2880 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2883 m->keep_free = DEFAULT_KEEP_FREE;
2886 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2887 format_bytes(a, sizeof(a), m->max_use),
2888 format_bytes(b, sizeof(b), m->max_size),
2889 format_bytes(c, sizeof(c), m->min_size),
2890 format_bytes(d, sizeof(d), m->keep_free));
/* Reports the wallclock range covered by f: *from receives the header's
 * head_entry_realtime and *to its tail_entry_realtime, both converted with
 * le64toh(). A header value of 0 is checked for separately before each
 * assignment. */
2893 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
/* Comparing against 0 needs no byte swap: zero looks the same in any byte order */
2898 if (f->header->head_entry_realtime == 0)
2901 *from = le64toh(f->header->head_entry_realtime);
2905 if (f->header->tail_entry_realtime == 0)
2908 *to = le64toh(f->header->tail_entry_realtime);
/* Reports the monotonic time range that f covers for the boot boot_id.
 *
 * The "_BOOT_ID=" data object for boot_id is looked up and its offset kept in
 * p; its n_entries is checked for being zero. *from is the monotonic
 * timestamp of the entry at the data object's entry_offset. For *to the data
 * object is mapped again and the entry at index n_entries - 1 is fetched with
 * generic_array_get_plus_one(). */
2914 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2922 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2926 if (le64toh(o->data.n_entries) <= 0)
/* o is reused: from here it refers to the first entry, not the data object */
2930 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2934 *from = le64toh(o->entry.monotonic);
/* Go back to the data object (offset p) to reach its entry list again */
2938 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2942 r = generic_array_get_plus_one(f,
2943 le64toh(o->data.entry_offset),
2944 le64toh(o->data.entry_array_offset),
2945 le64toh(o->data.n_entries)-1,
2950 *to = le64toh(o->entry.monotonic);
2956 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2959 /* If we gained new header fields we gained new features,
2960 * hence suggest a rotation */
2961 if (le64toh(f->header->header_size) < sizeof(Header)) {
2962 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2966 /* Let's check if the hash tables grew over a certain fill
2967 * level (75%, borrowing this value from Java's hash table
2968 * implementation), and if so suggest a rotation. To calculate
2969 * the fill level we need the n_data field, which only exists
2970 * in newer versions. */
2972 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2973 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2974 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2976 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2977 le64toh(f->header->n_data),
2978 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2979 (unsigned long long) f->last_stat.st_size,
2980 f->last_stat.st_size / le64toh(f->header->n_data));
2984 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2985 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2986 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2988 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2989 le64toh(f->header->n_fields),
2990 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2994 /* Are the data objects properly indexed by field objects? */
2995 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2996 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2997 le64toh(f->header->n_data) > 0 &&
2998 le64toh(f->header->n_fields) == 0)
3001 if (max_file_usec > 0) {
3004 h = le64toh(f->header->head_entry_realtime);
3005 t = now(CLOCK_REALTIME);
3007 if (h > 0 && t > h + max_file_usec)