1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
/* Includes and file-scope tunables for the on-disk journal file implementation. */
26 #include <sys/statvfs.h>
31 #include "btrfs-util.h"
32 #include "journal-def.h"
33 #include "journal-file.h"
34 #include "journal-authenticate.h"
/* Default hash table sizes, expressed in bytes (multiples of sizeof(HashItem)) */
39 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
40 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
/* Data objects at least this large are considered for compression */
42 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
44 /* This is the minimum journal file size */
45 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
47 /* These are the lower and upper bounds if we deduce the max_use value
48 * from the file system size */
49 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
50 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
52 /* This is the upper bound if we deduce max_size from max_use */
53 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
55 /* This is the upper bound if we deduce the keep_free value from the
57 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
59 /* This is the keep_free value when we can't determine the system
61 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
63 /* n_data was the first entry we added after the initial file format design */
64 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
66 /* How many entries to keep in the entry array chain cache at max */
67 #define CHAIN_CACHE_MAX 20
69 /* How much to increase the journal file size at once each time we allocate something new. */
70 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
72 /* Reread fstat() of the file for detecting deletions at least this often */
73 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
75 /* The mmap context to use for the header we pick as one above the last defined typed */
76 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
/* Mark the mapped header as STATE_ONLINE before we start writing.
 * Requires a valid fd and a mapped header; a pending SIGBUS on the
 * mapping (nullified pages after truncation) is checked first.
 * NOTE(review): interior source lines are elided in this excerpt. */
78 static int journal_file_set_online(JournalFile *f) {
84 if (!(f->fd >= 0 && f->header))
87 if (mmap_cache_got_sigbus(f->mmap, f->fd))
90 switch(f->header->state) {
95 f->header->state = STATE_ONLINE;
/* Transition the header from STATE_ONLINE back to STATE_OFFLINE.
 * The SIGBUS flag is checked both before and after the state write, so a
 * truncated/nullified mapping is detected on either side of the update.
 * NOTE(review): interior source lines are elided in this excerpt. */
104 int journal_file_set_offline(JournalFile *f) {
110 if (!(f->fd >= 0 && f->header))
113 if (f->header->state != STATE_ONLINE)
118 if (mmap_cache_got_sigbus(f->mmap, f->fd))
121 f->header->state = STATE_OFFLINE;
123 if (mmap_cache_got_sigbus(f->mmap, f->fd))
/* Tear down a JournalFile: write a final seal tag if applicable, flip the
 * header offline, release the mmap windows, optionally defragment on btrfs,
 * and free all per-file caches and FSS/FSPRG sealing state.
 * NOTE(review): interior source lines are elided in this excerpt. */
131 void journal_file_close(JournalFile *f) {
135 /* Write the final tag */
136 if (f->seal && f->writable)
137 journal_file_append_tag(f);
140 journal_file_set_offline(f);
142 if (f->mmap && f->fd >= 0)
143 mmap_cache_close_fd(f->mmap, f->fd);
145 if (f->fd >= 0 && f->defrag_on_close) {
147 /* Be friendly to btrfs: turn COW back on again now,
148 * and defragment the file. We won't write to the file
149 * ever again, hence remove all fragmentation, and
150 * reenable all the good bits COW usually provides
151 * (such as data checksumming). */
/* Errors are deliberately ignored: defragmentation is best-effort. */
153 (void) chattr_fd(f->fd, false, FS_NOCOW_FL);
154 (void) btrfs_defrag_fd(f->fd);
161 mmap_cache_unref(f->mmap);
163 ordered_hashmap_free_free(f->chain_cache);
165 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
166 free(f->compress_buffer);
/* Sealing state: FSS file is mmap()ed, FSPRG state is heap-allocated */
171 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
172 else if (f->fsprg_state)
173 free(f->fsprg_state);
178 gcry_md_close(f->hmac);
/* Write a fresh Header at offset 0 of a newly created journal file.
 * Flags are derived from the file's compression/sealing settings; the
 * seqnum id and tail seqnum are inherited from 'template' when given,
 * otherwise the seqnum id is seeded from the new random file id.
 * NOTE(review): interior source lines are elided in this excerpt. */
184 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
191 memcpy(h.signature, HEADER_SIGNATURE, 8);
192 h.header_size = htole64(ALIGN64(sizeof(h)));
/* bool * flag-constant: sets the flag iff the feature is enabled */
194 h.incompatible_flags |= htole32(
195 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
196 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
198 h.compatible_flags = htole32(
199 f->seal * HEADER_COMPATIBLE_SEALED);
201 r = sd_id128_randomize(&h.file_id);
206 h.seqnum_id = template->header->seqnum_id;
207 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
209 h.seqnum_id = h.file_id;
211 k = pwrite(f->fd, &h, sizeof(h), 0);
/* Refresh machine and boot ids in the header and bring the file online.
 * If the stored boot id matches the current one, the cached tail
 * monotonic timestamp is still trustworthy.
 * NOTE(review): interior source lines are elided in this excerpt. */
221 static int journal_file_refresh_header(JournalFile *f) {
227 r = sd_id128_get_machine(&f->header->machine_id);
231 r = sd_id128_get_boot(&boot_id);
235 if (sd_id128_equal(boot_id, f->header->boot_id))
236 f->tail_entry_monotonic_valid = true;
238 f->header->boot_id = boot_id;
240 r = journal_file_set_online(f);
242 /* Sync the online state to disk */
/* Validate the header of an opened journal file before using it:
 * signature, incompatible/compatible flags, state, structural sizes and
 * 64-bit alignment of all offsets, machine id, and open-state sanity.
 * Returns -EPROTONOSUPPORT for flags unsupported at compile time.
 * Finally copies the compression/sealing flags into the JournalFile.
 * NOTE(review): interior source lines are elided in this excerpt. */
248 static int journal_file_verify_header(JournalFile *f) {
253 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
256 /* In both read and write mode we refuse to open files with
257 * incompatible flags we don't know */
258 flags = le32toh(f->header->incompatible_flags);
259 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
260 if (flags & ~HEADER_INCOMPATIBLE_ANY)
261 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
262 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
263 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
265 log_debug("Journal file %s uses incompatible flags %"PRIx32
266 " disabled at compilation time.", f->path, flags);
267 return -EPROTONOSUPPORT;
270 /* When open for writing we refuse to open files with
271 * compatible flags, too */
272 flags = le32toh(f->header->compatible_flags);
273 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
274 if (flags & ~HEADER_COMPATIBLE_ANY)
275 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
276 f->path, flags & ~HEADER_COMPATIBLE_ANY);
277 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
279 log_debug("Journal file %s uses compatible flags %"PRIx32
280 " disabled at compilation time.", f->path, flags);
281 return -EPROTONOSUPPORT;
284 if (f->header->state >= _STATE_MAX)
287 /* The first addition was n_data, so check that we are at least this large */
288 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
/* A sealed file must be new enough to carry the n_entry_arrays field */
291 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
/* header + arena must fit within the actual file size ... */
294 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
/* ... and the tail object must lie within header + arena */
297 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
300 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
301 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
302 !VALID64(le64toh(f->header->tail_object_offset)) ||
303 !VALID64(le64toh(f->header->entry_array_offset)))
308 sd_id128_t machine_id;
311 r = sd_id128_get_machine(&machine_id);
315 if (!sd_id128_equal(machine_id, f->header->machine_id))
318 state = f->header->state;
320 if (state == STATE_ONLINE) {
321 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
323 } else if (state == STATE_ARCHIVED)
325 else if (state != STATE_OFFLINE) {
326 log_debug("Journal file %s has unknown state %i.", f->path, state);
/* Adopt the file's compression/sealing settings for subsequent writes */
331 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
332 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
334 f->seal = JOURNAL_HEADER_SEALED(f->header);
/* Refresh the cached fstat() data and its timestamp; refuse to continue
 * appending to a file that was deleted underneath us (nlink == 0).
 * NOTE(review): interior source lines are elided in this excerpt. */
339 static int journal_file_fstat(JournalFile *f) {
343 if (fstat(f->fd, &f->last_stat) < 0)
346 f->last_stat_usec = now(CLOCK_MONOTONIC);
348 /* Refuse appending to files that are already deleted */
349 if (f->last_stat.st_nlink <= 0)
/* Make sure the file is large enough to hold [offset, offset+size).
 * If already large enough, only a rate-limited fstat() refresh is done to
 * detect deletion. Otherwise grow the file in FILE_SIZE_INCREASE chunks,
 * respecting metrics.max_size and the keep_free budget as reported by
 * fstatvfs(), allocate via posix_fallocate() and update arena_size.
 * NOTE(review): interior source lines are elided in this excerpt. */
355 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
356 uint64_t old_size, new_size;
361 /* We assume that this file is not sparse, and we know that
362 * for sure, since we always call posix_fallocate()
365 if (mmap_cache_got_sigbus(f->mmap, f->fd))
/* Current allocated extent: header plus arena */
369 le64toh(f->header->header_size) +
370 le64toh(f->header->arena_size);
372 new_size = PAGE_ALIGN(offset + size);
373 if (new_size < le64toh(f->header->header_size))
374 new_size = le64toh(f->header->header_size);
376 if (new_size <= old_size) {
378 /* We already pre-allocated enough space, but before
379 * we write to it, let's check with fstat() if the
380 * file got deleted, in order make sure we don't throw
381 * away the data immediately. Don't check fstat() for
382 * all writes though, but only once ever 10s. */
384 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
387 return journal_file_fstat(f);
390 /* Allocate more space. */
392 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
395 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
398 if (fstatvfs(f->fd, &svfs) >= 0) {
401 available = svfs.f_bfree * svfs.f_bsize;
403 if (available >= f->metrics.keep_free)
404 available -= f->metrics.keep_free;
408 if (new_size - old_size > available)
413 /* Increase by larger blocks at once */
414 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
415 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
416 new_size = f->metrics.max_size;
418 /* Note that the glibc fallocate() fallback is very
419 inefficient, hence we try to minimize the allocation area
421 r = posix_fallocate(f->fd, old_size, new_size - old_size);
425 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
427 return journal_file_fstat(f);
/* Map an object type to its mmap cache context; anything out of range
 * (including OBJECT_UNUSED) falls into the catch-all context 0. */
430 static unsigned type_to_context(ObjectType type) {
431 /* One context for each type, plus one catch-all for the rest */
432 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
433 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
434 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
/* Map the file range [offset, offset+size) into memory via the mmap
 * cache and return a pointer in *ret. An out-of-range request triggers a
 * single fstat() refresh before being rejected with -EADDRNOTAVAIL.
 * NOTE(review): interior source lines are elided in this excerpt. */
437 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
446 /* Avoid SIGBUS on invalid accesses */
447 if (offset + size > (uint64_t) f->last_stat.st_size) {
448 /* Hmm, out of range? Let's refresh the fstat() data
449 * first, before we trust that check. */
451 r = journal_file_fstat(f);
455 if (offset + size > (uint64_t) f->last_stat.st_size)
456 return -EADDRNOTAVAIL;
459 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
/* Smallest valid on-disk size for an object of o's type; unknown types
 * only need to fit the generic ObjectHeader. */
462 static uint64_t minimum_header_size(Object *o) {
464 static const uint64_t table[] = {
465 [OBJECT_DATA] = sizeof(DataObject),
466 [OBJECT_FIELD] = sizeof(FieldObject),
467 [OBJECT_ENTRY] = sizeof(EntryObject),
468 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
469 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
470 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
471 [OBJECT_TAG] = sizeof(TagObject),
474 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
475 return sizeof(ObjectHeader);
477 return table[o->object.type];
/* Map the object at 'offset' and validate it: 64-bit aligned offset, a
 * plausible size, a used type, matching the requested type. The header
 * is mapped first; the full object is remapped once its size is known.
 * NOTE(review): interior source lines are elided in this excerpt. */
480 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
489 /* Objects may only be located at multiple of 64 bit */
490 if (!VALID64(offset))
493 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
498 s = le64toh(o->object.size);
500 if (s < sizeof(ObjectHeader))
503 if (o->object.type <= OBJECT_UNUSED)
506 if (s < minimum_header_size(o))
/* OBJECT_UNUSED as 'type' means the caller accepts any type */
509 if (type > OBJECT_UNUSED && o->object.type != type)
512 if (s > sizeof(ObjectHeader)) {
513 r = journal_file_move_to(f, type, false, offset, s, &t);
/* Compute and record the next entry sequence number (tail + 1), also
 * initializing head_entry_seqnum on the very first entry. When an
 * external counter is passed, both are kept at the maximum of the two.
 * NOTE(review): interior source lines are elided in this excerpt. */
524 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
529 r = le64toh(f->header->tail_entry_seqnum) + 1;
532 /* If an external seqnum counter was passed, we update
533 * both the local and the external one, and set it to
534 * the maximum of both */
542 f->header->tail_entry_seqnum = htole64(r);
544 if (f->header->head_entry_seqnum == 0)
545 f->header->head_entry_seqnum = htole64(r);
/* Append a new zero-initialized object of the given type and size at the
 * end of the file: locate the position after the current tail object,
 * grow the file as needed, map the region, stamp type/size, and update
 * the header's tail_object_offset and n_objects accounting.
 * NOTE(review): interior source lines are elided in this excerpt. */
550 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
557 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
558 assert(size >= sizeof(ObjectHeader));
562 r = journal_file_set_online(f);
566 p = le64toh(f->header->tail_object_offset);
/* No tail object yet: start right after the header */
568 p = le64toh(f->header->header_size);
570 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
574 p += ALIGN64(le64toh(tail->object.size));
577 r = journal_file_allocate(f, p, size);
581 r = journal_file_move_to(f, type, false, p, size, &t);
588 o->object.type = type;
589 o->object.size = htole64(size);
591 f->header->tail_object_offset = htole64(p);
592 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
/* Create the data hash table object, sized from metrics.max_size so the
 * table stays below ~75% fill at one item per 768 bytes of journal, with
 * DEFAULT_DATA_HASH_TABLE_SIZE as the floor. Records offset/size in the
 * header. NOTE(review): interior source lines are elided in this excerpt. */
600 static int journal_file_setup_data_hash_table(JournalFile *f) {
607 /* We estimate that we need 1 hash table entry per 768 of
608 journal file and we want to make sure we never get beyond
609 75% fill level. Calculate the hash table size for the
610 maximum file size based on these metrics. */
612 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
613 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
614 s = DEFAULT_DATA_HASH_TABLE_SIZE;
616 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
618 r = journal_file_append_object(f,
619 OBJECT_DATA_HASH_TABLE,
620 offsetof(Object, hash_table.items) + s,
625 memzero(o->hash_table.items, s);
/* Header records the offset of the items array, not of the object */
627 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
628 f->header->data_hash_table_size = htole64(s);
/* Create the field hash table object; unlike the data table this one has
 * a fixed size, since the number of distinct field names grows slowly.
 * NOTE(review): interior source lines are elided in this excerpt. */
633 static int journal_file_setup_field_hash_table(JournalFile *f) {
640 /* We use a fixed size hash table for the fields as this
641 * number should grow very slowly only */
643 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
644 r = journal_file_append_object(f,
645 OBJECT_FIELD_HASH_TABLE,
646 offsetof(Object, hash_table.items) + s,
651 memzero(o->hash_table.items, s);
653 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
654 f->header->field_hash_table_size = htole64(s);
/* Map the data hash table (offset/size taken from the header) and cache
 * the pointer in f->data_hash_table.
 * NOTE(review): interior source lines are elided in this excerpt. */
659 static int journal_file_map_data_hash_table(JournalFile *f) {
666 p = le64toh(f->header->data_hash_table_offset);
667 s = le64toh(f->header->data_hash_table_size);
669 r = journal_file_move_to(f,
670 OBJECT_DATA_HASH_TABLE,
677 f->data_hash_table = t;
/* Map the field hash table (offset/size taken from the header) and cache
 * the pointer in f->field_hash_table.
 * NOTE(review): interior source lines are elided in this excerpt. */
681 static int journal_file_map_field_hash_table(JournalFile *f) {
688 p = le64toh(f->header->field_hash_table_offset);
689 s = le64toh(f->header->field_hash_table_size);
691 r = journal_file_move_to(f,
692 OBJECT_FIELD_HASH_TABLE,
699 f->field_hash_table = t;
/* Insert a freshly appended FIELD object into its hash table bucket:
 * reset its chain pointers, then either start the bucket (head) or hook
 * it onto the previous tail's next_hash_offset, and bump n_fields.
 * NOTE(review): interior source lines are elided in this excerpt. */
703 static int journal_file_link_field(
716 if (o->object.type != OBJECT_FIELD)
/* m = number of buckets; h (elided here) is hash % m */
719 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
723 /* This might alter the window we are looking at */
724 o->field.next_hash_offset = o->field.head_data_offset = 0;
727 p = le64toh(f->field_hash_table[h].tail_hash_offset);
729 f->field_hash_table[h].head_hash_offset = htole64(offset);
731 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
735 o->field.next_hash_offset = htole64(offset);
738 f->field_hash_table[h].tail_hash_offset = htole64(offset);
740 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
741 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
/* Insert a freshly appended DATA object into its hash table bucket,
 * mirroring journal_file_link_field(): reset all chain/entry pointers,
 * link at head or after the current tail, and bump n_data.
 * NOTE(review): interior source lines are elided in this excerpt. */
746 static int journal_file_link_data(
759 if (o->object.type != OBJECT_DATA)
762 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
766 /* This might alter the window we are looking at */
767 o->data.next_hash_offset = o->data.next_field_offset = 0;
768 o->data.entry_offset = o->data.entry_array_offset = 0;
769 o->data.n_entries = 0;
772 p = le64toh(f->data_hash_table[h].tail_hash_offset);
774 /* Only entry in the hash table is easy */
775 f->data_hash_table[h].head_hash_offset = htole64(offset);
777 /* Move back to the previous data object, to patch in
780 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
784 o->data.next_hash_offset = htole64(offset);
787 f->data_hash_table[h].tail_hash_offset = htole64(offset);
789 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
790 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
/* Look up a FIELD object by payload and precomputed hash: walk the hash
 * bucket chain, comparing stored hash, exact object size and payload
 * bytes. NOTE(review): interior source lines are elided in this excerpt. */
795 int journal_file_find_field_object_with_hash(
797 const void *field, uint64_t size, uint64_t hash,
798 Object **ret, uint64_t *offset) {
800 uint64_t p, osize, h, m;
804 assert(field && size > 0);
/* Expected on-disk size of a matching object: header + payload */
806 osize = offsetof(Object, field.payload) + size;
808 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
814 p = le64toh(f->field_hash_table[h].head_hash_offset);
819 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
823 if (le64toh(o->field.hash) == hash &&
824 le64toh(o->object.size) == osize &&
825 memcmp(o->field.payload, field, size) == 0) {
835 p = le64toh(o->field.next_hash_offset);
/* Convenience wrapper: hash the field payload, then delegate to
 * journal_file_find_field_object_with_hash().
 * NOTE(review): interior source lines are elided in this excerpt. */
841 int journal_file_find_field_object(
843 const void *field, uint64_t size,
844 Object **ret, uint64_t *offset) {
849 assert(field && size > 0);
851 hash = hash64(field, size);
853 return journal_file_find_field_object_with_hash(f,
/* Look up a DATA object by payload and precomputed hash. Like the field
 * variant, but compressed objects are decompressed into the per-file
 * scratch buffer before comparison; compressed objects found without
 * XZ/LZ4 support yield -EPROTONOSUPPORT.
 * NOTE(review): interior source lines are elided in this excerpt. */
858 int journal_file_find_data_object_with_hash(
860 const void *data, uint64_t size, uint64_t hash,
861 Object **ret, uint64_t *offset) {
863 uint64_t p, osize, h, m;
867 assert(data || size == 0);
869 osize = offsetof(Object, data.payload) + size;
871 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
876 p = le64toh(f->data_hash_table[h].head_hash_offset);
881 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
885 if (le64toh(o->data.hash) != hash)
888 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
889 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
/* l = on-disk payload length of the compressed blob */
893 l = le64toh(o->object.size);
894 if (l <= offsetof(Object, data.payload))
897 l -= offsetof(Object, data.payload);
899 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
900 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
905 memcmp(f->compress_buffer, data, size) == 0) {
916 return -EPROTONOSUPPORT;
918 } else if (le64toh(o->object.size) == osize &&
919 memcmp(o->data.payload, data, size) == 0) {
931 p = le64toh(o->data.next_hash_offset);
/* Convenience wrapper: hash the data payload, then delegate to
 * journal_file_find_data_object_with_hash().
 * NOTE(review): interior source lines are elided in this excerpt. */
937 int journal_file_find_data_object(
939 const void *data, uint64_t size,
940 Object **ret, uint64_t *offset) {
945 assert(data || size == 0);
947 hash = hash64(data, size);
949 return journal_file_find_data_object_with_hash(f,
/* Get-or-create a FIELD object for the given name: try the hash table
 * first; on miss append a new object, copy the payload, link it into the
 * table (re-mapping afterwards, since linking can move the mmap window)
 * and feed it to the HMAC for sealing.
 * NOTE(review): interior source lines are elided in this excerpt. */
954 static int journal_file_append_field(
956 const void *field, uint64_t size,
957 Object **ret, uint64_t *offset) {
965 assert(field && size > 0);
967 hash = hash64(field, size);
969 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
983 osize = offsetof(Object, field.payload) + size;
984 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
988 o->field.hash = htole64(hash);
989 memcpy(o->field.payload, field, size);
991 r = journal_file_link_field(f, o, p, hash);
995 /* The linking might have altered the window, so let's
996 * refresh our pointer */
997 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1002 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
/* Get-or-create a DATA object for a "FIELD=value" payload: dedupe via
 * the hash table; on miss append a new object, optionally compressing
 * payloads >= COMPRESSION_SIZE_THRESHOLD, link it into the hash table,
 * create/link the corresponding FIELD object (the part before '='), and
 * HMAC the object for sealing.
 * NOTE(review): interior source lines are elided in this excerpt. */
1016 static int journal_file_append_data(
1018 const void *data, uint64_t size,
1019 Object **ret, uint64_t *offset) {
1024 int r, compression = 0;
1028 assert(data || size == 0);
1030 hash = hash64(data, size);
1032 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1046 osize = offsetof(Object, data.payload) + size;
1047 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1051 o->data.hash = htole64(hash);
1053 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1054 if (f->compress_xz &&
1055 size >= COMPRESSION_SIZE_THRESHOLD) {
1058 compression = compress_blob(data, size, o->data.payload, &rsize);
/* On success shrink the recorded object size to the compressed length */
1061 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1062 o->object.flags |= compression;
1064 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1065 size, rsize, object_compressed_to_string(compression));
1070 if (!compression && size > 0)
1071 memcpy(o->data.payload, data, size);
1073 r = journal_file_link_data(f, o, p, hash);
1077 /* The linking might have altered the window, so let's
1078 * refresh our pointer */
1079 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* Chain this data object into the per-field list keyed on "NAME=" */
1086 eq = memchr(data, '=', size);
1087 if (eq && eq > data) {
1091 /* Create field object ... */
1092 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1096 /* ... and link it in. */
1097 o->data.next_field_offset = fo->field.head_data_offset;
1098 fo->field.head_data_offset = le64toh(p);
1102 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
/* Number of EntryItems carried by an ENTRY object, derived from its size. */
1116 uint64_t journal_file_entry_n_items(Object *o) {
1119 if (o->object.type != OBJECT_ENTRY)
1122 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
/* Number of offsets carried by an ENTRY_ARRAY object, derived from its size. */
1125 uint64_t journal_file_entry_array_n_items(Object *o) {
1128 if (o->object.type != OBJECT_ENTRY_ARRAY)
1131 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
/* Number of HashItems in a data or field hash table object, from its size. */
1134 uint64_t journal_file_hash_table_n_items(Object *o) {
1137 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1138 o->object.type != OBJECT_FIELD_HASH_TABLE)
1141 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
/* Append entry offset 'p' at index *idx of the entry array chain rooted
 * at *first: walk the chain to the array holding that index, store the
 * offset there, or append a fresh ENTRY_ARRAY object to the chain (also
 * HMAC'd and counted in n_entry_arrays) when the chain is exhausted.
 * *idx is advanced on success.
 * NOTE(review): interior source lines are elided in this excerpt; the
 * sizing rule for newly appended arrays is not visible here. */
1144 static int link_entry_into_array(JournalFile *f,
1149 uint64_t n = 0, ap = 0, q, i, a, hidx;
1157 a = le64toh(*first);
1158 i = hidx = le64toh(*idx);
1161 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1165 n = journal_file_entry_array_n_items(o);
/* Index falls inside this array: store and we are done */
1167 o->entry_array.items[i] = htole64(p);
1168 *idx = htole64(hidx + 1);
1174 a = le64toh(o->entry_array.next_entry_array_offset);
1185 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1186 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1192 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1197 o->entry_array.items[i] = htole64(p);
/* Hook the new array into the chain: either as its head ... */
1200 *first = htole64(q);
/* ... or behind the previous last array at offset 'ap' */
1202 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1206 o->entry_array.next_entry_array_offset = htole64(q);
1209 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1210 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1212 *idx = htole64(hidx + 1);
/* Variant of link_entry_into_array() where the very first item lives
 * inline in *extra rather than in the array chain; subsequent items are
 * delegated to link_entry_into_array() with the index shifted by one.
 * NOTE(review): interior source lines are elided in this excerpt. */
1217 static int link_entry_into_array_plus_one(JournalFile *f,
1232 *extra = htole64(p);
1236 i = htole64(le64toh(*idx) - 1);
1237 r = link_entry_into_array(f, first, &i, p);
1242 *idx = htole64(le64toh(*idx) + 1);
/* Register one item of an entry with its DATA object: follow the item's
 * object_offset to the data object and append the entry's offset to that
 * object's entry list (entry_offset inline + entry_array chain).
 * NOTE(review): interior source lines are elided in this excerpt. */
1246 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1253 p = le64toh(o->entry.items[i].object_offset);
1257 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1261 return link_entry_into_array_plus_one(f,
1262 &o->data.entry_offset,
1263 &o->data.entry_array_offset,
/* Link a finished ENTRY object into the file: memory barrier first so the
 * entry payload is globally visible, then add it to the global entry
 * array, refresh the head/tail timestamps in the header, and link each of
 * its items into the respective data objects.
 * NOTE(review): interior source lines are elided in this excerpt. */
1268 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1276 if (o->object.type != OBJECT_ENTRY)
1279 __sync_synchronize();
1281 /* Link up the entry itself */
1282 r = link_entry_into_array(f,
1283 &f->header->entry_array_offset,
1284 &f->header->n_entries,
1289 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1291 if (f->header->head_entry_realtime == 0)
1292 f->header->head_entry_realtime = o->entry.realtime;
1294 f->header->tail_entry_realtime = o->entry.realtime;
1295 f->header->tail_entry_monotonic = o->entry.monotonic;
1297 f->tail_entry_monotonic_valid = true;
1299 /* Link up the items */
1300 n = journal_file_entry_n_items(o);
1301 for (i = 0; i < n; i++) {
1302 r = journal_file_link_entry_item(f, o, offset, i);
/* Append a complete ENTRY object from pre-built items: allocate the
 * object, fill in seqnum, items, timestamps, xor hash and boot id, HMAC
 * it for sealing, and link it into the file's indexes.
 * NOTE(review): interior source lines are elided in this excerpt. */
1310 static int journal_file_append_entry_internal(
1312 const dual_timestamp *ts,
1314 const EntryItem items[], unsigned n_items,
1316 Object **ret, uint64_t *offset) {
1323 assert(items || n_items == 0);
1326 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1328 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1332 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1333 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1334 o->entry.realtime = htole64(ts->realtime);
1335 o->entry.monotonic = htole64(ts->monotonic);
1336 o->entry.xor_hash = htole64(xor_hash);
1337 o->entry.boot_id = f->header->boot_id;
1340 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1345 r = journal_file_link_entry(f, o, np);
/* Nudge inotify watchers after mmap-based writes: truncating the file to
 * its current size generates an IN_MODIFY event without changing data. */
1358 void journal_file_post_change(JournalFile *f) {
1361 /* inotify() does not receive IN_MODIFY events from file
1362 * accesses done via mmap(). After each access we hence
1363 * trigger IN_MODIFY by truncating the journal file to its
1364 * current size which triggers IN_MODIFY. */
1366 __sync_synchronize();
1368 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1369 log_error_errno(errno, "Failed to truncate file to its own size: %m");
/* qsort comparator: order EntryItems by ascending on-disk object offset. */
1372 static int entry_item_cmp(const void *_a, const void *_b) {
1373 const EntryItem *a = _a, *b = _b;
1375 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1377 if (le64toh(a->object_offset) > le64toh(b->object_offset))
/* Public entry append path: validate monotonic ordering against the
 * cached tail, maybe emit a seal tag, turn each iovec into a (deduped)
 * DATA object while accumulating the xor hash, sort the items by disk
 * offset, and hand off to journal_file_append_entry_internal(). A SIGBUS
 * on the mapping is reported as -EIO, and watchers are notified at the
 * end. NOTE(review): interior source lines are elided in this excerpt. */
1382 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1386 uint64_t xor_hash = 0;
1387 struct dual_timestamp _ts;
1390 assert(iovec || n_iovec == 0);
1393 dual_timestamp_get(&_ts);
/* Entries must never go backwards in monotonic time within a file */
1397 if (f->tail_entry_monotonic_valid &&
1398 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1402 r = journal_file_maybe_append_tag(f, ts->realtime);
1407 /* alloca() can't take 0, hence let's allocate at least one */
1408 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1410 for (i = 0; i < n_iovec; i++) {
1414 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1418 xor_hash ^= le64toh(o->data.hash);
1419 items[i].object_offset = htole64(p);
1420 items[i].hash = o->data.hash;
1423 /* Order by the position on disk, in order to improve seek
1424 * times for rotating media. */
1425 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1427 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1429 /* If the memory mapping triggered a SIGBUS then we return an
1430 * IO error and ignore the error code passed down to us, since
1431 * it is very likely just an effect of a nullified replacement
1434 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1437 journal_file_post_change(f);
/* Per-chain memo used to avoid re-walking entry array chains from the
 * start on repeated lookups; keyed by the chain's first array offset. */
1442 typedef struct ChainCacheItem {
1443 uint64_t first; /* the array at the beginning of the chain */
1444 uint64_t array; /* the cached array */
1445 uint64_t begin; /* the first item in the cached array */
1446 uint64_t total; /* the total number of items in all arrays before this one in the chain */
1447 uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
/* Store (or refresh) a ChainCacheItem in the per-file chain cache; when
 * the cache is full the oldest item is recycled rather than allocating.
 * NOTE(review): interior source lines are elided in this excerpt. */
1450 static void chain_cache_put(
1457 uint64_t last_index) {
1460 /* If the chain item to cache for this chain is the
1461 * first one it's not worth caching anything */
1465 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
/* Full: evict the oldest entry and reuse its allocation */
1466 ci = ordered_hashmap_steal_first(h);
1469 ci = new(ChainCacheItem, 1);
1476 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1481 assert(ci->first == first);
1486 ci->last_index = last_index;
/* Fetch the i-th entry referenced by the entry array chain starting at
 * 'first': consult the chain cache to skip ahead, walk arrays adjusting
 * the index, record the found position back into the cache, and map the
 * target ENTRY object.
 * NOTE(review): interior source lines are elided in this excerpt. */
1489 static int generic_array_get(
1493 Object **ret, uint64_t *offset) {
1496 uint64_t p = 0, a, t = 0;
1504 /* Try the chain cache first */
1505 ci = ordered_hashmap_get(f->chain_cache, &first);
1506 if (ci && i > ci->total) {
1515 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1519 k = journal_file_entry_array_n_items(o);
1521 p = le64toh(o->entry_array.items[i]);
1527 a = le64toh(o->entry_array.next_entry_array_offset);
1533 /* Let's cache this item for the next invocation */
1534 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1536 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Like generic_array_get(), but index 0 refers to the inline 'extra'
 * entry; indexes >= 1 are delegated to the array chain shifted by one.
 * NOTE(review): interior source lines are elided in this excerpt. */
1549 static int generic_array_get_plus_one(
1554 Object **ret, uint64_t *offset) {
1563 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1576 return generic_array_get(f, first, i-1, ret, offset);
/* Bisect an entry array chain of n items for the entry matching 'needle'
 * under 'test_object', honoring the search 'direction'. Uses the chain
 * cache to skip whole arrays, and the cached last_index to first probe
 * the immediate neighbors of the previous hit before falling back to a
 * plain binary search. On success maps the found ENTRY, reports its
 * global index via *idx, and refreshes the chain cache. DIRECTION_UP may
 * need to step back one item ('subtract_one') when the match boundary
 * falls between arrays.
 * NOTE(review): interior source lines are elided in this excerpt; this
 * commentary covers only the visible control flow. */
1585 static int generic_array_bisect(
1590 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1591 direction_t direction,
1596 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1597 bool subtract_one = false;
1598 Object *o, *array = NULL;
1603 assert(test_object);
1605 /* Start with the first array in the chain */
1608 ci = ordered_hashmap_get(f->chain_cache, &first);
1609 if (ci && n > ci->total) {
1610 /* Ah, we have iterated this bisection array chain
1611 * previously! Let's see if we can skip ahead in the
1612 * chain, as far as the last time. But we can't jump
1613 * backwards in the chain, so let's check that
1616 r = test_object(f, ci->begin, needle);
1620 if (r == TEST_LEFT) {
1621 /* OK, what we are looking for is right of the
1622 * begin of this EntryArray, so let's jump
1623 * straight to previously cached array in the
1629 last_index = ci->last_index;
/* Walk the chain, one array per iteration */
1634 uint64_t left, right, k, lp;
1636 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1640 k = journal_file_entry_array_n_items(array);
/* Probe the last item of this array to decide whether to descend */
1646 lp = p = le64toh(array->entry_array.items[i]);
1650 r = test_object(f, p, needle);
1654 if (r == TEST_FOUND)
1655 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1657 if (r == TEST_RIGHT) {
1661 if (last_index != (uint64_t) -1) {
1662 assert(last_index <= right);
1664 /* If we cached the last index we
1665 * looked at, let's try to not to jump
1666 * too wildly around and see if we can
1667 * limit the range to look at early to
1668 * the immediate neighbors of the last
1669 * index we looked at. */
1671 if (last_index > 0) {
1672 uint64_t x = last_index - 1;
1674 p = le64toh(array->entry_array.items[x]);
1678 r = test_object(f, p, needle);
1682 if (r == TEST_FOUND)
1683 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1685 if (r == TEST_RIGHT)
1691 if (last_index < right) {
1692 uint64_t y = last_index + 1;
1694 p = le64toh(array->entry_array.items[y]);
1698 r = test_object(f, p, needle);
1702 if (r == TEST_FOUND)
1703 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1705 if (r == TEST_RIGHT)
/* Binary search within [left, right] of this array */
1713 if (left == right) {
1714 if (direction == DIRECTION_UP)
1715 subtract_one = true;
1721 assert(left < right);
1722 i = (left + right) / 2;
1724 p = le64toh(array->entry_array.items[i]);
1728 r = test_object(f, p, needle);
1732 if (r == TEST_FOUND)
1733 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1735 if (r == TEST_RIGHT)
1743 if (direction == DIRECTION_UP) {
1745 subtract_one = true;
/* Advance to the next array in the chain */
1756 last_index = (uint64_t) -1;
1757 a = le64toh(array->entry_array.next_entry_array_offset);
1763 if (subtract_one && t == 0 && i == 0)
1766 /* Let's cache this item for the next invocation */
1767 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1769 if (subtract_one && i == 0)
1771 else if (subtract_one)
1772 p = le64toh(array->entry_array.items[i-1]);
1774 p = le64toh(array->entry_array.items[i]);
1776 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1787 *idx = t + i + (subtract_one ? -1 : 0);
/* Bisect a chain that keeps its first item inline in 'extra': test the
 * extra entry first, then bisect the remaining n-1 items in the array
 * chain; for DIRECTION_UP the extra entry is remembered as a fallback
 * ('step_back') if the array search finds nothing.
 * NOTE(review): interior source lines are elided in this excerpt. */
1792 static int generic_array_bisect_plus_one(
1798 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1799 direction_t direction,
1805 bool step_back = false;
1809 assert(test_object);
1814 /* This bisects the array in object 'first', but first checks
1816 r = test_object(f, extra, needle);
1820 if (r == TEST_FOUND)
1821 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1823 /* if we are looking with DIRECTION_UP then we need to first
1824 see if in the actual array there is a matching entry, and
1825 return the last one of that. But if there isn't any we need
1826 to return this one. Hence remember this, and return it
1829 step_back = direction == DIRECTION_UP;
1831 if (r == TEST_RIGHT) {
1832 if (direction == DIRECTION_DOWN)
1838 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1840 if (r == 0 && step_back)
/* Fall back to the inline extra entry */
1849 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
/* Bisection callback: orders a candidate entry by raw file offset 'p'
 * against 'needle' (pure comparison, no object access needed). */
1865 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1871 else if (p < needle)
/* Bisection callback: maps the entry at offset 'p' and orders it by its
 * sequence number against 'needle'. */
1877 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1884 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* On-disk fields are little-endian; convert before comparing. */
1888 if (le64toh(o->entry.seqnum) == needle)
1890 else if (le64toh(o->entry.seqnum) < needle)
/* Seeks to the entry with the given sequence number by bisecting the
 * file's global entry array (offset and count taken from the header). */
1896 int journal_file_move_to_entry_by_seqnum(
1899 direction_t direction,
1903 return generic_array_bisect(f,
1904 le64toh(f->header->entry_array_offset),
1905 le64toh(f->header->n_entries),
/* Bisection callback: maps the entry at offset 'p' and orders it by its
 * realtime (wallclock) timestamp against 'needle'. */
1912 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1919 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1923 if (le64toh(o->entry.realtime) == needle)
1925 else if (le64toh(o->entry.realtime) < needle)
/* Seeks to the entry closest to a realtime timestamp by bisecting the
 * file's global entry array. */
1931 int journal_file_move_to_entry_by_realtime(
1934 direction_t direction,
1938 return generic_array_bisect(f,
1939 le64toh(f->header->entry_array_offset),
1940 le64toh(f->header->n_entries),
1942 test_object_realtime,
/* Bisection callback: maps the entry at offset 'p' and orders it by its
 * monotonic timestamp against 'needle'.  Only meaningful within one boot;
 * callers constrain the search to a single _BOOT_ID= data object. */
1947 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1954 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1958 if (le64toh(o->entry.monotonic) == needle)
1960 else if (le64toh(o->entry.monotonic) < needle)
/* Looks up the data object for the synthesized "_BOOT_ID=<id>" field of
 * the given boot ID. */
1966 static int find_data_object_by_boot_id(
/* Buffer: "_BOOT_ID=" prefix + 32 hex characters of the id + NUL. */
1972 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
/* Render the id right after the 9-byte "_BOOT_ID=" prefix. */
1974 sd_id128_to_string(boot_id, t + 9);
1975 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
/* Seeks by monotonic time.  Monotonic timestamps are only comparable
 * within a single boot, so first locate the _BOOT_ID= data object for
 * 'boot_id', then bisect the entries linked from it. */
1978 int journal_file_move_to_entry_by_monotonic(
1982 direction_t direction,
1991 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1997 return generic_array_bisect_plus_one(f,
1998 le64toh(o->data.entry_offset),
1999 le64toh(o->data.entry_array_offset),
2000 le64toh(o->data.n_entries),
2002 test_object_monotonic,
/* Resets the file's read pointer to the head, clearing every cached
 * location field so the next iteration starts from scratch. */
2007 void journal_file_reset_location(JournalFile *f) {
2008 f->location_type = LOCATION_HEAD;
2009 f->current_offset = 0;
2010 f->current_seqnum = 0;
2011 f->current_realtime = 0;
2012 f->current_monotonic = 0;
2013 zero(f->current_boot_id);
2014 f->current_xor_hash = 0;
/* Caches entry 'o' at file offset 'offset' as the file's current seek
 * position, converting the on-disk little-endian fields to host order so
 * journal_file_compare_locations() can use them directly. */
2017 void journal_file_save_location(JournalFile *f, direction_t direction, Object *o, uint64_t offset) {
2018 f->last_direction = direction;
2019 f->location_type = LOCATION_SEEK;
2020 f->current_offset = offset;
2021 f->current_seqnum = le64toh(o->entry.seqnum);
2022 f->current_realtime = le64toh(o->entry.realtime);
2023 f->current_monotonic = le64toh(o->entry.monotonic);
/* sd_id128_t is copied as-is; it has no endianness. */
2024 f->current_boot_id = o->entry.boot_id;
2025 f->current_xor_hash = le64toh(o->entry.xor_hash);
/* Orders the cached current entries of two journal files for interleaved
 * iteration.  Both files must have a saved seek location.  Comparison
 * cascade: identity (boot id + timestamps + xor hash), then seqnum if the
 * seqnum sources match, then monotonic time within the same boot, then
 * realtime, and finally the content xor hash as a tie breaker. */
2028 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2031 assert(af->location_type == LOCATION_SEEK);
2032 assert(bf->location_type == LOCATION_SEEK);
2034 /* If contents and timestamps match, these entries are
2035 * identical, even if the seqnum does not match */
2036 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2037 af->current_monotonic == bf->current_monotonic &&
2038 af->current_realtime == bf->current_realtime &&
2039 af->current_xor_hash == bf->current_xor_hash)
2042 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2044 /* If this is from the same seqnum source, compare
2046 if (af->current_seqnum < bf->current_seqnum)
2048 if (af->current_seqnum > bf->current_seqnum)
2051 /* Wow! This is weird, different data but the same
2052 * seqnums? Something is borked, but let's make the
2053 * best of it and compare by time. */
2056 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2058 /* If the boot id matches, compare monotonic time */
2059 if (af->current_monotonic < bf->current_monotonic)
2061 if (af->current_monotonic > bf->current_monotonic)
2065 /* Otherwise, compare UTC time */
2066 if (af->current_realtime < bf->current_realtime)
2068 if (af->current_realtime > bf->current_realtime)
2071 /* Finally, compare by contents */
2072 if (af->current_xor_hash < bf->current_xor_hash)
2074 if (af->current_xor_hash > bf->current_xor_hash)
/* Moves to the next (or previous, per 'direction') entry relative to
 * offset 'p' in the global entry array; with p == 0 it starts at the
 * first/last entry.  Also sanity-checks that the array is strictly
 * ordered by offset.  NOTE(review): fragmentary extract — interior lines
 * are missing from this view. */
2080 int journal_file_next_entry(
2083 direction_t direction,
2084 Object **ret, uint64_t *offset) {
2091 n = le64toh(f->header->n_entries);
/* No reference offset: start at the head (down) or tail (up). */
2096 i = direction == DIRECTION_DOWN ? 0 : n - 1;
/* Otherwise locate 'p' in the global entry array first. */
2098 r = generic_array_bisect(f,
2099 le64toh(f->header->entry_array_offset),
2100 le64toh(f->header->n_entries),
2109 if (direction == DIRECTION_DOWN) {
2122 /* And jump to it */
2123 r = generic_array_get(f,
2124 le64toh(f->header->entry_array_offset),
/* Entries must be strictly monotonic in offset; anything else means the
 * entry array is corrupted. */
2131 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2132 log_debug("%s: entry array corrupted at entry %"PRIu64,
/* Like journal_file_next_entry(), but iterates only over the entries that
 * reference the data object at 'data_offset' (per-field iteration).
 * NOTE(review): fragmentary extract — interior lines are missing. */
2143 int journal_file_next_entry_for_data(
2145 Object *o, uint64_t p,
2146 uint64_t data_offset,
2147 direction_t direction,
2148 Object **ret, uint64_t *offset) {
/* 'o'/'p' are optional, but must be supplied together. */
2155 assert(p > 0 || !o);
2157 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2161 n = le64toh(d->data.n_entries);
/* No reference entry: start at the head (down) or tail (up). */
2166 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2168 if (o->object.type != OBJECT_ENTRY)
/* Locate the reference entry in this data object's entry list. */
2171 r = generic_array_bisect_plus_one(f,
2172 le64toh(d->data.entry_offset),
2173 le64toh(d->data.entry_array_offset),
2174 le64toh(d->data.n_entries),
2184 if (direction == DIRECTION_DOWN) {
2198 return generic_array_get_plus_one(f,
2199 le64toh(d->data.entry_offset),
2200 le64toh(d->data.entry_array_offset),
/* Seeks, among the entries referencing the data object at 'data_offset',
 * to the one closest to a given file offset. */
2205 int journal_file_move_to_entry_by_offset_for_data(
2207 uint64_t data_offset,
2209 direction_t direction,
2210 Object **ret, uint64_t *offset) {
2217 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2221 return generic_array_bisect_plus_one(f,
2222 le64toh(d->data.entry_offset),
2223 le64toh(d->data.entry_array_offset),
2224 le64toh(d->data.n_entries),
/* Seeks by monotonic time within the entries that reference the data
 * object at 'data_offset'.  Works by alternating bisections: once over
 * the _BOOT_ID= entry list (by time) and once over the data object's
 * entry list (by offset), until both converge on an entry present in
 * both.  NOTE(review): fragmentary extract — the loop construct and exit
 * conditions are partly outside this view. */
2231 int journal_file_move_to_entry_by_monotonic_for_data(
2233 uint64_t data_offset,
2236 direction_t direction,
2237 Object **ret, uint64_t *offset) {
2245 /* First, seek by time */
2246 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2252 r = generic_array_bisect_plus_one(f,
2253 le64toh(o->data.entry_offset),
2254 le64toh(o->data.entry_array_offset),
2255 le64toh(o->data.n_entries),
2257 test_object_monotonic,
2263 /* And now, continue seeking until we find an entry that
2264 * exists in both bisection arrays */
/* Re-map the data object: earlier mappings may have been invalidated. */
2270 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2274 r = generic_array_bisect_plus_one(f,
2275 le64toh(d->data.entry_offset),
2276 le64toh(d->data.entry_array_offset),
2277 le64toh(d->data.n_entries),
/* Same for the _BOOT_ID= data object at offset 'b'. */
2285 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2289 r = generic_array_bisect_plus_one(f,
2290 le64toh(o->data.entry_offset),
2291 le64toh(o->data.entry_array_offset),
2292 le64toh(o->data.n_entries),
/* Seeks, among the entries referencing the data object at 'data_offset',
 * to the one with the given sequence number. */
2314 int journal_file_move_to_entry_by_seqnum_for_data(
2316 uint64_t data_offset,
2318 direction_t direction,
2319 Object **ret, uint64_t *offset) {
2326 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2330 return generic_array_bisect_plus_one(f,
2331 le64toh(d->data.entry_offset),
2332 le64toh(d->data.entry_array_offset),
2333 le64toh(d->data.n_entries),
/* Seeks, among the entries referencing the data object at 'data_offset',
 * to the one closest to the given realtime timestamp. */
2340 int journal_file_move_to_entry_by_realtime_for_data(
2342 uint64_t data_offset,
2344 direction_t direction,
2345 Object **ret, uint64_t *offset) {
2352 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2356 return generic_array_bisect_plus_one(f,
2357 le64toh(d->data.entry_offset),
2358 le64toh(d->data.entry_array_offset),
2359 le64toh(d->data.n_entries),
2361 test_object_realtime,
/* Debug helper: prints the header, then walks every object from the end
 * of the header to the tail object, printing one line per object type.
 * Stops at the tail object offset; bails out with "File corrupt" if an
 * object cannot be mapped.  NOTE(review): fragmentary extract — the loop
 * construct and some case labels are outside this view. */
2366 void journal_file_dump(JournalFile *f) {
2373 journal_file_print_header(f);
/* Objects start immediately after the header. */
2375 p = le64toh(f->header->header_size);
/* OBJECT_UNUSED acts as a wildcard type for mapping. */
2377 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2381 switch (o->object.type) {
2384 printf("Type: OBJECT_UNUSED\n");
2388 printf("Type: OBJECT_DATA\n");
2392 printf("Type: OBJECT_FIELD\n");
2396 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2397 le64toh(o->entry.seqnum),
2398 le64toh(o->entry.monotonic),
2399 le64toh(o->entry.realtime));
2402 case OBJECT_FIELD_HASH_TABLE:
2403 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2406 case OBJECT_DATA_HASH_TABLE:
2407 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2410 case OBJECT_ENTRY_ARRAY:
2411 printf("Type: OBJECT_ENTRY_ARRAY\n");
2415 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2416 le64toh(o->tag.seqnum),
2417 le64toh(o->tag.epoch));
2421 printf("Type: unknown (%i)\n", o->object.type);
2425 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2426 printf("Flags: %s\n",
2427 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
/* The tail object is the last one in the file. */
2429 if (p == le64toh(f->header->tail_object_offset))
/* Objects are 64-bit aligned on disk. */
2432 p = p + ALIGN64(le64toh(o->object.size));
2437 log_error("File corrupt");
/* Wraps format_timestamp() with a fallback when formatting fails.
 * NOTE(review): fragmentary — the declaration of 'x' and the fallback
 * return value are outside this extract; confirm against full source. */
2440 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2443 x = format_timestamp(buf, l, t);
/* Debug helper: prints a human-readable summary of the journal file
 * header (ids, state, flags, sizes, seqnum/timestamp ranges, object
 * counts), plus hash-table fill levels and disk usage where the header
 * version carries those fields. */
2449 void journal_file_print_header(JournalFile *f) {
/* a-d: 33-byte buffers for sd_id128_to_string() (32 hex chars + NUL). */
2450 char a[33], b[33], c[33], d[33];
2451 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2453 char bytes[FORMAT_BYTES_MAX];
2457 printf("File Path: %s\n"
2461 "Sequential Number ID: %s\n"
2463 "Compatible Flags:%s%s\n"
2464 "Incompatible Flags:%s%s%s\n"
2465 "Header size: %"PRIu64"\n"
2466 "Arena size: %"PRIu64"\n"
2467 "Data Hash Table Size: %"PRIu64"\n"
2468 "Field Hash Table Size: %"PRIu64"\n"
2469 "Rotate Suggested: %s\n"
2470 "Head Sequential Number: %"PRIu64"\n"
2471 "Tail Sequential Number: %"PRIu64"\n"
2472 "Head Realtime Timestamp: %s\n"
2473 "Tail Realtime Timestamp: %s\n"
2474 "Tail Monotonic Timestamp: %s\n"
2475 "Objects: %"PRIu64"\n"
2476 "Entry Objects: %"PRIu64"\n",
2478 sd_id128_to_string(f->header->file_id, a),
2479 sd_id128_to_string(f->header->machine_id, b),
2480 sd_id128_to_string(f->header->boot_id, c),
2481 sd_id128_to_string(f->header->seqnum_id, d),
2482 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2483 f->header->state == STATE_ONLINE ? "ONLINE" :
2484 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2485 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
/* " ???" marks flag bits this build does not know about. */
2486 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2487 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2488 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2489 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2490 le64toh(f->header->header_size),
2491 le64toh(f->header->arena_size),
/* Table sizes stored in bytes; report them as item counts. */
2492 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2493 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2494 yes_no(journal_file_rotate_suggested(f, 0)),
2495 le64toh(f->header->head_entry_seqnum),
2496 le64toh(f->header->tail_entry_seqnum),
2497 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2498 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2499 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2500 le64toh(f->header->n_objects),
2501 le64toh(f->header->n_entries));
/* The following counters only exist in newer header revisions. */
2503 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2504 printf("Data Objects: %"PRIu64"\n"
2505 "Data Hash Table Fill: %.1f%%\n",
2506 le64toh(f->header->n_data),
2507 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2509 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2510 printf("Field Objects: %"PRIu64"\n"
2511 "Field Hash Table Fill: %.1f%%\n",
2512 le64toh(f->header->n_fields),
2513 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2515 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2516 printf("Tag Objects: %"PRIu64"\n",
2517 le64toh(f->header->n_tags));
2518 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2519 printf("Entry Array Objects: %"PRIu64"\n",
2520 le64toh(f->header->n_entry_arrays));
/* st_blocks is in 512-byte units, giving actual disk usage (may differ
 * from the apparent size on sparse files). */
2522 if (fstat(f->fd, &st) >= 0)
2523 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
/* Opens (or creates, per 'flags') a journal file, validates its header,
 * maps the hash tables, and on creation initializes header, hash tables
 * and sealing state.  'template' optionally supplies metrics and header
 * defaults from a predecessor file.  On success *ret owns the new
 * JournalFile.  NOTE(review): fragmentary extract — error-path lines are
 * missing between the visible ones. */
2526 int journal_file_open(
2532 JournalMetrics *metrics,
2533 MMapCache *mmap_cache,
2534 JournalFile *template,
2535 JournalFile **ret) {
2537 bool newly_created = false;
/* Only plain read-only or read-write opens are supported. */
2545 if ((flags & O_ACCMODE) != O_RDONLY &&
2546 (flags & O_ACCMODE) != O_RDWR)
/* ".journal~" is the suffix used for corrupted/rotated-away files. */
2549 if (!endswith(fname, ".journal") &&
2550 !endswith(fname, ".journal~"))
2553 f = new0(JournalFile, 1);
2561 f->prot = prot_from_flags(flags);
2562 f->writable = (flags & O_ACCMODE) != O_RDONLY;
/* Compression backend is a compile-time choice; LZ4 wins if both are
 * available. */
2563 #if defined(HAVE_LZ4)
2564 f->compress_lz4 = compress;
2565 #elif defined(HAVE_XZ)
2566 f->compress_xz = compress;
/* Share the caller's mmap cache if given, otherwise create our own. */
2573 f->mmap = mmap_cache_ref(mmap_cache);
2575 f->mmap = mmap_cache_new();
2582 f->path = strdup(fname);
2588 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2589 if (!f->chain_cache) {
2594 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2600 r = journal_file_fstat(f);
/* Size 0 + writable means we just created the file: initialize it. */
2604 if (f->last_stat.st_size == 0 && f->writable) {
2606 /* Before we write anything, turn off COW logic. Given
2607 * our write pattern that is quite unfriendly to COW
2608 * file systems this should greatly improve
2609 * performance on COW file systems, such as btrfs, at
2610 * the expense of data integrity features (which
2611 * shouldn't be too bad, given that we do our own
2613 r = chattr_fd(f->fd, true, FS_NOCOW_FL);
2615 log_warning_errno(errno, "Failed to set file attributes: %m");
2617 /* Let's attach the creation time to the journal file,
2618 * so that the vacuuming code knows the age of this
2619 * file even if the file might end up corrupted one
2620 * day... Ideally we'd just use the creation time many
2621 * file systems maintain for each file, but there is
2622 * currently no usable API to query this, hence let's
2623 * emulate this via extended attributes. If extended
2624 * attributes are not supported we'll just skip this,
2625 * and rely solely on mtime/atime/ctime of the file. */
2627 fd_setcrtime(f->fd, 0);
2630 /* Try to load the FSPRG state, and if we can't, then
2631 * just don't do sealing */
2633 r = journal_file_fss_load(f);
2639 r = journal_file_init_header(f, template);
/* Re-stat so last_stat reflects the freshly written header. */
2643 r = journal_file_fstat(f);
2647 newly_created = true;
/* Anything smaller than the minimal header cannot be a journal file. */
2650 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2655 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2663 if (!newly_created) {
2664 r = journal_file_verify_header(f);
2670 if (!newly_created && f->writable) {
2671 r = journal_file_fss_load(f);
/* Metrics: explicit argument wins, then the template's, and
 * journal_default_metrics() fills in anything unset. */
2679 journal_default_metrics(metrics, f->fd);
2680 f->metrics = *metrics;
2681 } else if (template)
2682 f->metrics = template->metrics;
2684 r = journal_file_refresh_header(f);
2690 r = journal_file_hmac_setup(f);
/* A brand-new file also needs its hash tables and first tag. */
2695 if (newly_created) {
2696 r = journal_file_setup_field_hash_table(f);
2700 r = journal_file_setup_data_hash_table(f);
2705 r = journal_file_append_first_tag(f);
2711 r = journal_file_map_field_hash_table(f);
2715 r = journal_file_map_data_hash_table(f);
/* A SIGBUS during setup means the file was truncated under us. */
2719 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2728 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2731 journal_file_close(f);
/* Archives the current (writable) journal file — renaming it to
 * "<name>@<seqnum-id>-<head-seqnum>-<head-realtime>.journal" and marking
 * it ARCHIVED — then opens a fresh file under the original name and
 * stores it back into *f.  NOTE(review): fragmentary extract. */
2736 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2737 _cleanup_free_ char *p = NULL;
2739 JournalFile *old_file, *new_file = NULL;
2747 if (!old_file->writable)
/* Files already rotated away (".journal~") are not rotated again. */
2750 if (!endswith(old_file->path, ".journal"))
2753 l = strlen(old_file->path);
/* l - 8 strips the ".journal" suffix before appending the archive tag. */
2754 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2755 (int) l - 8, old_file->path,
2756 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2757 le64toh((*f)->header->head_entry_seqnum),
2758 le64toh((*f)->header->head_entry_realtime));
2762 /* Try to rename the file to the archived version. If the file
2763 * already was deleted, we'll get ENOENT, let's ignore that
2765 r = rename(old_file->path, p);
2766 if (r < 0 && errno != ENOENT)
2769 old_file->header->state = STATE_ARCHIVED;
2771 /* Currently, btrfs is not very good with out write patterns
2772 * and fragments heavily. Let's defrag our journal files when
2773 * we archive them */
2774 old_file->defrag_on_close = true;
/* Open the replacement with the old file as template, then drop the old
 * one regardless of whether the open succeeded. */
2776 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2777 journal_file_close(old_file);
/* Wrapper around journal_file_open() that, on corruption-class errors,
 * renames the damaged file to "<name>@<timestamp>-<random>.journal~" and
 * retries the open exactly once (only for writable, O_CREAT opens of
 * ".journal" files).  NOTE(review): fragmentary extract. */
2783 int journal_file_open_reliably(
2789 JournalMetrics *metrics,
2790 MMapCache *mmap_cache,
2791 JournalFile *template,
2792 JournalFile **ret) {
2796 _cleanup_free_ char *p = NULL;
2798 r = journal_file_open(fname, flags, mode, compress, seal,
2799 metrics, mmap_cache, template, ret);
/* Only the corruption-class errors below warrant rotate-and-retry; any
 * other error (including success) is returned as-is. */
2800 if (r != -EBADMSG && /* corrupted */
2801 r != -ENODATA && /* truncated */
2802 r != -EHOSTDOWN && /* other machine */
2803 r != -EPROTONOSUPPORT && /* incompatible feature */
2804 r != -EBUSY && /* unclean shutdown */
2805 r != -ESHUTDOWN && /* already archived */
2806 r != -EIO && /* IO error, including SIGBUS on mmap */
2807 r != -EIDRM /* File has been deleted */)
2810 if ((flags & O_ACCMODE) == O_RDONLY)
2813 if (!(flags & O_CREAT))
2816 if (!endswith(fname, ".journal"))
2819 /* The file is corrupted. Rotate it away and try it again (but only once) */
2822 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2824 (unsigned long long) now(CLOCK_REALTIME),
2828 r = rename(fname, p);
2832 /* btrfs doesn't cope well with our write pattern and
2833 * fragments heavily. Let's defrag all files we rotate */
/* Best effort: clear NOCOW and defrag; failures are ignored. */
2835 (void) chattr_path(p, false, FS_NOCOW_FL);
2836 (void) btrfs_defrag(p);
2838 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2840 return journal_file_open(fname, flags, mode, compress, seal,
2841 metrics, mmap_cache, template, ret);
/* Copies the entry 'o' at offset 'p' from journal 'from' into journal
 * 'to': each referenced data object is re-read (decompressing if needed),
 * appended to 'to', and a new entry referencing the copies is written.
 * NOTE(review): fragmentary extract — several declarations and error
 * checks are missing between the visible lines. */
2844 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2846 uint64_t q, xor_hash = 0;
2859 ts.monotonic = le64toh(o->entry.monotonic);
2860 ts.realtime = le64toh(o->entry.realtime);
2862 n = journal_file_entry_n_items(o);
2863 /* alloca() can't take 0, hence let's allocate at least one */
2864 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2866 for (i = 0; i < n; i++) {
2873 q = le64toh(o->entry.items[i].object_offset);
2874 le_hash = o->entry.items[i].hash;
/* Note: this remaps 'o' to the data object; the entry is re-mapped
 * again after the loop (see below). */
2876 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
/* Hash recorded in the entry item must match the data object's own. */
2880 if (le_hash != o->data.hash)
2883 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2886 /* We hit the limit on 32bit machines */
2887 if ((uint64_t) t != l)
2890 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2891 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2894 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2895 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2899 data = from->compress_buffer;
/* Compressed object but no compression support compiled in. */
2902 return -EPROTONOSUPPORT;
2905 data = o->data.payload;
2907 r = journal_file_append_data(to, data, l, &u, &h);
/* Entry's xor_hash is the XOR of all its data objects' hashes. */
2911 xor_hash ^= le64toh(u->data.hash);
2912 items[i].object_offset = htole64(h);
2913 items[i].hash = u->data.hash;
/* Re-map the source entry: the loop above repointed 'o' at data
 * objects and may have invalidated the mapping. */
2915 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2920 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
/* Writing may have hit a truncated target file. */
2922 if (mmap_cache_got_sigbus(to->mmap, to->fd))
/* Fills in any unset ((uint64_t) -1) fields of *m with defaults derived
 * from the size of the file system backing 'fd', clamped to the
 * DEFAULT_* bounds at the top of this file, and enforces the internal
 * invariants (min_size <= max_size, max_use >= 2*max_size, page
 * alignment). */
2928 void journal_default_metrics(JournalMetrics *m, int fd) {
2929 uint64_t fs_size = 0;
2931 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
/* Best effort: if fstatvfs() fails, fs_size stays 0 and the fixed
 * fallback defaults are used instead. */
2936 if (fstatvfs(fd, &ss) >= 0)
2937 fs_size = ss.f_frsize * ss.f_blocks;
2939 if (m->max_use == (uint64_t) -1) {
2942 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2944 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2945 m->max_use = DEFAULT_MAX_USE_UPPER;
2947 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2948 m->max_use = DEFAULT_MAX_USE_LOWER;
2950 m->max_use = DEFAULT_MAX_USE_LOWER;
2952 m->max_use = PAGE_ALIGN(m->max_use);
/* Need room for at least the active file plus one archived one. */
2954 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2955 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2958 if (m->max_size == (uint64_t) -1) {
2959 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2961 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2962 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2964 m->max_size = PAGE_ALIGN(m->max_size);
2966 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2967 m->max_size = JOURNAL_FILE_SIZE_MIN;
/* Keep max_use consistent with the per-file maximum. */
2969 if (m->max_size*2 > m->max_use)
2970 m->max_use = m->max_size*2;
2972 if (m->min_size == (uint64_t) -1)
2973 m->min_size = JOURNAL_FILE_SIZE_MIN;
2975 m->min_size = PAGE_ALIGN(m->min_size);
2977 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2978 m->min_size = JOURNAL_FILE_SIZE_MIN;
2980 if (m->min_size > m->max_size)
2981 m->max_size = m->min_size;
2984 if (m->keep_free == (uint64_t) -1) {
2987 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2989 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2990 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2993 m->keep_free = DEFAULT_KEEP_FREE;
2996 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2997 format_bytes(a, sizeof(a), m->max_use),
2998 format_bytes(b, sizeof(b), m->max_size),
2999 format_bytes(c, sizeof(c), m->min_size),
3000 format_bytes(d, sizeof(d), m->keep_free));
/* Reports the realtime timestamps of the first and last entry, straight
 * from the header.  'from' and 'to' are each optional; a zero timestamp
 * means the corresponding bound is not available. */
3003 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3008 if (f->header->head_entry_realtime == 0)
3011 *from = le64toh(f->header->head_entry_realtime);
3015 if (f->header->tail_entry_realtime == 0)
3018 *to = le64toh(f->header->tail_entry_realtime);
/* Reports the monotonic timestamps of the first and last entry belonging
 * to the given boot, by walking the _BOOT_ID= data object's entry list.
 * NOTE(review): fragmentary extract — error checks between the visible
 * lines are missing from this view. */
3024 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3032 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3036 if (le64toh(o->data.n_entries) <= 0)
/* First entry of this boot gives the lower bound. */
3040 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3044 *from = le64toh(o->entry.monotonic);
/* Re-map the data object (mapping was repointed at the entry above). */
3048 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* Last entry (index n_entries-1) gives the upper bound. */
3052 r = generic_array_get_plus_one(f,
3053 le64toh(o->data.entry_offset),
3054 le64toh(o->data.entry_array_offset),
3055 le64toh(o->data.n_entries)-1,
3060 *to = le64toh(o->entry.monotonic);
3066 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3069 /* If we gained new header fields we gained new features,
3070 * hence suggest a rotation */
3071 if (le64toh(f->header->header_size) < sizeof(Header)) {
3072 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3076 /* Let's check if the hash tables grew over a certain fill
3077 * level (75%, borrowing this value from Java's hash table
3078 * implementation), and if so suggest a rotation. To calculate
3079 * the fill level we need the n_data field, which only exists
3080 * in newer versions. */
3082 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3083 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3084 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3086 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3087 le64toh(f->header->n_data),
3088 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3089 (unsigned long long) f->last_stat.st_size,
3090 f->last_stat.st_size / le64toh(f->header->n_data));
3094 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3095 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3096 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3098 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3099 le64toh(f->header->n_fields),
3100 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3104 /* Are the data objects properly indexed by field objects? */
3105 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3106 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3107 le64toh(f->header->n_data) > 0 &&
3108 le64toh(f->header->n_fields) == 0)
3111 if (max_file_usec > 0) {
3114 h = le64toh(f->header->head_entry_realtime);
3115 t = now(CLOCK_REALTIME);
3117 if (h > 0 && t > h + max_file_usec)