1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
/***
  This file is part of systemd.

  Copyright 2011 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
26 #include <sys/statvfs.h>
31 #include "btrfs-util.h"
32 #include "journal-def.h"
33 #include "journal-file.h"
34 #include "journal-authenticate.h"
39 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
40 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
42 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
44 /* This is the minimum journal file size */
45 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
47 /* These are the lower and upper bounds if we deduce the max_use value
48 * from the file system size */
49 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
50 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
52 /* This is the upper bound if we deduce max_size from max_use */
53 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
55 /* This is the upper bound if we deduce the keep_free value from the
57 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
59 /* This is the keep_free value when we can't determine the system
61 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
63 /* n_data was the first entry we added after the initial file format design */
64 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
66 /* How many entries to keep in the entry array chain cache at max */
67 #define CHAIN_CACHE_MAX 20
69 /* How much to increase the journal file size at once each time we allocate something new. */
70 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
72 /* Reread fstat() of the file for detecting deletions at least this often */
73 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
75 /* The mmap context to use for the header we pick as one above the last defined typed */
76 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
78 static int journal_file_set_online(JournalFile *f) {
84 if (!(f->fd >= 0 && f->header))
87 if (mmap_cache_got_sigbus(f->mmap, f->fd))
90 switch(f->header->state) {
95 f->header->state = STATE_ONLINE;
104 int journal_file_set_offline(JournalFile *f) {
110 if (!(f->fd >= 0 && f->header))
113 if (f->header->state != STATE_ONLINE)
118 if (mmap_cache_got_sigbus(f->mmap, f->fd))
121 f->header->state = STATE_OFFLINE;
123 if (mmap_cache_got_sigbus(f->mmap, f->fd))
131 void journal_file_close(JournalFile *f) {
135 /* Write the final tag */
136 if (f->seal && f->writable)
137 journal_file_append_tag(f);
140 journal_file_set_offline(f);
142 if (f->mmap && f->fd >= 0)
143 mmap_cache_close_fd(f->mmap, f->fd);
145 if (f->fd >= 0 && f->defrag_on_close) {
147 /* Be friendly to btrfs: turn COW back on again now,
148 * and defragment the file. We won't write to the file
149 * ever again, hence remove all fragmentation, and
150 * reenable all the good bits COW usually provides
151 * (such as data checksumming). */
153 (void) chattr_fd(f->fd, false, FS_NOCOW_FL);
154 (void) btrfs_defrag_fd(f->fd);
161 mmap_cache_unref(f->mmap);
163 ordered_hashmap_free_free(f->chain_cache);
165 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
166 free(f->compress_buffer);
171 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
172 else if (f->fsprg_state)
173 free(f->fsprg_state);
178 gcry_md_close(f->hmac);
184 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
191 memcpy(h.signature, HEADER_SIGNATURE, 8);
192 h.header_size = htole64(ALIGN64(sizeof(h)));
194 h.incompatible_flags |= htole32(
195 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
196 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
198 h.compatible_flags = htole32(
199 f->seal * HEADER_COMPATIBLE_SEALED);
201 r = sd_id128_randomize(&h.file_id);
206 h.seqnum_id = template->header->seqnum_id;
207 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
209 h.seqnum_id = h.file_id;
211 k = pwrite(f->fd, &h, sizeof(h), 0);
221 static int journal_file_refresh_header(JournalFile *f) {
227 r = sd_id128_get_machine(&f->header->machine_id);
231 r = sd_id128_get_boot(&boot_id);
235 if (sd_id128_equal(boot_id, f->header->boot_id))
236 f->tail_entry_monotonic_valid = true;
238 f->header->boot_id = boot_id;
240 r = journal_file_set_online(f);
242 /* Sync the online state to disk */
248 static int journal_file_verify_header(JournalFile *f) {
253 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
256 /* In both read and write mode we refuse to open files with
257 * incompatible flags we don't know */
258 flags = le32toh(f->header->incompatible_flags);
259 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
260 if (flags & ~HEADER_INCOMPATIBLE_ANY)
261 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
262 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
263 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
265 log_debug("Journal file %s uses incompatible flags %"PRIx32
266 " disabled at compilation time.", f->path, flags);
267 return -EPROTONOSUPPORT;
270 /* When open for writing we refuse to open files with
271 * compatible flags, too */
272 flags = le32toh(f->header->compatible_flags);
273 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
274 if (flags & ~HEADER_COMPATIBLE_ANY)
275 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
276 f->path, flags & ~HEADER_COMPATIBLE_ANY);
277 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
279 log_debug("Journal file %s uses compatible flags %"PRIx32
280 " disabled at compilation time.", f->path, flags);
281 return -EPROTONOSUPPORT;
284 if (f->header->state >= _STATE_MAX)
287 /* The first addition was n_data, so check that we are at least this large */
288 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
291 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
294 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
297 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
300 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
301 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
302 !VALID64(le64toh(f->header->tail_object_offset)) ||
303 !VALID64(le64toh(f->header->entry_array_offset)))
308 sd_id128_t machine_id;
311 r = sd_id128_get_machine(&machine_id);
315 if (!sd_id128_equal(machine_id, f->header->machine_id))
318 state = f->header->state;
320 if (state == STATE_ONLINE) {
321 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
323 } else if (state == STATE_ARCHIVED)
325 else if (state != STATE_OFFLINE) {
326 log_debug("Journal file %s has unknown state %i.", f->path, state);
331 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
332 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
334 f->seal = JOURNAL_HEADER_SEALED(f->header);
339 static int journal_file_fstat(JournalFile *f) {
343 if (fstat(f->fd, &f->last_stat) < 0)
346 f->last_stat_usec = now(CLOCK_MONOTONIC);
348 /* Refuse appending to files that are already deleted */
349 if (f->last_stat.st_nlink <= 0)
355 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
356 uint64_t old_size, new_size;
361 /* We assume that this file is not sparse, and we know that
362 * for sure, since we always call posix_fallocate()
365 if (mmap_cache_got_sigbus(f->mmap, f->fd))
369 le64toh(f->header->header_size) +
370 le64toh(f->header->arena_size);
372 new_size = PAGE_ALIGN(offset + size);
373 if (new_size < le64toh(f->header->header_size))
374 new_size = le64toh(f->header->header_size);
376 if (new_size <= old_size) {
378 /* We already pre-allocated enough space, but before
379 * we write to it, let's check with fstat() if the
380 * file got deleted, in order make sure we don't throw
381 * away the data immediately. Don't check fstat() for
382 * all writes though, but only once ever 10s. */
384 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
387 return journal_file_fstat(f);
390 /* Allocate more space. */
392 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
395 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
398 if (fstatvfs(f->fd, &svfs) >= 0) {
401 available = svfs.f_bfree * svfs.f_bsize;
403 if (available >= f->metrics.keep_free)
404 available -= f->metrics.keep_free;
408 if (new_size - old_size > available)
413 /* Increase by larger blocks at once */
414 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
415 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
416 new_size = f->metrics.max_size;
418 /* Note that the glibc fallocate() fallback is very
419 inefficient, hence we try to minimize the allocation area
421 r = posix_fallocate(f->fd, old_size, new_size - old_size);
425 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
427 return journal_file_fstat(f);
430 static unsigned type_to_context(ObjectType type) {
431 /* One context for each type, plus one catch-all for the rest */
432 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
433 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
434 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
437 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
446 /* Avoid SIGBUS on invalid accesses */
447 if (offset + size > (uint64_t) f->last_stat.st_size) {
448 /* Hmm, out of range? Let's refresh the fstat() data
449 * first, before we trust that check. */
451 r = journal_file_fstat(f);
455 if (offset + size > (uint64_t) f->last_stat.st_size)
456 return -EADDRNOTAVAIL;
459 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
462 static uint64_t minimum_header_size(Object *o) {
464 static const uint64_t table[] = {
465 [OBJECT_DATA] = sizeof(DataObject),
466 [OBJECT_FIELD] = sizeof(FieldObject),
467 [OBJECT_ENTRY] = sizeof(EntryObject),
468 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
469 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
470 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
471 [OBJECT_TAG] = sizeof(TagObject),
474 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
475 return sizeof(ObjectHeader);
477 return table[o->object.type];
480 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
489 /* Objects may only be located at multiple of 64 bit */
490 if (!VALID64(offset))
493 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
498 s = le64toh(o->object.size);
500 if (s < sizeof(ObjectHeader))
503 if (o->object.type <= OBJECT_UNUSED)
506 if (s < minimum_header_size(o))
509 if (type > OBJECT_UNUSED && o->object.type != type)
512 if (s > sizeof(ObjectHeader)) {
513 r = journal_file_move_to(f, type, false, offset, s, &t);
524 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
529 r = le64toh(f->header->tail_entry_seqnum) + 1;
532 /* If an external seqnum counter was passed, we update
533 * both the local and the external one, and set it to
534 * the maximum of both */
542 f->header->tail_entry_seqnum = htole64(r);
544 if (f->header->head_entry_seqnum == 0)
545 f->header->head_entry_seqnum = htole64(r);
550 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
557 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
558 assert(size >= sizeof(ObjectHeader));
562 r = journal_file_set_online(f);
566 p = le64toh(f->header->tail_object_offset);
568 p = le64toh(f->header->header_size);
570 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
574 p += ALIGN64(le64toh(tail->object.size));
577 r = journal_file_allocate(f, p, size);
581 r = journal_file_move_to(f, type, false, p, size, &t);
588 o->object.type = type;
589 o->object.size = htole64(size);
591 f->header->tail_object_offset = htole64(p);
592 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
600 static int journal_file_setup_data_hash_table(JournalFile *f) {
607 /* We estimate that we need 1 hash table entry per 768 of
608 journal file and we want to make sure we never get beyond
609 75% fill level. Calculate the hash table size for the
610 maximum file size based on these metrics. */
612 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
613 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
614 s = DEFAULT_DATA_HASH_TABLE_SIZE;
616 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
618 r = journal_file_append_object(f,
619 OBJECT_DATA_HASH_TABLE,
620 offsetof(Object, hash_table.items) + s,
625 memzero(o->hash_table.items, s);
627 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
628 f->header->data_hash_table_size = htole64(s);
633 static int journal_file_setup_field_hash_table(JournalFile *f) {
640 /* We use a fixed size hash table for the fields as this
641 * number should grow very slowly only */
643 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
644 r = journal_file_append_object(f,
645 OBJECT_FIELD_HASH_TABLE,
646 offsetof(Object, hash_table.items) + s,
651 memzero(o->hash_table.items, s);
653 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
654 f->header->field_hash_table_size = htole64(s);
659 static int journal_file_map_data_hash_table(JournalFile *f) {
666 p = le64toh(f->header->data_hash_table_offset);
667 s = le64toh(f->header->data_hash_table_size);
669 r = journal_file_move_to(f,
670 OBJECT_DATA_HASH_TABLE,
677 f->data_hash_table = t;
681 static int journal_file_map_field_hash_table(JournalFile *f) {
688 p = le64toh(f->header->field_hash_table_offset);
689 s = le64toh(f->header->field_hash_table_size);
691 r = journal_file_move_to(f,
692 OBJECT_FIELD_HASH_TABLE,
699 f->field_hash_table = t;
703 static int journal_file_link_field(
716 if (o->object.type != OBJECT_FIELD)
719 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
723 /* This might alter the window we are looking at */
724 o->field.next_hash_offset = o->field.head_data_offset = 0;
727 p = le64toh(f->field_hash_table[h].tail_hash_offset);
729 f->field_hash_table[h].head_hash_offset = htole64(offset);
731 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
735 o->field.next_hash_offset = htole64(offset);
738 f->field_hash_table[h].tail_hash_offset = htole64(offset);
740 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
741 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
746 static int journal_file_link_data(
759 if (o->object.type != OBJECT_DATA)
762 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
766 /* This might alter the window we are looking at */
767 o->data.next_hash_offset = o->data.next_field_offset = 0;
768 o->data.entry_offset = o->data.entry_array_offset = 0;
769 o->data.n_entries = 0;
772 p = le64toh(f->data_hash_table[h].tail_hash_offset);
774 /* Only entry in the hash table is easy */
775 f->data_hash_table[h].head_hash_offset = htole64(offset);
777 /* Move back to the previous data object, to patch in
780 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
784 o->data.next_hash_offset = htole64(offset);
787 f->data_hash_table[h].tail_hash_offset = htole64(offset);
789 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
790 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
795 int journal_file_find_field_object_with_hash(
797 const void *field, uint64_t size, uint64_t hash,
798 Object **ret, uint64_t *offset) {
800 uint64_t p, osize, h, m;
804 assert(field && size > 0);
806 osize = offsetof(Object, field.payload) + size;
808 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
814 p = le64toh(f->field_hash_table[h].head_hash_offset);
819 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
823 if (le64toh(o->field.hash) == hash &&
824 le64toh(o->object.size) == osize &&
825 memcmp(o->field.payload, field, size) == 0) {
835 p = le64toh(o->field.next_hash_offset);
841 int journal_file_find_field_object(
843 const void *field, uint64_t size,
844 Object **ret, uint64_t *offset) {
849 assert(field && size > 0);
851 hash = hash64(field, size);
853 return journal_file_find_field_object_with_hash(f,
858 int journal_file_find_data_object_with_hash(
860 const void *data, uint64_t size, uint64_t hash,
861 Object **ret, uint64_t *offset) {
863 uint64_t p, osize, h, m;
867 assert(data || size == 0);
869 osize = offsetof(Object, data.payload) + size;
871 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
876 p = le64toh(f->data_hash_table[h].head_hash_offset);
881 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
885 if (le64toh(o->data.hash) != hash)
888 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
889 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
893 l = le64toh(o->object.size);
894 if (l <= offsetof(Object, data.payload))
897 l -= offsetof(Object, data.payload);
899 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
900 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
905 memcmp(f->compress_buffer, data, size) == 0) {
916 return -EPROTONOSUPPORT;
918 } else if (le64toh(o->object.size) == osize &&
919 memcmp(o->data.payload, data, size) == 0) {
931 p = le64toh(o->data.next_hash_offset);
937 int journal_file_find_data_object(
939 const void *data, uint64_t size,
940 Object **ret, uint64_t *offset) {
945 assert(data || size == 0);
947 hash = hash64(data, size);
949 return journal_file_find_data_object_with_hash(f,
954 static int journal_file_append_field(
956 const void *field, uint64_t size,
957 Object **ret, uint64_t *offset) {
965 assert(field && size > 0);
967 hash = hash64(field, size);
969 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
983 osize = offsetof(Object, field.payload) + size;
984 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
988 o->field.hash = htole64(hash);
989 memcpy(o->field.payload, field, size);
991 r = journal_file_link_field(f, o, p, hash);
995 /* The linking might have altered the window, so let's
996 * refresh our pointer */
997 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1002 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1016 static int journal_file_append_data(
1018 const void *data, uint64_t size,
1019 Object **ret, uint64_t *offset) {
1024 int r, compression = 0;
1028 assert(data || size == 0);
1030 hash = hash64(data, size);
1032 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1046 osize = offsetof(Object, data.payload) + size;
1047 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1051 o->data.hash = htole64(hash);
1053 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1054 if (f->compress_xz &&
1055 size >= COMPRESSION_SIZE_THRESHOLD) {
1058 compression = compress_blob(data, size, o->data.payload, &rsize);
1061 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1062 o->object.flags |= compression;
1064 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1065 size, rsize, object_compressed_to_string(compression));
1070 if (!compression && size > 0)
1071 memcpy(o->data.payload, data, size);
1073 r = journal_file_link_data(f, o, p, hash);
1077 /* The linking might have altered the window, so let's
1078 * refresh our pointer */
1079 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1086 eq = memchr(data, '=', size);
1087 if (eq && eq > data) {
1091 /* Create field object ... */
1092 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1096 /* ... and link it in. */
1097 o->data.next_field_offset = fo->field.head_data_offset;
1098 fo->field.head_data_offset = le64toh(p);
1102 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1116 uint64_t journal_file_entry_n_items(Object *o) {
1119 if (o->object.type != OBJECT_ENTRY)
1122 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1125 uint64_t journal_file_entry_array_n_items(Object *o) {
1128 if (o->object.type != OBJECT_ENTRY_ARRAY)
1131 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1134 uint64_t journal_file_hash_table_n_items(Object *o) {
1137 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1138 o->object.type != OBJECT_FIELD_HASH_TABLE)
1141 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1144 static int link_entry_into_array(JournalFile *f,
1149 uint64_t n = 0, ap = 0, q, i, a, hidx;
1157 a = le64toh(*first);
1158 i = hidx = le64toh(*idx);
1161 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1165 n = journal_file_entry_array_n_items(o);
1167 o->entry_array.items[i] = htole64(p);
1168 *idx = htole64(hidx + 1);
1174 a = le64toh(o->entry_array.next_entry_array_offset);
1185 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1186 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1192 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1197 o->entry_array.items[i] = htole64(p);
1200 *first = htole64(q);
1202 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1206 o->entry_array.next_entry_array_offset = htole64(q);
1209 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1210 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1212 *idx = htole64(hidx + 1);
1217 static int link_entry_into_array_plus_one(JournalFile *f,
1232 *extra = htole64(p);
1236 i = htole64(le64toh(*idx) - 1);
1237 r = link_entry_into_array(f, first, &i, p);
1242 *idx = htole64(le64toh(*idx) + 1);
1246 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1253 p = le64toh(o->entry.items[i].object_offset);
1257 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1261 return link_entry_into_array_plus_one(f,
1262 &o->data.entry_offset,
1263 &o->data.entry_array_offset,
1268 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1276 if (o->object.type != OBJECT_ENTRY)
1279 __sync_synchronize();
1281 /* Link up the entry itself */
1282 r = link_entry_into_array(f,
1283 &f->header->entry_array_offset,
1284 &f->header->n_entries,
1289 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1291 if (f->header->head_entry_realtime == 0)
1292 f->header->head_entry_realtime = o->entry.realtime;
1294 f->header->tail_entry_realtime = o->entry.realtime;
1295 f->header->tail_entry_monotonic = o->entry.monotonic;
1297 f->tail_entry_monotonic_valid = true;
1299 /* Link up the items */
1300 n = journal_file_entry_n_items(o);
1301 for (i = 0; i < n; i++) {
1302 r = journal_file_link_entry_item(f, o, offset, i);
1310 static int journal_file_append_entry_internal(
1312 const dual_timestamp *ts,
1314 const EntryItem items[], unsigned n_items,
1316 Object **ret, uint64_t *offset) {
1323 assert(items || n_items == 0);
1326 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1328 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1332 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1333 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1334 o->entry.realtime = htole64(ts->realtime);
1335 o->entry.monotonic = htole64(ts->monotonic);
1336 o->entry.xor_hash = htole64(xor_hash);
1337 o->entry.boot_id = f->header->boot_id;
1340 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1345 r = journal_file_link_entry(f, o, np);
1358 void journal_file_post_change(JournalFile *f) {
1361 /* inotify() does not receive IN_MODIFY events from file
1362 * accesses done via mmap(). After each access we hence
1363 * trigger IN_MODIFY by truncating the journal file to its
1364 * current size which triggers IN_MODIFY. */
1366 __sync_synchronize();
1368 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1369 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1372 static int entry_item_cmp(const void *_a, const void *_b) {
1373 const EntryItem *a = _a, *b = _b;
1375 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1377 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1382 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1386 uint64_t xor_hash = 0;
1387 struct dual_timestamp _ts;
1390 assert(iovec || n_iovec == 0);
1393 dual_timestamp_get(&_ts);
1397 if (f->tail_entry_monotonic_valid &&
1398 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1402 r = journal_file_maybe_append_tag(f, ts->realtime);
1407 /* alloca() can't take 0, hence let's allocate at least one */
1408 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1410 for (i = 0; i < n_iovec; i++) {
1414 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1418 xor_hash ^= le64toh(o->data.hash);
1419 items[i].object_offset = htole64(p);
1420 items[i].hash = o->data.hash;
1423 /* Order by the position on disk, in order to improve seek
1424 * times for rotating media. */
1425 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1427 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1429 /* If the memory mapping triggered a SIGBUS then we return an
1430 * IO error and ignore the error code passed down to us, since
1431 * it is very likely just an effect of a nullified replacement
1434 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1437 journal_file_post_change(f);
1442 typedef struct ChainCacheItem {
1443 uint64_t first; /* the array at the beginning of the chain */
1444 uint64_t array; /* the cached array */
1445 uint64_t begin; /* the first item in the cached array */
1446 uint64_t total; /* the total number of items in all arrays before this one in the chain */
1447 uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
1450 static void chain_cache_put(
1457 uint64_t last_index) {
1460 /* If the chain item to cache for this chain is the
1461 * first one it's not worth caching anything */
1465 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1466 ci = ordered_hashmap_steal_first(h);
1469 ci = new(ChainCacheItem, 1);
1476 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1481 assert(ci->first == first);
1486 ci->last_index = last_index;
1489 static int generic_array_get(
1493 Object **ret, uint64_t *offset) {
1496 uint64_t p = 0, a, t = 0;
1504 /* Try the chain cache first */
1505 ci = ordered_hashmap_get(f->chain_cache, &first);
1506 if (ci && i > ci->total) {
1515 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1519 k = journal_file_entry_array_n_items(o);
1521 p = le64toh(o->entry_array.items[i]);
1527 a = le64toh(o->entry_array.next_entry_array_offset);
1533 /* Let's cache this item for the next invocation */
1534 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1536 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1549 static int generic_array_get_plus_one(
1554 Object **ret, uint64_t *offset) {
1563 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1576 return generic_array_get(f, first, i-1, ret, offset);
1585 static int generic_array_bisect(
1590 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1591 direction_t direction,
1596 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1597 bool subtract_one = false;
1598 Object *o, *array = NULL;
1603 assert(test_object);
1605 /* Start with the first array in the chain */
1608 ci = ordered_hashmap_get(f->chain_cache, &first);
1609 if (ci && n > ci->total) {
1610 /* Ah, we have iterated this bisection array chain
1611 * previously! Let's see if we can skip ahead in the
1612 * chain, as far as the last time. But we can't jump
1613 * backwards in the chain, so let's check that
1616 r = test_object(f, ci->begin, needle);
1620 if (r == TEST_LEFT) {
1621 /* OK, what we are looking for is right of the
1622 * begin of this EntryArray, so let's jump
1623 * straight to previously cached array in the
1629 last_index = ci->last_index;
1634 uint64_t left, right, k, lp;
1636 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1640 k = journal_file_entry_array_n_items(array);
1646 lp = p = le64toh(array->entry_array.items[i]);
1650 r = test_object(f, p, needle);
1654 if (r == TEST_FOUND)
1655 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1657 if (r == TEST_RIGHT) {
1661 if (last_index != (uint64_t) -1) {
1662 assert(last_index <= right);
1664 /* If we cached the last index we
1665 * looked at, let's try to not to jump
1666 * too wildly around and see if we can
1667 * limit the range to look at early to
1668 * the immediate neighbors of the last
1669 * index we looked at. */
1671 if (last_index > 0) {
1672 uint64_t x = last_index - 1;
1674 p = le64toh(array->entry_array.items[x]);
1678 r = test_object(f, p, needle);
1682 if (r == TEST_FOUND)
1683 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1685 if (r == TEST_RIGHT)
1691 if (last_index < right) {
1692 uint64_t y = last_index + 1;
1694 p = le64toh(array->entry_array.items[y]);
1698 r = test_object(f, p, needle);
1702 if (r == TEST_FOUND)
1703 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1705 if (r == TEST_RIGHT)
1713 if (left == right) {
1714 if (direction == DIRECTION_UP)
1715 subtract_one = true;
1721 assert(left < right);
1722 i = (left + right) / 2;
1724 p = le64toh(array->entry_array.items[i]);
1728 r = test_object(f, p, needle);
1732 if (r == TEST_FOUND)
1733 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1735 if (r == TEST_RIGHT)
1743 if (direction == DIRECTION_UP) {
1745 subtract_one = true;
1756 last_index = (uint64_t) -1;
1757 a = le64toh(array->entry_array.next_entry_array_offset);
1763 if (subtract_one && t == 0 && i == 0)
1766 /* Let's cache this item for the next invocation */
1767 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1769 if (subtract_one && i == 0)
1771 else if (subtract_one)
1772 p = le64toh(array->entry_array.items[i-1]);
1774 p = le64toh(array->entry_array.items[i]);
1776 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1787 *idx = t + i + (subtract_one ? -1 : 0);
/* Bisect a chained entry array as generic_array_bisect() does, but first
 * test the standalone 'extra' entry before descending into the array
 * proper. Used for data objects, which keep one entry offset inline plus
 * an entry-array chain.
 * NOTE(review): this extract is elided (gaps in the in-content numbering);
 * code below is kept byte-identical. */
1792 static int generic_array_bisect_plus_one(
1798 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1799 direction_t direction,
1805 bool step_back = false;
1809 assert(test_object);
1814 /* This bisects the array in object 'first', but first checks
1816 r = test_object(f, extra, needle);
1820 if (r == TEST_FOUND)
/* TEST_FOUND is normalized to a direction-dependent side so the bisection
 * lands on the first (DOWN) or last (UP) match. */
1821 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1823 /* if we are looking with DIRECTION_UP then we need to first
1824 see if in the actual array there is a matching entry, and
1825 return the last one of that. But if there isn't any we need
1826 to return this one. Hence remember this, and return it
1829 step_back = direction == DIRECTION_UP;
1831 if (r == TEST_RIGHT) {
1832 if (direction == DIRECTION_DOWN)
/* Bisect the remaining n-1 array entries (the extra entry accounts for one). */
1838 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1840 if (r == 0 && step_back)
/* Fall back to returning the 'extra' entry itself. */
1849 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
/* Bisection comparator: orders candidate entry offset 'p' against the
 * wanted offset 'needle'. (Elided extract; code kept byte-identical.) */
1865 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1871 else if (p < needle)
/* Bisection comparator: loads the entry at offset 'p' and orders its
 * sequence number against 'needle'. (Elided extract.) */
1877 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1884 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1888 if (le64toh(o->entry.seqnum) == needle)
1890 else if (le64toh(o->entry.seqnum) < needle)
/* Seek to the entry with the given sequence number by bisecting the
 * file's global entry array. (Elided extract.) */
1896 int journal_file_move_to_entry_by_seqnum(
1899 direction_t direction,
1903 return generic_array_bisect(f,
1904 le64toh(f->header->entry_array_offset),
1905 le64toh(f->header->n_entries),
/* Bisection comparator: orders the realtime timestamp of the entry at
 * offset 'p' against 'needle'. (Elided extract.) */
1912 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1919 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1923 if (le64toh(o->entry.realtime) == needle)
1925 else if (le64toh(o->entry.realtime) < needle)
/* Seek to the entry closest to the given CLOCK_REALTIME timestamp by
 * bisecting the file's global entry array. (Elided extract.) */
1931 int journal_file_move_to_entry_by_realtime(
1934 direction_t direction,
1938 return generic_array_bisect(f,
1939 le64toh(f->header->entry_array_offset),
1940 le64toh(f->header->n_entries),
1942 test_object_realtime,
/* Bisection comparator: orders the monotonic timestamp of the entry at
 * offset 'p' against 'needle'. Only meaningful within one boot ID, hence
 * callers pair it with find_data_object_by_boot_id(). (Elided extract.) */
1947 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1954 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1958 if (le64toh(o->entry.monotonic) == needle)
1960 else if (le64toh(o->entry.monotonic) < needle)
/* Look up the data object for "_BOOT_ID=<id>". The buffer holds the
 * 9-char prefix, 32 hex chars from sd_id128_to_string() (written at
 * t + 9, right after the prefix) and the NUL; sizeof(t)-1 excludes the
 * NUL from the key length. (Elided extract.) */
1966 static inline int find_data_object_by_boot_id(
1971 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1973 sd_id128_to_string(boot_id, t + 9);
1974 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
/* Seek by monotonic time within one boot: find the _BOOT_ID= data object
 * first, then bisect that object's entry list (inline entry + array chain)
 * with the monotonic comparator. (Elided extract.) */
1977 int journal_file_move_to_entry_by_monotonic(
1981 direction_t direction,
1990 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1996 return generic_array_bisect_plus_one(f,
1997 le64toh(o->data.entry_offset),
1998 le64toh(o->data.entry_array_offset),
1999 le64toh(o->data.n_entries),
2001 test_object_monotonic,
/* Reset the cached read-cursor state of 'f' back to the head of the file,
 * zeroing all per-entry fields that journal_file_save_location() fills in. */
2006 void journal_file_reset_location(JournalFile *f) {
2007 f->location_type = LOCATION_HEAD;
2008 f->current_offset = 0;
2009 f->current_seqnum = 0;
2010 f->current_realtime = 0;
2011 f->current_monotonic = 0;
2012 zero(f->current_boot_id);
2013 f->current_xor_hash = 0;
/* Cache the entry 'o' at file offset 'offset' as the current cursor
 * position, converting all on-disk little-endian fields to host order so
 * journal_file_compare_locations() can compare them directly. */
2016 void journal_file_save_location(JournalFile *f, direction_t direction, Object *o, uint64_t offset) {
2017 f->last_direction = direction;
2018 f->location_type = LOCATION_SEEK;
2019 f->current_offset = offset;
2020 f->current_seqnum = le64toh(o->entry.seqnum);
2021 f->current_realtime = le64toh(o->entry.realtime);
2022 f->current_monotonic = le64toh(o->entry.monotonic);
2023 f->current_boot_id = o->entry.boot_id;
2024 f->current_xor_hash = le64toh(o->entry.xor_hash);
/* Three-way ordering of the saved cursor positions of two journal files,
 * for interleaved reading. Both files must have a saved LOCATION_SEEK
 * position. Order of criteria: identical-entry short-circuit, then seqnum
 * (same seqnum source only), monotonic time (same boot only), realtime,
 * and finally xor_hash as content tiebreaker. (Elided extract: the actual
 * return statements between comparisons are not visible here.) */
2027 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2030 assert(af->location_type == LOCATION_SEEK);
2031 assert(bf->location_type == LOCATION_SEEK);
2033 /* If contents and timestamps match, these entries are
2034 * identical, even if the seqnum does not match */
2035 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2036 af->current_monotonic == bf->current_monotonic &&
2037 af->current_realtime == bf->current_realtime &&
2038 af->current_xor_hash == bf->current_xor_hash)
2041 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2043 /* If this is from the same seqnum source, compare
2045 if (af->current_seqnum < bf->current_seqnum)
2047 if (af->current_seqnum > bf->current_seqnum)
2050 /* Wow! This is weird, different data but the same
2051 * seqnums? Something is borked, but let's make the
2052 * best of it and compare by time. */
2055 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2057 /* If the boot id matches, compare monotonic time */
2058 if (af->current_monotonic < bf->current_monotonic)
2060 if (af->current_monotonic > bf->current_monotonic)
2064 /* Otherwise, compare UTC time */
2065 if (af->current_realtime < bf->current_realtime)
2067 if (af->current_realtime > bf->current_realtime)
2070 /* Finally, compare by contents */
2071 if (af->current_xor_hash < bf->current_xor_hash)
2073 if (af->current_xor_hash > bf->current_xor_hash)
/* Advance the iteration over the file's global entry array by one entry in
 * the given direction, verifying that the resulting offset is strictly
 * monotonic relative to the previous position 'p' (otherwise the entry
 * array is corrupt). (Elided extract; code kept byte-identical.) */
2079 int journal_file_next_entry(
2082 direction_t direction,
2083 Object **ret, uint64_t *offset) {
2090 n = le64toh(f->header->n_entries);
/* Start at the first entry going down, the last entry going up. */
2095 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2097 r = generic_array_bisect(f,
2098 le64toh(f->header->entry_array_offset),
2099 le64toh(f->header->n_entries),
2108 if (direction == DIRECTION_DOWN) {
2121 /* And jump to it */
2122 r = generic_array_get(f,
2123 le64toh(f->header->entry_array_offset),
/* Corruption check: the next offset must be past 'p' in the direction of travel. */
2130 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2131 log_debug("%s: entry array corrupted at entry %"PRIu64,
/* Like journal_file_next_entry(), but iterates only over the entries that
 * reference the data object at 'data_offset' (its inline entry plus its
 * entry-array chain). 'o'/'p' give the current position, or p == 0 to
 * start from the boundary. (Elided extract; code kept byte-identical.) */
2142 int journal_file_next_entry_for_data(
2144 Object *o, uint64_t p,
2145 uint64_t data_offset,
2146 direction_t direction,
2147 Object **ret, uint64_t *offset) {
2154 assert(p > 0 || !o);
2156 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2160 n = le64toh(d->data.n_entries);
2165 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2167 if (o->object.type != OBJECT_ENTRY)
2170 r = generic_array_bisect_plus_one(f,
2171 le64toh(d->data.entry_offset),
2172 le64toh(d->data.entry_array_offset),
2173 le64toh(d->data.n_entries),
2183 if (direction == DIRECTION_DOWN) {
2197 return generic_array_get_plus_one(f,
2198 le64toh(d->data.entry_offset),
2199 le64toh(d->data.entry_array_offset),
/* Seek within the entry list of one data object to the entry nearest the
 * given file offset, in the given direction. (Elided extract.) */
2204 int journal_file_move_to_entry_by_offset_for_data(
2206 uint64_t data_offset,
2208 direction_t direction,
2209 Object **ret, uint64_t *offset) {
2216 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2220 return generic_array_bisect_plus_one(f,
2221 le64toh(d->data.entry_offset),
2222 le64toh(d->data.entry_array_offset),
2223 le64toh(d->data.n_entries),
/* Seek within the entry list of one data object by monotonic time of a
 * given boot: first bisect the _BOOT_ID= data object's entries by time,
 * then alternate bisections between the two entry lists until an entry is
 * found that appears in both (i.e. matches the data object AND the time
 * constraint). (Elided extract: loop structure between the bisections is
 * not visible here; code kept byte-identical.) */
2230 int journal_file_move_to_entry_by_monotonic_for_data(
2232 uint64_t data_offset,
2235 direction_t direction,
2236 Object **ret, uint64_t *offset) {
2244 /* First, seek by time */
2245 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2251 r = generic_array_bisect_plus_one(f,
2252 le64toh(o->data.entry_offset),
2253 le64toh(o->data.entry_array_offset),
2254 le64toh(o->data.n_entries),
2256 test_object_monotonic,
2262 /* And now, continue seeking until we find an entry that
2263 * exists in both bisection arrays */
2269 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2273 r = generic_array_bisect_plus_one(f,
2274 le64toh(d->data.entry_offset),
2275 le64toh(d->data.entry_array_offset),
2276 le64toh(d->data.n_entries),
/* Re-resolve the boot-id data object ('b' cached above) after the mmap
 * window may have moved, then bisect its entry list again. */
2284 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2288 r = generic_array_bisect_plus_one(f,
2289 le64toh(o->data.entry_offset),
2290 le64toh(o->data.entry_array_offset),
2291 le64toh(o->data.n_entries),
/* Seek within the entry list of one data object by sequence number.
 * (Elided extract.) */
2313 int journal_file_move_to_entry_by_seqnum_for_data(
2315 uint64_t data_offset,
2317 direction_t direction,
2318 Object **ret, uint64_t *offset) {
2325 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2329 return generic_array_bisect_plus_one(f,
2330 le64toh(d->data.entry_offset),
2331 le64toh(d->data.entry_array_offset),
2332 le64toh(d->data.n_entries),
/* Seek within the entry list of one data object by realtime timestamp.
 * (Elided extract.) */
2339 int journal_file_move_to_entry_by_realtime_for_data(
2341 uint64_t data_offset,
2343 direction_t direction,
2344 Object **ret, uint64_t *offset) {
2351 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2355 return generic_array_bisect_plus_one(f,
2356 le64toh(d->data.entry_offset),
2357 le64toh(d->data.entry_array_offset),
2358 le64toh(d->data.n_entries),
2360 test_object_realtime,
/* Debug dump: print the header, then walk every object from header_size to
 * tail_object_offset, printing one line per object type plus its
 * compression flags. Stops at the tail object; a failed object load falls
 * through to the "File corrupt" error. (Elided extract; code kept
 * byte-identical.) */
2365 void journal_file_dump(JournalFile *f) {
2372 journal_file_print_header(f);
/* Objects start immediately after the header. */
2374 p = le64toh(f->header->header_size);
2376 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2380 switch (o->object.type) {
2383 printf("Type: OBJECT_UNUSED\n");
2387 printf("Type: OBJECT_DATA\n");
2391 printf("Type: OBJECT_FIELD\n");
2395 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2396 le64toh(o->entry.seqnum),
2397 le64toh(o->entry.monotonic),
2398 le64toh(o->entry.realtime));
2401 case OBJECT_FIELD_HASH_TABLE:
2402 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2405 case OBJECT_DATA_HASH_TABLE:
2406 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2409 case OBJECT_ENTRY_ARRAY:
2410 printf("Type: OBJECT_ENTRY_ARRAY\n");
2414 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2415 le64toh(o->tag.seqnum),
2416 le64toh(o->tag.epoch));
2420 printf("Type: unknown (%i)\n", o->object.type);
2424 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2425 printf("Flags: %s\n",
2426 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2428 if (p == le64toh(f->header->tail_object_offset))
/* Objects are 64-bit aligned on disk; step to the next one. */
2431 p = p + ALIGN64(le64toh(o->object.size));
2436 log_error("File corrupt");
/* Wrapper around format_timestamp() that tolerates failure — presumably
 * returns a placeholder when formatting fails (the fallback line is elided
 * from this extract; confirm against the full source). */
2439 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2442 x = format_timestamp(buf, l, t);
/* Print a human-readable summary of the journal file header to stdout:
 * IDs, state, flags, sizes, seqnum/timestamp ranges and object counts,
 * plus optional counters that only exist in newer header versions (guarded
 * by JOURNAL_HEADER_CONTAINS) and on-disk usage from fstat().
 * (Elided extract; code kept byte-identical.) */
2448 void journal_file_print_header(JournalFile *f) {
/* a-d: 32 hex chars + NUL for sd_id128_to_string(). */
2449 char a[33], b[33], c[33], d[33];
2450 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2452 char bytes[FORMAT_BYTES_MAX];
2456 printf("File Path: %s\n"
2460 "Sequential Number ID: %s\n"
2462 "Compatible Flags:%s%s\n"
2463 "Incompatible Flags:%s%s%s\n"
2464 "Header size: %"PRIu64"\n"
2465 "Arena size: %"PRIu64"\n"
2466 "Data Hash Table Size: %"PRIu64"\n"
2467 "Field Hash Table Size: %"PRIu64"\n"
2468 "Rotate Suggested: %s\n"
2469 "Head Sequential Number: %"PRIu64"\n"
2470 "Tail Sequential Number: %"PRIu64"\n"
2471 "Head Realtime Timestamp: %s\n"
2472 "Tail Realtime Timestamp: %s\n"
2473 "Tail Monotonic Timestamp: %s\n"
2474 "Objects: %"PRIu64"\n"
2475 "Entry Objects: %"PRIu64"\n",
2477 sd_id128_to_string(f->header->file_id, a),
2478 sd_id128_to_string(f->header->machine_id, b),
2479 sd_id128_to_string(f->header->boot_id, c),
2480 sd_id128_to_string(f->header->seqnum_id, d),
2481 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2482 f->header->state == STATE_ONLINE ? "ONLINE" :
2483 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2484 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
/* " ???" marks flag bits this build does not know about. */
2485 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2486 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2487 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2488 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2489 le64toh(f->header->header_size),
2490 le64toh(f->header->arena_size),
/* Hash table sizes are stored in bytes; report them in items. */
2491 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2492 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2493 yes_no(journal_file_rotate_suggested(f, 0)),
2494 le64toh(f->header->head_entry_seqnum),
2495 le64toh(f->header->tail_entry_seqnum),
2496 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2497 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2498 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2499 le64toh(f->header->n_objects),
2500 le64toh(f->header->n_entries));
2502 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2503 printf("Data Objects: %"PRIu64"\n"
2504 "Data Hash Table Fill: %.1f%%\n",
2505 le64toh(f->header->n_data),
2506 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2508 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2509 printf("Field Objects: %"PRIu64"\n"
2510 "Field Hash Table Fill: %.1f%%\n",
2511 le64toh(f->header->n_fields),
2512 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2514 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2515 printf("Tag Objects: %"PRIu64"\n",
2516 le64toh(f->header->n_tags));
2517 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2518 printf("Entry Array Objects: %"PRIu64"\n",
2519 le64toh(f->header->n_entry_arrays));
/* st_blocks is in 512-byte units regardless of the file system block size. */
2521 if (fstat(f->fd, &st) >= 0)
2522 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
/* Open (or create) a journal file and fully initialize the JournalFile
 * object: validate flags and file name, allocate state, open the fd, and
 * — for a fresh writable file — disable COW, stamp the creation time,
 * write the initial header. Existing files get their header verified and
 * (if writable) FSS state loaded. Finally maps the header and hash tables.
 * On any failure the partially constructed object is torn down via
 * journal_file_close(). (Elided extract: error-path gotos and some
 * assignments are not visible; code kept byte-identical.) */
2525 int journal_file_open(
2531 JournalMetrics *metrics,
2532 MMapCache *mmap_cache,
2533 JournalFile *template,
2534 JournalFile **ret) {
2536 bool newly_created = false;
/* Only plain read-only or read-write opens are supported. */
2544 if ((flags & O_ACCMODE) != O_RDONLY &&
2545 (flags & O_ACCMODE) != O_RDWR)
/* ".journal~" is the suffix used for corrupted/rotated-away files. */
2548 if (!endswith(fname, ".journal") &&
2549 !endswith(fname, ".journal~"))
2552 f = new0(JournalFile, 1);
2560 f->prot = prot_from_flags(flags);
2561 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2562 #if defined(HAVE_LZ4)
2563 f->compress_lz4 = compress;
2564 #elif defined(HAVE_XZ)
2565 f->compress_xz = compress;
/* Share the caller's mmap cache if given, otherwise create our own. */
2572 f->mmap = mmap_cache_ref(mmap_cache);
2574 f->mmap = mmap_cache_new();
2581 f->path = strdup(fname);
2587 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2588 if (!f->chain_cache) {
2593 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2599 r = journal_file_fstat(f);
/* Zero-sized writable file: we are creating it from scratch. */
2603 if (f->last_stat.st_size == 0 && f->writable) {
2605 /* Before we write anything, turn off COW logic. Given
2606 * our write pattern that is quite unfriendly to COW
2607 * file systems this should greatly improve
2608 * performance on COW file systems, such as btrfs, at
2609 * the expense of data integrity features (which
2610 * shouldn't be too bad, given that we do our own
2612 r = chattr_fd(f->fd, true, FS_NOCOW_FL);
/* NOTE(review): chattr_fd() returns a negative errno-style code; logging
 * 'errno' here may report a stale value — consider log_warning_errno(r, ...).
 * Left unchanged because the surrounding elided lines cannot be verified. */
2614 log_warning_errno(errno, "Failed to set file attributes: %m");
2616 /* Let's attach the creation time to the journal file,
2617 * so that the vacuuming code knows the age of this
2618 * file even if the file might end up corrupted one
2619 * day... Ideally we'd just use the creation time many
2620 * file systems maintain for each file, but there is
2621 * currently no usable API to query this, hence let's
2622 * emulate this via extended attributes. If extended
2623 * attributes are not supported we'll just skip this,
2624 * and rely solely on mtime/atime/ctime of the file. */
2626 fd_setcrtime(f->fd, 0);
2629 /* Try to load the FSPRG state, and if we can't, then
2630 * just don't do sealing */
2632 r = journal_file_fss_load(f);
2638 r = journal_file_init_header(f, template);
2642 r = journal_file_fstat(f);
2646 newly_created = true;
/* A file smaller than the minimum header cannot be a valid journal. */
2649 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2654 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2662 if (!newly_created) {
2663 r = journal_file_verify_header(f);
2669 if (!newly_created && f->writable) {
2670 r = journal_file_fss_load(f);
/* Metrics: caller-supplied (with defaults filled in), else inherited from
 * the template file. */
2678 journal_default_metrics(metrics, f->fd);
2679 f->metrics = *metrics;
2680 } else if (template)
2681 f->metrics = template->metrics;
2683 r = journal_file_refresh_header(f);
2689 r = journal_file_hmac_setup(f);
2694 if (newly_created) {
2695 r = journal_file_setup_field_hash_table(f);
2699 r = journal_file_setup_data_hash_table(f);
2704 r = journal_file_append_first_tag(f);
2710 r = journal_file_map_field_hash_table(f);
2714 r = journal_file_map_data_hash_table(f);
/* A SIGBUS during setup means the file was truncated under us. */
2718 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2727 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2730 journal_file_close(f);
/* Archive the current journal file and replace *f with a freshly opened
 * one at the original path: rename to
 * "<name>@<seqnum_id>-<head_seqnum>-<head_realtime>.journal", mark the old
 * header ARCHIVED, schedule a btrfs defrag on close, then reopen with the
 * old file as template. (Elided extract; code kept byte-identical.) */
2735 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2736 _cleanup_free_ char *p = NULL;
2738 JournalFile *old_file, *new_file = NULL;
2746 if (!old_file->writable)
/* Only plain ".journal" files can be archived (not ".journal~"). */
2749 if (!endswith(old_file->path, ".journal"))
2752 l = strlen(old_file->path);
/* "l - 8" strips the ".journal" suffix before appending the archive tag. */
2753 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2754 (int) l - 8, old_file->path,
2755 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2756 le64toh((*f)->header->head_entry_seqnum),
2757 le64toh((*f)->header->head_entry_realtime));
2761 /* Try to rename the file to the archived version. If the file
2762 * already was deleted, we'll get ENOENT, let's ignore that
2764 r = rename(old_file->path, p);
2765 if (r < 0 && errno != ENOENT)
2768 old_file->header->state = STATE_ARCHIVED;
2770 /* Currently, btrfs is not very good with out write patterns
2771 * and fragments heavily. Let's defrag our journal files when
2772 * we archive them */
2773 old_file->defrag_on_close = true;
2775 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2776 journal_file_close(old_file);
/* Open a journal file, and if it turns out corrupted/truncated/incompatible
 * (the specific error codes listed below), rename it out of the way with a
 * ".journal~" suffix and retry the open exactly once — but only when we are
 * allowed to write and create. (Elided extract; code kept byte-identical.) */
2782 int journal_file_open_reliably(
2788 JournalMetrics *metrics,
2789 MMapCache *mmap_cache,
2790 JournalFile *template,
2791 JournalFile **ret) {
2795 _cleanup_free_ char *p = NULL;
2797 r = journal_file_open(fname, flags, mode, compress, seal,
2798 metrics, mmap_cache, template, ret);
/* Any error NOT in this recoverable set is returned as-is. */
2799 if (r != -EBADMSG && /* corrupted */
2800 r != -ENODATA && /* truncated */
2801 r != -EHOSTDOWN && /* other machine */
2802 r != -EPROTONOSUPPORT && /* incompatible feature */
2803 r != -EBUSY && /* unclean shutdown */
2804 r != -ESHUTDOWN && /* already archived */
2805 r != -EIO && /* IO error, including SIGBUS on mmap */
2806 r != -EIDRM /* File has been deleted */)
2809 if ((flags & O_ACCMODE) == O_RDONLY)
2812 if (!(flags & O_CREAT))
2815 if (!endswith(fname, ".journal"))
2818 /* The file is corrupted. Rotate it away and try it again (but only once) */
2821 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2823 (unsigned long long) now(CLOCK_REALTIME),
2827 r = rename(fname, p);
2831 /* btrfs doesn't cope well with our write pattern and
2832 * fragments heavily. Let's defrag all files we rotate */
2834 (void) chattr_path(p, false, FS_NOCOW_FL);
2835 (void) btrfs_defrag(p);
2837 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2839 return journal_file_open(fname, flags, mode, compress, seal,
2840 metrics, mmap_cache, template, ret);
/* Copy the entry object 'o' at offset 'p' from journal 'from' into journal
 * 'to': for each entry item, load the referenced data object, verify its
 * hash, decompress if needed, append the payload to 'to' and rebuild the
 * item list (recomputing xor_hash), then append the assembled entry.
 * (Elided extract: error-path returns are not visible; code kept
 * byte-identical.) */
2843 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2845 uint64_t q, xor_hash = 0;
2858 ts.monotonic = le64toh(o->entry.monotonic);
2859 ts.realtime = le64toh(o->entry.realtime);
2861 n = journal_file_entry_n_items(o);
2862 /* alloca() can't take 0, hence let's allocate at least one */
2863 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2865 for (i = 0; i < n; i++) {
2872 q = le64toh(o->entry.items[i].object_offset);
2873 le_hash = o->entry.items[i].hash;
/* NB: 'o' is repointed at the data object here; it is re-resolved as the
 * entry again at the bottom of the loop (line 2914). */
2875 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
/* Both sides little-endian, so compare without byte-swapping. */
2879 if (le_hash != o->data.hash)
2882 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2885 /* We hit the limit on 32bit machines */
2886 if ((uint64_t) t != l)
2889 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2890 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2893 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2894 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2898 data = from->compress_buffer;
/* Compressed payload but no decompressor compiled in. */
2901 return -EPROTONOSUPPORT;
2904 data = o->data.payload;
2906 r = journal_file_append_data(to, data, l, &u, &h);
2910 xor_hash ^= le64toh(u->data.hash);
2911 items[i].object_offset = htole64(h);
2912 items[i].hash = u->data.hash;
2914 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2919 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
/* Truncation of the target file under us surfaces as SIGBUS via the cache. */
2921 if (mmap_cache_got_sigbus(to->mmap, to->fd))
/* Fill in any unset ((uint64_t) -1) fields of 'm' with defaults derived
 * from the size of the file system containing 'fd', clamp everything to
 * the bounds defined at the top of this file, page-align the results and
 * enforce the invariants max_use >= 2*max_size >= 2*min_size. Logs the
 * final values at debug level. (Elided extract; code kept byte-identical.) */
2927 void journal_default_metrics(JournalMetrics *m, int fd) {
2928 uint64_t fs_size = 0;
2930 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2935 if (fstatvfs(fd, &ss) >= 0)
2936 fs_size = ss.f_frsize * ss.f_blocks;
/* (uint64_t) -1 marks "not configured by the caller". */
2938 if (m->max_use == (uint64_t) -1) {
2941 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2943 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2944 m->max_use = DEFAULT_MAX_USE_UPPER;
2946 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2947 m->max_use = DEFAULT_MAX_USE_LOWER;
/* fs size unknown: fall back to the lower bound. */
2949 m->max_use = DEFAULT_MAX_USE_LOWER;
2951 m->max_use = PAGE_ALIGN(m->max_use);
/* Need room for at least two files of the minimum size (rotation). */
2953 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2954 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2957 if (m->max_size == (uint64_t) -1) {
2958 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2960 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2961 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2963 m->max_size = PAGE_ALIGN(m->max_size);
2965 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2966 m->max_size = JOURNAL_FILE_SIZE_MIN;
2968 if (m->max_size*2 > m->max_use)
2969 m->max_use = m->max_size*2;
2971 if (m->min_size == (uint64_t) -1)
2972 m->min_size = JOURNAL_FILE_SIZE_MIN;
2974 m->min_size = PAGE_ALIGN(m->min_size);
2976 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2977 m->min_size = JOURNAL_FILE_SIZE_MIN;
2979 if (m->min_size > m->max_size)
2980 m->max_size = m->min_size;
2983 if (m->keep_free == (uint64_t) -1) {
2986 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2988 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2989 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2992 m->keep_free = DEFAULT_KEEP_FREE;
2995 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2996 format_bytes(a, sizeof(a), m->max_use),
2997 format_bytes(b, sizeof(b), m->max_size),
2998 format_bytes(c, sizeof(c), m->min_size),
2999 format_bytes(d, sizeof(d), m->keep_free));
/* Report the realtime timestamp range covered by this file, straight from
 * the header's head/tail fields; a zero field means "no such entry yet"
 * (the early-return lines are elided from this extract). */
3002 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
3007 if (f->header->head_entry_realtime == 0)
3010 *from = le64toh(f->header->head_entry_realtime);
3014 if (f->header->tail_entry_realtime == 0)
3017 *to = le64toh(f->header->tail_entry_realtime);
/* Report the monotonic timestamp range this file covers for the given boot
 * ID: 'from' comes from the boot's first entry (data.entry_offset), 'to'
 * from its last entry (index n_entries-1 in the boot's entry list).
 * (Elided extract; code kept byte-identical.) */
3023 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3031 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3035 if (le64toh(o->data.n_entries) <= 0)
3039 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3043 *from = le64toh(o->entry.monotonic);
/* Re-resolve the data object (cached offset 'p') before walking to the tail. */
3047 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3051 r = generic_array_get_plus_one(f,
3052 le64toh(o->data.entry_offset),
3053 le64toh(o->data.entry_array_offset),
3054 le64toh(o->data.n_entries)-1,
3059 *to = le64toh(o->entry.monotonic);
3065 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3068 /* If we gained new header fields we gained new features,
3069 * hence suggest a rotation */
3070 if (le64toh(f->header->header_size) < sizeof(Header)) {
3071 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3075 /* Let's check if the hash tables grew over a certain fill
3076 * level (75%, borrowing this value from Java's hash table
3077 * implementation), and if so suggest a rotation. To calculate
3078 * the fill level we need the n_data field, which only exists
3079 * in newer versions. */
3081 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3082 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3083 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3085 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3086 le64toh(f->header->n_data),
3087 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3088 (unsigned long long) f->last_stat.st_size,
3089 f->last_stat.st_size / le64toh(f->header->n_data));
3093 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3094 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3095 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3097 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3098 le64toh(f->header->n_fields),
3099 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3103 /* Are the data objects properly indexed by field objects? */
3104 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3105 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3106 le64toh(f->header->n_data) > 0 &&
3107 le64toh(f->header->n_fields) == 0)
3110 if (max_file_usec > 0) {
3113 h = le64toh(f->header->head_entry_realtime);
3114 t = now(CLOCK_REALTIME);
3116 if (h > 0 && t > h + max_file_usec)