1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
30 #include "btrfs-util.h"
31 #include "journal-def.h"
32 #include "journal-file.h"
33 #include "journal-authenticate.h"
/* Default sizes, in bytes, of the data and field hash tables (2047 and 333 HashItem slots) */
38 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
39 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
/* Data payloads of at least this size are candidates for compression */
41 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
43 /* This is the minimum journal file size */
44 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
46 /* These are the lower and upper bounds if we deduce the max_use value
47 * from the file system size */
48 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
49 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
51 /* This is the upper bound if we deduce max_size from max_use */
52 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
54 /* This is the upper bound if we deduce the keep_free value from the
56 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
58 /* This is the keep_free value when we can't determine the system
60 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MiB */
62 /* n_data was the first entry we added after the initial file format design */
63 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
65 /* How many entries to keep in the entry array chain cache at max */
66 #define CHAIN_CACHE_MAX 20
68 /* How much to increase the journal file size at once each time we allocate something new. */
69 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8 MiB */
71 /* Reread fstat() of the file for detecting deletions at least this often */
72 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
74 /* The mmap context to use for the header: we pick the one right above the last defined type */
75 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
/* Switches the journal header to STATE_ONLINE. Before the new state is written the
 * function tests that f->fd is non-negative and f->header is set, consults
 * mmap_cache_got_sigbus() for this fd and switches on the current header state.
 * NOTE(review): the case labels and return statements are not visible in this
 * extract, so the handling of each individual state cannot be confirmed from here. */
77 static int journal_file_set_online(JournalFile *f) {
83 if (!(f->fd >= 0 && f->header))
86 if (mmap_cache_got_sigbus(f->mmap, f->fd))
89 switch(f->header->state) {
94 f->header->state = STATE_ONLINE;
/* Switches the journal header to STATE_OFFLINE. The function tests that f->fd is
 * non-negative and f->header is set, compares the current state with STATE_ONLINE,
 * and consults mmap_cache_got_sigbus() both before and after the state is written.
 * NOTE(review): the statements guarded by these checks are not visible in this extract. */
103 int journal_file_set_offline(JournalFile *f) {
109 if (!(f->fd >= 0 && f->header))
112 if (f->header->state != STATE_ONLINE)
117 if (mmap_cache_got_sigbus(f->mmap, f->fd))
120 f->header->state = STATE_OFFLINE;
122 if (mmap_cache_got_sigbus(f->mmap, f->fd))
/* Releases what a JournalFile holds. In order of appearance: appends a final tag when
 * sealing is enabled on a writable file, marks the file offline, drops the fd from the
 * mmap cache, optionally defragments it with btrfs_defrag_fd(), and then releases the
 * mmap cache reference, the chain cache, the compression buffer, the fss_file mapping
 * or fsprg_state buffer, and the HMAC handle.
 * NOTE(review): several conditionals and the matching #endif lines are not visible in
 * this extract. */
130 void journal_file_close(JournalFile *f) {
134 /* Write the final tag */
135 if (f->seal && f->writable)
136 journal_file_append_tag(f);
/* The results of journal_file_append_tag() above and journal_file_set_offline() below are not checked */
139 journal_file_set_offline(f);
141 if (f->mmap && f->fd >= 0)
142 mmap_cache_close_fd(f->mmap, f->fd);
/* Defragmentation only happens when f->defrag_on_close is set */
144 if (f->fd >= 0 && f->defrag_on_close)
145 btrfs_defrag_fd(f->fd);
151 mmap_cache_unref(f->mmap);
153 ordered_hashmap_free_free(f->chain_cache);
155 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
156 free(f->compress_buffer);
161 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
162 else if (f->fsprg_state)
163 free(f->fsprg_state);
168 gcry_md_close(f->hmac);
/* Builds a fresh header in the local 'h' and writes it to offset 0 of f->fd with
 * pwrite(). The signature, the 64-bit aligned header size, the compression and sealing
 * flags and a randomized file ID are filled in. The seqnum ID and tail entry seqnum are
 * copied from 'template', or the seqnum ID is set to the new file ID.
 * NOTE(review): the condition choosing between the template and the file ID is not
 * visible in this extract — presumably it tests whether 'template' is set. */
174 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
181 memcpy(h.signature, HEADER_SIGNATURE, 8);
182 h.header_size = htole64(ALIGN64(sizeof(h)));
/* Each setting is multiplied with its flag constant, so the flag is only contributed
 * when the setting is non-zero (presumably these are 0/1 values) */
184 h.incompatible_flags |= htole32(
185 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
186 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
188 h.compatible_flags = htole32(
189 f->seal * HEADER_COMPATIBLE_SEALED);
191 r = sd_id128_randomize(&h.file_id);
196 h.seqnum_id = template->header->seqnum_id;
197 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
199 h.seqnum_id = h.file_id;
201 k = pwrite(f->fd, &h, sizeof(h), 0);
/* Refreshes the header for the running system: stores the machine ID in the header,
 * compares the current boot ID with the one recorded there (marking the tail monotonic
 * timestamp as valid when they are equal), stores the current boot ID and finally sets
 * the file online.
 * NOTE(review): the error checks and the sync step announced by the last comment are
 * not visible in this extract. */
211 static int journal_file_refresh_header(JournalFile *f) {
217 r = sd_id128_get_machine(&f->header->machine_id);
221 r = sd_id128_get_boot(&boot_id);
225 if (sd_id128_equal(boot_id, f->header->boot_id))
226 f->tail_entry_monotonic_valid = true;
228 f->header->boot_id = boot_id;
230 r = journal_file_set_online(f);
232 /* Sync the online state to disk */
/* Validates the header of an opened journal file. The visible checks cover the
 * signature, incompatible flags (always) and compatible flags (only for writable
 * files), the state value, the header size, the presence of n_entry_arrays in sealed
 * files, the header and arena size against the size from the last fstat(), the tail
 * object offset and the 64-bit validity of the main offsets. The machine ID and header
 * state are inspected as well, and at the end the compression and sealing settings
 * are copied from the header into the JournalFile.
 * Unsupported flags yield -EPROTONOSUPPORT.
 * NOTE(review): the other return statements, and the conditions around the machine ID
 * and state checks, are not visible in this extract. */
238 static int journal_file_verify_header(JournalFile *f) {
243 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
246 /* In both read and write mode we refuse to open files with
247 * incompatible flags we don't know */
248 flags = le32toh(f->header->incompatible_flags);
249 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
/* Flags outside HEADER_INCOMPATIBLE_ANY are logged as unknown; the remaining ones
 * are known but not supported by this build */
250 if (flags & ~HEADER_INCOMPATIBLE_ANY)
251 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
252 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
253 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
255 log_debug("Journal file %s uses incompatible flags %"PRIx32
256 " disabled at compilation time.", f->path, flags);
257 return -EPROTONOSUPPORT;
260 /* When open for writing we refuse to open files with
261 * compatible flags, too */
262 flags = le32toh(f->header->compatible_flags);
263 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
264 if (flags & ~HEADER_COMPATIBLE_ANY)
265 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
266 f->path, flags & ~HEADER_COMPATIBLE_ANY);
267 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
269 log_debug("Journal file %s uses compatible flags %"PRIx32
270 " disabled at compilation time.", f->path, flags);
271 return -EPROTONOSUPPORT;
274 if (f->header->state >= _STATE_MAX)
277 /* The first addition was n_data, so check that we are at least this large */
278 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
281 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
/* Header plus arena must fit into the file size seen by the last fstat() */
284 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
287 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
290 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
291 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
292 !VALID64(le64toh(f->header->tail_object_offset)) ||
293 !VALID64(le64toh(f->header->entry_array_offset)))
298 sd_id128_t machine_id;
301 r = sd_id128_get_machine(&machine_id);
305 if (!sd_id128_equal(machine_id, f->header->machine_id))
308 state = f->header->state;
310 if (state == STATE_ONLINE) {
311 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
313 } else if (state == STATE_ARCHIVED)
315 else if (state != STATE_OFFLINE) {
316 log_debug("Journal file %s has unknown state %u.", f->path, state);
/* Adopt the settings recorded in the header */
321 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
322 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
324 f->seal = JOURNAL_HEADER_SEALED(f->header);
/* Refreshes f->last_stat with fstat() on f->fd and records the CLOCK_MONOTONIC time of
 * the refresh in f->last_stat_usec. Afterwards a link count of zero or less is detected.
 * NOTE(review): the return statements are not visible in this extract. */
329 static int journal_file_fstat(JournalFile *f) {
333 if (fstat(f->fd, &f->last_stat) < 0)
336 f->last_stat_usec = now(CLOCK_MONOTONIC);
338 /* Refuse appending to files that are already deleted */
339 if (f->last_stat.st_nlink <= 0)
/* Makes sure the file has room for 'size' bytes at 'offset'. The needed size is
 * page-aligned, raised to at least the header size and compared with old_size
 * (presumably header_size + arena_size; the assignment line is not visible in this
 * extract). If the space is already there, the stat data is only refreshed once
 * LAST_STAT_REFRESH_USEC has passed since the last refresh. Otherwise
 * metrics.max_size and metrics.keep_free are checked, the new size is rounded up to a
 * multiple of FILE_SIZE_INCREASE (capped at max_size when that is set),
 * posix_fallocate() extends the file, arena_size is updated in the header and the stat
 * data is refreshed through journal_file_fstat().
 * NOTE(review): the return statements for the failed checks are not visible here. */
345 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
346 uint64_t old_size, new_size;
351 /* We assume that this file is not sparse, and we know that
352 * for sure, since we always call posix_fallocate()
355 if (mmap_cache_got_sigbus(f->mmap, f->fd))
359 le64toh(f->header->header_size) +
360 le64toh(f->header->arena_size);
362 new_size = PAGE_ALIGN(offset + size);
363 if (new_size < le64toh(f->header->header_size))
364 new_size = le64toh(f->header->header_size);
366 if (new_size <= old_size) {
368 /* We already pre-allocated enough space, but before
369 * we write to it, let's check with fstat() if the
370 * file got deleted, in order to make sure we don't throw
371 * away the data immediately. Don't check fstat() for
372 * all writes though, but only once every LAST_STAT_REFRESH_USEC. */
374 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
377 return journal_file_fstat(f);
380 /* Allocate more space. */
382 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
385 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
388 if (fstatvfs(f->fd, &svfs) >= 0) {
391 available = svfs.f_bfree * svfs.f_bsize;
393 if (available >= f->metrics.keep_free)
394 available -= f->metrics.keep_free;
398 if (new_size - old_size > available)
403 /* Increase by larger blocks at once */
404 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
405 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
406 new_size = f->metrics.max_size;
408 /* Note that the glibc fallocate() fallback is very
409 inefficient, hence we try to minimize the allocation area
411 r = posix_fallocate(f->fd, old_size, new_size - old_size);
415 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
417 return journal_file_fstat(f);
/* Maps an object type to an mmap cache context number: types strictly between
 * OBJECT_UNUSED and _OBJECT_TYPE_MAX map to their own value, everything else maps to
 * context 0. The compile-time assertions make sure that both the per-type contexts and
 * CONTEXT_HEADER stay within MMAP_CACHE_MAX_CONTEXTS. */
420 static unsigned type_to_context(ObjectType type) {
421 /* One context for each type, plus one catch-all for the rest */
422 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
423 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
424 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
/* Asks mmap_cache_get() for the window covering 'size' bytes at 'offset', using the
 * context derived from 'type' and passing 'keep_always' and 'ret' through. If the range
 * ends beyond the file size known from the last fstat(), the stat data is refreshed
 * once; -EADDRNOTAVAIL is returned if the range is still out of bounds afterwards.
 * NOTE(review): 'offset + size' is evaluated in uint64_t and would wrap around for very
 * large inputs, which would slip past the range check — confirm that callers bound
 * both values. */
427 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
436 /* Avoid SIGBUS on invalid accesses */
437 if (offset + size > (uint64_t) f->last_stat.st_size) {
438 /* Hmm, out of range? Let's refresh the fstat() data
439 * first, before we trust that check. */
441 r = journal_file_fstat(f);
445 if (offset + size > (uint64_t) f->last_stat.st_size)
446 return -EADDRNOTAVAIL;
449 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
/* Returns the smallest size an object of o's type may have: the size of the
 * type-specific struct from the table below, or sizeof(ObjectHeader) when the type is
 * outside the table or has no entry in it. */
452 static uint64_t minimum_header_size(Object *o) {
454 static const uint64_t table[] = {
455 [OBJECT_DATA] = sizeof(DataObject),
456 [OBJECT_FIELD] = sizeof(FieldObject),
457 [OBJECT_ENTRY] = sizeof(EntryObject),
458 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
459 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
460 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
461 [OBJECT_TAG] = sizeof(TagObject),
/* Slots not listed in the designated initializer are zero and fall back to the plain header size */
464 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
465 return sizeof(ObjectHeader);
467 return table[o->object.type];
/* Maps the object located at 'offset'. The offset has to pass VALID64(); then only the
 * ObjectHeader is mapped, and the recorded size and type are checked: the size against
 * sizeof(ObjectHeader) and the per-type minimum, the type against OBJECT_UNUSED and,
 * when 'type' is above OBJECT_UNUSED, against 'type' itself. Objects larger than the
 * plain header are mapped a second time with their full size.
 * NOTE(review): the error codes returned by the failed checks are not visible in this
 * extract. */
470 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
479 /* Objects may only be located at multiple of 64 bit */
480 if (!VALID64(offset))
483 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
488 s = le64toh(o->object.size);
490 if (s < sizeof(ObjectHeader))
493 if (o->object.type <= OBJECT_UNUSED)
496 if (s < minimum_header_size(o))
499 if (type > OBJECT_UNUSED && o->object.type != type)
/* Re-map with the real object size once the header has been validated */
502 if (s > sizeof(ObjectHeader)) {
503 r = journal_file_move_to(f, type, false, offset, s, &t);
/* Picks the sequence number for the next entry, starting from the header's
 * tail_entry_seqnum plus one, and stores it as the new tail entry seqnum. The head
 * entry seqnum is set as well when it is still zero.
 * NOTE(review): the code handling the external '*seqnum' counter is not visible in this
 * extract; only the comment describing it is. */
514 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
519 r = le64toh(f->header->tail_entry_seqnum) + 1;
522 /* If an external seqnum counter was passed, we update
523 * both the local and the external one, and set it to
524 * the maximum of both */
532 f->header->tail_entry_seqnum = htole64(r);
534 if (f->header->head_entry_seqnum == 0)
535 f->header->head_entry_seqnum = htole64(r);
/* Appends a new object of 'type' with 'size' bytes to the file. The file is set online,
 * the position 'p' is derived from the header's tail_object_offset (advanced past the
 * 64-bit aligned size of the current tail object) or from header_size, space is
 * allocated and mapped, the object's type and size are written, and the header's
 * tail_object_offset and n_objects are updated.
 * 'type' must be a real object type and 'size' at least sizeof(ObjectHeader).
 * NOTE(review): the condition choosing between header_size and the tail object, and the
 * assignments to 'ret' and 'offset', are not visible in this extract. */
540 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
547 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
548 assert(size >= sizeof(ObjectHeader));
552 r = journal_file_set_online(f);
556 p = le64toh(f->header->tail_object_offset);
558 p = le64toh(f->header->header_size);
560 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
564 p += ALIGN64(le64toh(tail->object.size));
567 r = journal_file_allocate(f, p, size);
571 r = journal_file_move_to(f, type, false, p, size, &t);
578 o->object.type = type;
579 o->object.size = htole64(size);
581 f->header->tail_object_offset = htole64(p);
582 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
/* Creates the data hash table. Its size is derived from metrics.max_size and never
 * drops below DEFAULT_DATA_HASH_TABLE_SIZE; the table is appended as an
 * OBJECT_DATA_HASH_TABLE object, its items are zeroed, and the offset of the items
 * array and the table size are recorded in the header. */
590 static int journal_file_setup_data_hash_table(JournalFile *f) {
597 /* We estimate that we need 1 hash table entry per 768 bytes of
598 journal file and we want to make sure we never get beyond
599 75% fill level. Calculate the hash table size for the
600 maximum file size based on these metrics. */
602 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
603 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
604 s = DEFAULT_DATA_HASH_TABLE_SIZE;
606 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
608 r = journal_file_append_object(f,
609 OBJECT_DATA_HASH_TABLE,
610 offsetof(Object, hash_table.items) + s,
615 memzero(o->hash_table.items, s);
/* The header points at the items array, not at the start of the object */
617 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
618 f->header->data_hash_table_size = htole64(s);
/* Creates the field hash table with the fixed size DEFAULT_FIELD_HASH_TABLE_SIZE: the
 * table is appended as an OBJECT_FIELD_HASH_TABLE object, its items are zeroed, and the
 * offset of the items array and the table size are recorded in the header. */
623 static int journal_file_setup_field_hash_table(JournalFile *f) {
630 /* We use a fixed size hash table for the fields as this
631 * number should grow very slowly only */
633 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
634 r = journal_file_append_object(f,
635 OBJECT_FIELD_HASH_TABLE,
636 offsetof(Object, hash_table.items) + s,
641 memzero(o->hash_table.items, s);
/* The header points at the items array, not at the start of the object */
643 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
644 f->header->field_hash_table_size = htole64(s);
/* Maps the data hash table whose offset and size are recorded in the header and keeps
 * the resulting pointer in f->data_hash_table.
 * NOTE(review): the remaining arguments of the journal_file_move_to() call are not
 * visible in this extract. */
649 static int journal_file_map_data_hash_table(JournalFile *f) {
656 p = le64toh(f->header->data_hash_table_offset);
657 s = le64toh(f->header->data_hash_table_size);
659 r = journal_file_move_to(f,
660 OBJECT_DATA_HASH_TABLE,
667 f->data_hash_table = t;
/* Maps the field hash table whose offset and size are recorded in the header and keeps
 * the resulting pointer in f->field_hash_table.
 * NOTE(review): the remaining arguments of the journal_file_move_to() call are not
 * visible in this extract. */
671 static int journal_file_map_field_hash_table(JournalFile *f) {
678 p = le64toh(f->header->field_hash_table_offset);
679 s = le64toh(f->header->field_hash_table_size);
681 r = journal_file_move_to(f,
682 OBJECT_FIELD_HASH_TABLE,
689 f->field_hash_table = t;
/* Links a field object into the field hash table. Only OBJECT_FIELD objects are
 * handled. The object's next_hash_offset and head_data_offset are cleared, and the
 * object becomes either the head of bucket 'h' or the successor of the bucket's
 * current tail; in both cases it becomes the new tail. n_fields is incremented when the
 * header contains that field.
 * NOTE(review): the parameter list, the computation of 'h' and the condition choosing
 * between the head and tail cases are not visible in this extract. */
693 static int journal_file_link_field(
706 if (o->object.type != OBJECT_FIELD)
709 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
713 /* This might alter the window we are looking at */
714 o->field.next_hash_offset = o->field.head_data_offset = 0;
717 p = le64toh(f->field_hash_table[h].tail_hash_offset);
719 f->field_hash_table[h].head_hash_offset = htole64(offset);
721 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
725 o->field.next_hash_offset = htole64(offset);
728 f->field_hash_table[h].tail_hash_offset = htole64(offset);
730 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
731 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
/* Links a data object into the data hash table. Only OBJECT_DATA objects are handled.
 * The object's chain, field and entry links are cleared, and the object becomes either
 * the head of bucket 'h' or the successor of the bucket's current tail; in both cases
 * it becomes the new tail. n_data is incremented when the header contains that field.
 * NOTE(review): the parameter list, the computation of 'h' and the condition choosing
 * between the head and tail cases are not visible in this extract. */
736 static int journal_file_link_data(
749 if (o->object.type != OBJECT_DATA)
752 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
756 /* This might alter the window we are looking at */
757 o->data.next_hash_offset = o->data.next_field_offset = 0;
758 o->data.entry_offset = o->data.entry_array_offset = 0;
759 o->data.n_entries = 0;
762 p = le64toh(f->data_hash_table[h].tail_hash_offset);
764 /* Only entry in the hash table is easy */
765 f->data_hash_table[h].head_hash_offset = htole64(offset);
767 /* Move back to the previous data object, to patch in
770 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
774 o->data.next_hash_offset = htole64(offset);
777 f->data_hash_table[h].tail_hash_offset = htole64(offset);
779 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
780 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
/* Searches the field hash table for a field object with the given name, using a hash
 * the caller has already computed. Starting at the head of bucket 'h', each object in
 * the chain is mapped and accepted when its hash, its object size and its payload all
 * match; otherwise the search continues at next_hash_offset.
 * 'field' must be non-NULL and 'size' greater than zero.
 * NOTE(review): the computation of 'h', the loop construct and the return values are
 * not visible in this extract. */
785 int journal_file_find_field_object_with_hash(
787 const void *field, uint64_t size, uint64_t hash,
788 Object **ret, uint64_t *offset) {
790 uint64_t p, osize, h, m;
794 assert(field && size > 0);
/* Size a matching object must have: fixed part plus the field name */
796 osize = offsetof(Object, field.payload) + size;
798 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
804 p = le64toh(f->field_hash_table[h].head_hash_offset);
809 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
813 if (le64toh(o->field.hash) == hash &&
814 le64toh(o->object.size) == osize &&
815 memcmp(o->field.payload, field, size) == 0) {
825 p = le64toh(o->field.next_hash_offset);
/* Convenience wrapper: hashes the field name with hash64() and delegates to
 * journal_file_find_field_object_with_hash(). 'field' must be non-NULL and 'size'
 * greater than zero. */
831 int journal_file_find_field_object(
833 const void *field, uint64_t size,
834 Object **ret, uint64_t *offset) {
839 assert(field && size > 0);
841 hash = hash64(field, size);
843 return journal_file_find_field_object_with_hash(f,
/* Searches the data hash table for a data object with the given payload, using a hash
 * the caller has already computed. Starting at the head of bucket 'h', each object in
 * the chain is mapped and its hash compared first. Objects carrying a compression flag
 * are decompressed into f->compress_buffer with decompress_blob() before the payload is
 * compared; uncompressed objects are compared by object size and payload directly.
 * -EPROTONOSUPPORT is returned from the compressed branch.
 * NOTE(review): that return presumably sits in the branch compiled when neither XZ nor
 * LZ4 support is built in — the preprocessor lines around it are not visible here,
 * nor are the loop construct and the remaining return values. */
848 int journal_file_find_data_object_with_hash(
850 const void *data, uint64_t size, uint64_t hash,
851 Object **ret, uint64_t *offset) {
853 uint64_t p, osize, h, m;
857 assert(data || size == 0);
/* Size a matching uncompressed object must have: fixed part plus the payload */
859 osize = offsetof(Object, data.payload) + size;
861 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
866 p = le64toh(f->data_hash_table[h].head_hash_offset);
871 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
875 if (le64toh(o->data.hash) != hash)
878 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
879 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
/* 'l' becomes the length of the compressed payload */
883 l = le64toh(o->object.size);
884 if (l <= offsetof(Object, data.payload))
887 l -= offsetof(Object, data.payload);
889 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
890 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
895 memcmp(f->compress_buffer, data, size) == 0) {
906 return -EPROTONOSUPPORT;
908 } else if (le64toh(o->object.size) == osize &&
909 memcmp(o->data.payload, data, size) == 0) {
921 p = le64toh(o->data.next_hash_offset);
/* Convenience wrapper: hashes the payload with hash64() and delegates to
 * journal_file_find_data_object_with_hash(). 'data' may only be NULL when 'size' is
 * zero. */
927 int journal_file_find_data_object(
929 const void *data, uint64_t size,
930 Object **ret, uint64_t *offset) {
935 assert(data || size == 0);
937 hash = hash64(data, size);
939 return journal_file_find_data_object_with_hash(f,
/* Makes sure a field object for the given field name exists. An existing object is
 * looked up by hash first; the visible remainder appends a new OBJECT_FIELD object,
 * stores the hash and the name in it, links it into the field hash table, re-maps the
 * object and feeds it to journal_file_hmac_put_object().
 * 'field' must be non-NULL and 'size' greater than zero.
 * NOTE(review): the handling of the lookup result and the assignments to 'ret' and
 * 'offset' are not visible in this extract. */
944 static int journal_file_append_field(
946 const void *field, uint64_t size,
947 Object **ret, uint64_t *offset) {
955 assert(field && size > 0);
957 hash = hash64(field, size);
959 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
973 osize = offsetof(Object, field.payload) + size;
974 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
978 o->field.hash = htole64(hash);
979 memcpy(o->field.payload, field, size);
981 r = journal_file_link_field(f, o, p, hash);
985 /* The linking might have altered the window, so let's
986 * refresh our pointer */
987 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
992 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
/* Makes sure a data object for the given payload exists. An existing object is looked
 * up by hash first; the visible remainder appends a new OBJECT_DATA object sized for
 * the uncompressed payload, stores the hash, optionally compresses the payload in place
 * (shrinking the recorded object size and setting the compression flag), links the
 * object into the data hash table and re-maps it. If the payload contains a '=' that is
 * not its first byte, the part before it is registered as a field object and the data
 * object is inserted at the head of that field's data list. Finally the object is fed
 * to journal_file_hmac_put_object().
 * 'data' may only be NULL when 'size' is zero.
 * NOTE(review): the handling of the lookup result and the assignments to 'ret' and
 * 'offset' are not visible in this extract. */
1006 static int journal_file_append_data(
1008 const void *data, uint64_t size,
1009 Object **ret, uint64_t *offset) {
1014 int r, compression = 0;
1018 assert(data || size == 0);
1020 hash = hash64(data, size);
1022 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1036 osize = offsetof(Object, data.payload) + size;
1037 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1041 o->data.hash = htole64(hash);
1043 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
/* NOTE(review): only f->compress_xz is tested here although this section is also built
 * when just HAVE_LZ4 is defined — confirm whether files that only have compress_lz4
 * set are really meant to skip compression. */
1044 if (f->compress_xz &&
1045 size >= COMPRESSION_SIZE_THRESHOLD) {
1048 compression = compress_blob(data, size, o->data.payload, &rsize);
1051 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1052 o->object.flags |= compression;
1054 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1055 size, rsize, object_compressed_to_string(compression));
/* Without compression the payload is copied verbatim */
1060 if (!compression && size > 0)
1061 memcpy(o->data.payload, data, size);
1063 r = journal_file_link_data(f, o, p, hash);
1067 /* The linking might have altered the window, so let's
1068 * refresh our pointer */
1069 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1076 eq = memchr(data, '=', size);
1077 if (eq && eq > data) {
1081 /* Create field object ... */
1082 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1086 /* ... and link it in. */
1087 o->data.next_field_offset = fo->field.head_data_offset;
/* NOTE(review): 'p' is stored into the object here, so htole64() would express the
 * intent (as done for the other stored offsets in this file); le64toh() is presumably
 * byte-wise equivalent — confirm. */
1088 fo->field.head_data_offset = le64toh(p);
1092 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
/* Returns the number of EntryItem slots of an entry object, computed from the object
 * size minus the offset of the items array.
 * NOTE(review): the value returned for objects that are not OBJECT_ENTRY is not visible
 * in this extract. */
1106 uint64_t journal_file_entry_n_items(Object *o) {
1109 if (o->object.type != OBJECT_ENTRY)
1112 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
/* Returns the number of 64-bit slots of an entry array object, computed from the object
 * size minus the offset of the items array.
 * NOTE(review): the value returned for objects that are not OBJECT_ENTRY_ARRAY is not
 * visible in this extract. */
1115 uint64_t journal_file_entry_array_n_items(Object *o) {
1118 if (o->object.type != OBJECT_ENTRY_ARRAY)
1121 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
/* Returns the number of HashItem slots of a data or field hash table object, computed
 * from the object size minus the offset of the items array.
 * NOTE(review): the value returned for other object types is not visible in this
 * extract. */
1124 uint64_t journal_file_hash_table_n_items(Object *o) {
1127 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1128 o->object.type != OBJECT_FIELD_HASH_TABLE)
1131 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
/* Stores the entry offset 'p' at position *idx of the entry array chain that starts at
 * *first. The chain is walked array by array; when a slot is found inside an existing
 * array, it is written and *idx is incremented. Otherwise a new OBJECT_ENTRY_ARRAY
 * object is appended, fed to the HMAC, given 'p' in a slot and linked either as the
 * chain head (*first) or behind the previous array 'ap'. n_entry_arrays is incremented
 * when the header contains that field, and *idx is incremented.
 * *first and *idx are read and written in little-endian form.
 * NOTE(review): the parameter list, the loop construct, the sizing of the new array
 * ('n') and the conditions between the visible statements are not shown in this
 * extract. */
1134 static int link_entry_into_array(JournalFile *f,
1139 uint64_t n = 0, ap = 0, q, i, a, hidx;
1147 a = le64toh(*first);
1148 i = hidx = le64toh(*idx);
1151 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1155 n = journal_file_entry_array_n_items(o);
1157 o->entry_array.items[i] = htole64(p);
1158 *idx = htole64(hidx + 1);
1164 a = le64toh(o->entry_array.next_entry_array_offset);
1175 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1176 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1182 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1187 o->entry_array.items[i] = htole64(p);
1190 *first = htole64(q);
1192 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1196 o->entry_array.next_entry_array_offset = htole64(q);
1199 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1200 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1202 *idx = htole64(hidx + 1);
/* Variant of link_entry_into_array() for lists that keep one entry in a separate
 * '*extra' slot: 'p' is stored either in *extra or, with the index reduced by one, in
 * the array chain at *first. *idx is incremented afterwards.
 * NOTE(review): the parameter list and the condition choosing between the two cases
 * are not visible in this extract. */
1207 static int link_entry_into_array_plus_one(JournalFile *f,
1222 *extra = htole64(p);
1226 i = htole64(le64toh(*idx) - 1);
1227 r = link_entry_into_array(f, first, &i, p);
1232 *idx = htole64(le64toh(*idx) + 1);
/* Links an entry into the entry list of the data object referenced by item 'i' of the
 * entry object 'o': the data object is mapped and its entry_offset / entry_array_offset
 * pair is handed to link_entry_into_array_plus_one().
 * NOTE(review): the remaining arguments of that call are not visible in this extract. */
1236 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1243 p = le64toh(o->entry.items[i].object_offset);
/* From here on 'o' refers to the data object, no longer to the entry */
1247 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1251 return link_entry_into_array_plus_one(f,
1252 &o->data.entry_offset,
1253 &o->data.entry_array_offset,
/* Links a freshly written entry object into the file. Only OBJECT_ENTRY objects are
 * handled. After a full memory barrier the entry is added to the global entry array
 * (header fields entry_array_offset and n_entries), the header's head/tail realtime and
 * tail monotonic timestamps are updated from the entry, the tail monotonic timestamp is
 * marked valid, and each of the entry's items is linked through
 * journal_file_link_entry_item().
 * NOTE(review): the error handling inside the item loop is not visible in this
 * extract. */
1258 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1266 if (o->object.type != OBJECT_ENTRY)
1269 __sync_synchronize();
1271 /* Link up the entry itself */
1272 r = link_entry_into_array(f,
1273 &f->header->entry_array_offset,
1274 &f->header->n_entries,
1279 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
/* The head timestamp is only set once, the tail timestamps on every entry */
1281 if (f->header->head_entry_realtime == 0)
1282 f->header->head_entry_realtime = o->entry.realtime;
1284 f->header->tail_entry_realtime = o->entry.realtime;
1285 f->header->tail_entry_monotonic = o->entry.monotonic;
1287 f->tail_entry_monotonic_valid = true;
1289 /* Link up the items */
1290 n = journal_file_entry_n_items(o);
1291 for (i = 0; i < n; i++) {
1292 r = journal_file_link_entry_item(f, o, offset, i);
/* Writes an entry object for items that have already been prepared. The object is
 * sized for 'n_items' EntryItems and appended as OBJECT_ENTRY; the sequence number,
 * the items, both timestamps, the XOR hash and the header's boot ID are filled in; the
 * object is then fed to the HMAC and linked with journal_file_link_entry().
 * 'items' may only be NULL when 'n_items' is zero.
 * NOTE(review): parts of the parameter list and the assignments to 'ret' and 'offset'
 * are not visible in this extract. */
1300 static int journal_file_append_entry_internal(
1302 const dual_timestamp *ts,
1304 const EntryItem items[], unsigned n_items,
1306 Object **ret, uint64_t *offset) {
1313 assert(items || n_items == 0);
1316 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1318 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1322 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1323 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1324 o->entry.realtime = htole64(ts->realtime);
1325 o->entry.monotonic = htole64(ts->monotonic);
1326 o->entry.xor_hash = htole64(xor_hash);
1327 o->entry.boot_id = f->header->boot_id;
1330 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1335 r = journal_file_link_entry(f, o, np);
/* Announces a change to the journal file: after a full memory barrier the file is
 * truncated to the size recorded in f->last_stat, which leaves the size as it was at
 * the last fstat(). A failing ftruncate() is only logged. */
1348 void journal_file_post_change(JournalFile *f) {
1351 /* inotify() does not receive IN_MODIFY events from file
1352 * accesses done via mmap(). After each access we hence
1353 * trigger IN_MODIFY by truncating the journal file to its
1354 * current size which triggers IN_MODIFY. */
1356 __sync_synchronize();
1358 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1359 log_error_errno(errno, "Failed to truncate file to its own size: %m");
/* Comparison callback for EntryItems that compares their object_offset values after
 * conversion from little-endian.
 * NOTE(review): the return statements are not visible in this extract; presumably the
 * usual negative / positive / zero results. */
1362 static int entry_item_cmp(const void *_a, const void *_b) {
1363 const EntryItem *a = _a, *b = _b;
1365 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1367 if (le64toh(a->object_offset) > le64toh(b->object_offset))
/* Appends a complete log entry made of 'n_iovec' fields. A data object is created (or
 * found) for every iovec, the data hashes are XORed together, the resulting items are
 * sorted by on-disk offset and passed to journal_file_append_entry_internal().
 * Afterwards mmap_cache_got_sigbus() is consulted and journal_file_post_change() is
 * called. A monotonic timestamp older than the header's tail_entry_monotonic is
 * detected when that value is known to be valid.
 * 'iovec' may only be NULL when 'n_iovec' is zero.
 * NOTE(review): the condition under which the local timestamp '_ts' is used, and the
 * return statements, are not visible in this extract. */
1372 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1376 uint64_t xor_hash = 0;
1377 struct dual_timestamp _ts;
1380 assert(iovec || n_iovec == 0);
1383 dual_timestamp_get(&_ts);
1387 if (f->tail_entry_monotonic_valid &&
1388 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1392 r = journal_file_maybe_append_tag(f, ts->realtime);
1397 /* alloca() can't take 0, hence let's allocate at least one */
1398 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1400 for (i = 0; i < n_iovec; i++) {
1404 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1408 xor_hash ^= le64toh(o->data.hash);
1409 items[i].object_offset = htole64(p);
1410 items[i].hash = o->data.hash;
1413 /* Order by the position on disk, in order to improve seek
1414 * times for rotating media. */
1415 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1417 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1419 /* If the memory mapping triggered a SIGBUS then we return an
1420 * IO error and ignore the error code passed down to us, since
1421 * it is very likely just an effect of a nullified replacement
1424 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1427 journal_file_post_change(f);
/* One cached position inside an entry array chain. chain_cache_put() stores these in
 * an ordered hashmap keyed by the 'first' member. */
1432 typedef struct ChainCacheItem {
1433 uint64_t first; /* the array at the beginning of the chain */
1434 uint64_t array; /* the cached array */
1435 uint64_t begin; /* the first item in the cached array */
1436 uint64_t total; /* the total number of items in all arrays before this one in the chain */
1437 uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
/* Creates or updates the chain cache item for a chain. Once the hashmap holds
 * CHAIN_CACHE_MAX items, the first item is stolen from the map for reuse; otherwise a
 * new ChainCacheItem is allocated. The item is stored under its 'first' member as
 * key, and last_index is updated.
 * NOTE(review): the parameter list, the branch for an already existing item and the
 * remaining field assignments are not visible in this extract. */
1440 static void chain_cache_put(
1447 uint64_t last_index) {
1450 /* If the chain item to cache for this chain is the
1451 * first one it's not worth caching anything */
1455 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1456 ci = ordered_hashmap_steal_first(h);
1459 ci = new(ChainCacheItem, 1);
1466 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1471 assert(ci->first == first);
1476 ci->last_index = last_index;
/* Looks up item 'i' of the entry array chain starting at 'first' and maps the entry
 * object it refers to. The chain cache is consulted first and only used when 'i' lies
 * beyond the cached 'total'. The chain is then walked array by array until the array
 * containing 'i' is reached; that position is cached for the next call.
 * NOTE(review): the parameter list, the loop construct, the index arithmetic and the
 * return values are not visible in this extract. */
1479 static int generic_array_get(
1483 Object **ret, uint64_t *offset) {
1486 uint64_t p = 0, a, t = 0;
1494 /* Try the chain cache first */
1495 ci = ordered_hashmap_get(f->chain_cache, &first);
1496 if (ci && i > ci->total) {
1505 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1509 k = journal_file_entry_array_n_items(o);
1511 p = le64toh(o->entry_array.items[i]);
1517 a = le64toh(o->entry_array.next_entry_array_offset);
1523 /* Let's cache this item for the next invocation */
1524 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1526 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Variant of generic_array_get() for lists that keep one entry in a separate 'extra'
 * slot: the entry at 'extra' is mapped directly, other indices are forwarded to
 * generic_array_get() with the index reduced by one.
 * NOTE(review): the parameter list and the condition choosing between the two cases
 * are not visible in this extract. */
1539 static int generic_array_get_plus_one(
1544 Object **ret, uint64_t *offset) {
1553 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1566 return generic_array_get(f, first, i-1, ret, offset);
/* Bisects the entry array chain starting at 'first' for 'needle', using the
 * 'test_object' callback to classify each candidate offset as TEST_LEFT, TEST_RIGHT or
 * TEST_FOUND. A TEST_FOUND result is turned into TEST_RIGHT for DIRECTION_DOWN and into
 * TEST_LEFT otherwise, so the search keeps narrowing towards one end of a run of
 * matches. The chain cache is consulted to skip ahead to a previously visited array and
 * to probe the neighbours of the last index looked at before bisecting. The final
 * position is stored in the chain cache, the matching entry object is mapped, and its
 * index within the chain is reported through 'idx'.
 * NOTE(review): large parts of the control flow (loop constructs, updates of 'left'
 * and 'right', return statements) are not visible in this extract, so the summary
 * above covers only what the visible lines show. */
1575 static int generic_array_bisect(
1580 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1581 direction_t direction,
1586 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1587 bool subtract_one = false;
1588 Object *o, *array = NULL;
1593 assert(test_object);
1595 /* Start with the first array in the chain */
1598 ci = ordered_hashmap_get(f->chain_cache, &first);
1599 if (ci && n > ci->total) {
1600 /* Ah, we have iterated this bisection array chain
1601 * previously! Let's see if we can skip ahead in the
1602 * chain, as far as the last time. But we can't jump
1603 * backwards in the chain, so let's check that
1606 r = test_object(f, ci->begin, needle);
1610 if (r == TEST_LEFT) {
1611 /* OK, what we are looking for is right of the
1612 * begin of this EntryArray, so let's jump
1613 * straight to previously cached array in the
1619 last_index = ci->last_index;
1624 uint64_t left, right, k, lp;
1626 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1630 k = journal_file_entry_array_n_items(array);
1636 lp = p = le64toh(array->entry_array.items[i]);
1640 r = test_object(f, p, needle);
1644 if (r == TEST_FOUND)
1645 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1647 if (r == TEST_RIGHT) {
1651 if (last_index != (uint64_t) -1) {
1652 assert(last_index <= right);
1654 /* If we cached the last index we
1655 * looked at, let's try not to jump
1656 * too wildly around and see if we can
1657 * limit the range to look at early to
1658 * the immediate neighbors of the last
1659 * index we looked at. */
1661 if (last_index > 0) {
1662 uint64_t x = last_index - 1;
1664 p = le64toh(array->entry_array.items[x]);
1668 r = test_object(f, p, needle);
1672 if (r == TEST_FOUND)
1673 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1675 if (r == TEST_RIGHT)
1681 if (last_index < right) {
1682 uint64_t y = last_index + 1;
1684 p = le64toh(array->entry_array.items[y]);
1688 r = test_object(f, p, needle);
1692 if (r == TEST_FOUND)
1693 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1695 if (r == TEST_RIGHT)
1703 if (left == right) {
1704 if (direction == DIRECTION_UP)
1705 subtract_one = true;
1711 assert(left < right);
1712 i = (left + right) / 2;
1714 p = le64toh(array->entry_array.items[i]);
1718 r = test_object(f, p, needle);
1722 if (r == TEST_FOUND)
1723 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1725 if (r == TEST_RIGHT)
1733 if (direction == DIRECTION_UP) {
1735 subtract_one = true;
1746 last_index = (uint64_t) -1;
1747 a = le64toh(array->entry_array.next_entry_array_offset);
1753 if (subtract_one && t == 0 && i == 0)
1756 /* Let's cache this item for the next invocation */
1757 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1759 if (subtract_one && i == 0)
1761 else if (subtract_one)
1762 p = le64toh(array->entry_array.items[i-1]);
1764 p = le64toh(array->entry_array.items[i]);
1766 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1777 *idx = t + i + (subtract_one ? -1 : 0);
/* Variant of generic_array_bisect() for lists that keep one entry in a separate
 * 'extra' slot. The entry at 'extra' is tested first (with TEST_FOUND mapped to
 * TEST_RIGHT for DIRECTION_DOWN and to TEST_LEFT otherwise); the array chain at
 * 'first' is then bisected with n-1 items. For DIRECTION_UP the function remembers in
 * 'step_back' that the extra entry is the fallback when the array yields no match, and
 * maps the entry at 'extra' in that case.
 * NOTE(review): the parameter list, several conditions and the return statements are
 * not visible in this extract. */
1782 static int generic_array_bisect_plus_one(
1788 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1789 direction_t direction,
1795 bool step_back = false;
1799 assert(test_object);
1804 /* This bisects the array in object 'first', but first checks
1806 r = test_object(f, extra, needle);
1810 if (r == TEST_FOUND)
1811 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1813 /* if we are looking with DIRECTION_UP then we need to first
1814 see if in the actual array there is a matching entry, and
1815 return the last one of that. But if there isn't any we need
1816 to return this one. Hence remember this, and return it
1819 step_back = direction == DIRECTION_UP;
1821 if (r == TEST_RIGHT) {
1822 if (direction == DIRECTION_DOWN)
1828 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1830 if (r == 0 && step_back)
1839 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
/* Bisection callback that compares the candidate offset 'p' itself with 'needle'; 'f'
 * is not used by the visible lines.
 * NOTE(review): the returned TEST_* values are not visible in this extract. */
1855 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1861 else if (p < needle)
/* Bisection callback that maps the entry object at 'p' and compares its sequence number
 * with 'needle'.
 * NOTE(review): the returned TEST_* values are not visible in this extract. */
1867 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1874 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1878 if (le64toh(o->entry.seqnum) == needle)
1880 else if (le64toh(o->entry.seqnum) < needle)
/* Seeks to an entry by bisecting the file-wide entry array described by
 * the header (entry_array_offset, n_entries) in the given direction, and
 * returns the result of generic_array_bisect(). */
1886 int journal_file_move_to_entry_by_seqnum(
1889 direction_t direction,
1893 return generic_array_bisect(f,
1894 le64toh(f->header->entry_array_offset),
1895 le64toh(f->header->n_entries),
/* Bisection predicate for wallclock lookups: loads the entry object at
 * offset p and compares its realtime timestamp (converted from
 * little-endian to host byte order) against needle. */
1902 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1909 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1913 if (le64toh(o->entry.realtime) == needle)
1915 else if (le64toh(o->entry.realtime) < needle)
/* Seeks to an entry by realtime timestamp: bisects the file-wide entry
 * array described by the header (entry_array_offset, n_entries) with
 * test_object_realtime as the comparison callback. */
1921 int journal_file_move_to_entry_by_realtime(
1924 direction_t direction,
1928 return generic_array_bisect(f,
1929 le64toh(f->header->entry_array_offset),
1930 le64toh(f->header->n_entries),
1932 test_object_realtime,
/* Bisection predicate for monotonic clock lookups: loads the entry object
 * at offset p and compares its monotonic timestamp (converted from
 * little-endian to host byte order) against needle. */
1937 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1944 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1948 if (le64toh(o->entry.monotonic) == needle)
1950 else if (le64toh(o->entry.monotonic) < needle)
/* Builds the field string "_BOOT_ID=" followed by the string form of
 * boot_id in a stack buffer (32 characters plus a terminator are reserved
 * for the ID) and looks up the matching data object with
 * journal_file_find_data_object(). o and b are handed through to that
 * call unchanged. */
1956 static inline int find_data_object_by_boot_id(
1961 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
/* t + 9 skips the 9 characters of the "_BOOT_ID=" prefix */
1963 sd_id128_to_string(boot_id, t + 9);
/* sizeof(t) - 1: pass the length without the trailing NUL */
1964 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
/* Seeks to an entry by monotonic timestamp within one boot: first looks up
 * the data object for boot_id, then bisects that object's entry list
 * (entry_offset, entry_array_offset, n_entries) with
 * test_object_monotonic as the comparison callback. */
1967 int journal_file_move_to_entry_by_monotonic(
1971 direction_t direction,
1980 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1986 return generic_array_bisect_plus_one(f,
1987 le64toh(o->data.entry_offset),
1988 le64toh(o->data.entry_array_offset),
1989 le64toh(o->data.n_entries),
1991 test_object_monotonic,
/* Resets the saved cursor state of f: the location type becomes
 * LOCATION_HEAD and every cached current_* field is cleared to zero. */
1996 void journal_file_reset_location(JournalFile *f) {
1997 f->location_type = LOCATION_HEAD;
1998 f->current_offset = 0;
1999 f->current_seqnum = 0;
2000 f->current_realtime = 0;
2001 f->current_monotonic = 0;
2002 zero(f->current_boot_id);
2003 f->current_xor_hash = 0;
/* Records entry object o, found at 'offset', as the current location of f.
 * Stores the direction of travel, switches the location type to
 * LOCATION_SEEK and caches the entry's seqnum, realtime, monotonic and
 * xor_hash values (converted from little-endian) plus its boot ID. */
2006 void journal_file_save_location(JournalFile *f, direction_t direction, Object *o, uint64_t offset) {
2007 f->last_direction = direction;
2008 f->location_type = LOCATION_SEEK;
2009 f->current_offset = offset;
2010 f->current_seqnum = le64toh(o->entry.seqnum);
2011 f->current_realtime = le64toh(o->entry.realtime);
2012 f->current_monotonic = le64toh(o->entry.monotonic);
2013 f->current_boot_id = o->entry.boot_id;
2014 f->current_xor_hash = le64toh(o->entry.xor_hash);
/* Compares the saved locations of two journal files. Both files must be in
 * LOCATION_SEEK state (asserted). The keys are consulted in this order:
 *   1. boot ID, monotonic, realtime and xor_hash all equal: same entry
 *   2. seqnum, but only when both files share the same seqnum_id
 *   3. monotonic time, but only when the boot IDs match
 *   4. realtime
 *   5. xor_hash
 * NOTE(review): the return statements are not visible in this view;
 * presumably the usual negative/zero/positive comparator convention -
 * confirm. */
2017 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2020 assert(af->location_type == LOCATION_SEEK);
2021 assert(bf->location_type == LOCATION_SEEK);
2023 /* If contents and timestamps match, these entries are
2024 * identical, even if the seqnum does not match */
2025 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2026 af->current_monotonic == bf->current_monotonic &&
2027 af->current_realtime == bf->current_realtime &&
2028 af->current_xor_hash == bf->current_xor_hash)
2031 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2033 /* If this is from the same seqnum source, compare
2035 if (af->current_seqnum < bf->current_seqnum)
2037 if (af->current_seqnum > bf->current_seqnum)
2040 /* Wow! This is weird, different data but the same
2041 * seqnums? Something is borked, but let's make the
2042 * best of it and compare by time. */
2045 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2047 /* If the boot id matches, compare monotonic time */
2048 if (af->current_monotonic < bf->current_monotonic)
2050 if (af->current_monotonic > bf->current_monotonic)
2054 /* Otherwise, compare UTC time */
2055 if (af->current_realtime < bf->current_realtime)
2057 if (af->current_realtime > bf->current_realtime)
2060 /* Finally, compare by contents */
2061 if (af->current_xor_hash < bf->current_xor_hash)
2063 if (af->current_xor_hash > bf->current_xor_hash)
/* Steps to an entry of the file-wide entry array in the given direction.
 * n is the header's n_entries. The index i starts at 0 for DIRECTION_DOWN
 * and at n - 1 otherwise; the visible code can also locate a position by
 * bisecting the entry array. The target is then fetched with
 * generic_array_get(). If the fetched offset ofs does not lie strictly
 * beyond p in the direction of travel, an "entry array corrupted" debug
 * message is logged. */
2069 int journal_file_next_entry(
2072 direction_t direction,
2073 Object **ret, uint64_t *offset) {
2080 n = le64toh(f->header->n_entries);
2085 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2087 r = generic_array_bisect(f,
2088 le64toh(f->header->entry_array_offset),
2089 le64toh(f->header->n_entries),
2098 if (direction == DIRECTION_DOWN) {
2111 /* And jump to it */
2112 r = generic_array_get(f,
2113 le64toh(f->header->entry_array_offset),
/* Sanity check: going down the new offset must be greater than p, going
 * up it must be smaller; otherwise the array is treated as corrupted. */
2120 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2121 log_debug("%s: entry array corrupted at entry %"PRIu64,
/* Steps through the entries that reference the data object at
 * data_offset, in the given direction. Asserts that p > 0 whenever a
 * current object o is supplied. n is the data object's n_entries. The
 * index i starts at 0 for DIRECTION_DOWN and at n - 1 otherwise; the
 * visible code can also check that o is an entry object and locate a
 * position by bisecting the data object's entry list. The result comes
 * from generic_array_get_plus_one(). */
2132 int journal_file_next_entry_for_data(
2134 Object *o, uint64_t p,
2135 uint64_t data_offset,
2136 direction_t direction,
2137 Object **ret, uint64_t *offset) {
2144 assert(p > 0 || !o);
2146 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2150 n = le64toh(d->data.n_entries);
2155 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2157 if (o->object.type != OBJECT_ENTRY)
2160 r = generic_array_bisect_plus_one(f,
2161 le64toh(d->data.entry_offset),
2162 le64toh(d->data.entry_array_offset),
2163 le64toh(d->data.n_entries),
2173 if (direction == DIRECTION_DOWN) {
2187 return generic_array_get_plus_one(f,
2188 le64toh(d->data.entry_offset),
2189 le64toh(d->data.entry_array_offset),
/* Loads the data object at data_offset and bisects its entry list
 * (entry_offset, entry_array_offset, n_entries) with
 * generic_array_bisect_plus_one(), returning that call's result. */
2194 int journal_file_move_to_entry_by_offset_for_data(
2196 uint64_t data_offset,
2198 direction_t direction,
2199 Object **ret, uint64_t *offset) {
2206 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2210 return generic_array_bisect_plus_one(f,
2211 le64toh(d->data.entry_offset),
2212 le64toh(d->data.entry_array_offset),
2213 le64toh(d->data.n_entries),
/* Two-stage seek. Stage one looks up the data object for boot_id (its
 * offset is kept in b) and bisects that object's entries by monotonic
 * time. Stage two alternates between the entry list of the data object at
 * data_offset and the entry list of the boot ID object, re-loading each
 * object before it is bisected, until an entry is found that appears in
 * both lists. */
2220 int journal_file_move_to_entry_by_monotonic_for_data(
2222 uint64_t data_offset,
2225 direction_t direction,
2226 Object **ret, uint64_t *offset) {
2234 /* First, seek by time */
2235 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2241 r = generic_array_bisect_plus_one(f,
2242 le64toh(o->data.entry_offset),
2243 le64toh(o->data.entry_array_offset),
2244 le64toh(o->data.n_entries),
2246 test_object_monotonic,
2252 /* And now, continue seeking until we find an entry that
2253 * exists in both bisection arrays */
2259 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2263 r = generic_array_bisect_plus_one(f,
2264 le64toh(d->data.entry_offset),
2265 le64toh(d->data.entry_array_offset),
2266 le64toh(d->data.n_entries),
2274 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2278 r = generic_array_bisect_plus_one(f,
2279 le64toh(o->data.entry_offset),
2280 le64toh(o->data.entry_array_offset),
2281 le64toh(o->data.n_entries),
/* Loads the data object at data_offset and bisects its entry list
 * (entry_offset, entry_array_offset, n_entries) with
 * generic_array_bisect_plus_one(), returning that call's result.
 * NOTE(review): the comparison callback is not visible in this view;
 * presumably the seqnum predicate, going by the function name. */
2303 int journal_file_move_to_entry_by_seqnum_for_data(
2305 uint64_t data_offset,
2307 direction_t direction,
2308 Object **ret, uint64_t *offset) {
2315 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2319 return generic_array_bisect_plus_one(f,
2320 le64toh(d->data.entry_offset),
2321 le64toh(d->data.entry_array_offset),
2322 le64toh(d->data.n_entries),
/* Loads the data object at data_offset and bisects its entry list
 * (entry_offset, entry_array_offset, n_entries) by realtime timestamp,
 * using test_object_realtime as the comparison callback. */
2329 int journal_file_move_to_entry_by_realtime_for_data(
2331 uint64_t data_offset,
2333 direction_t direction,
2334 Object **ret, uint64_t *offset) {
2341 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2345 return generic_array_bisect_plus_one(f,
2346 le64toh(d->data.entry_offset),
2347 le64toh(d->data.entry_array_offset),
2348 le64toh(d->data.n_entries),
2350 test_object_realtime,
/* Debugging aid: prints the header via journal_file_print_header(), then
 * walks the objects of the file starting at the offset given by the
 * header's header_size. One "Type: ..." line is printed per object; entry
 * and tag objects also show their sequence numbers and timestamps/epoch,
 * and a "Flags: ..." line is printed when a compression flag is set. The
 * walk advances by the 64-bit aligned object size and checks for the
 * header's tail_object_offset before advancing. "File corrupt" is logged
 * on the error path. */
2355 void journal_file_dump(JournalFile *f) {
2362 journal_file_print_header(f);
2364 p = le64toh(f->header->header_size);
2366 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2370 switch (o->object.type) {
2373 printf("Type: OBJECT_UNUSED\n");
2377 printf("Type: OBJECT_DATA\n");
2381 printf("Type: OBJECT_FIELD\n");
2385 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2386 le64toh(o->entry.seqnum),
2387 le64toh(o->entry.monotonic),
2388 le64toh(o->entry.realtime));
2391 case OBJECT_FIELD_HASH_TABLE:
2392 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2395 case OBJECT_DATA_HASH_TABLE:
2396 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2399 case OBJECT_ENTRY_ARRAY:
2400 printf("Type: OBJECT_ENTRY_ARRAY\n");
2404 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2405 le64toh(o->tag.seqnum),
2406 le64toh(o->tag.epoch));
2410 printf("Type: unknown (%u)\n", o->object.type);
2414 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2415 printf("Flags: %s\n",
2416 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2418 if (p == le64toh(f->header->tail_object_offset))
2421 p = p + ALIGN64(le64toh(o->object.size));
2426 log_error("File corrupt");
/* Wrapper around format_timestamp() that formats t into buf (size l).
 * NOTE(review): the handling of a failed conversion is not visible in this
 * view; presumably a placeholder string is returned instead of NULL so the
 * result can be passed straight to printf() - confirm. */
2429 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2432 x = format_timestamp(buf, l, t);
/* Prints a human-readable summary of the journal header to stdout.
 * The first printf() covers the fields that are always shown: IDs, state,
 * flags, sizes, head/tail sequence numbers and timestamps, object counts.
 * The hash table sizes are shown as item counts, i.e. the byte size
 * divided by sizeof(HashItem). The counters n_data, n_fields, n_tags and
 * n_entry_arrays are printed only when JOURNAL_HEADER_CONTAINS() reports
 * them present in this header. Disk usage is printed when fstat()
 * succeeds and is computed as st_blocks * 512. */
2438 void journal_file_print_header(JournalFile *f) {
2439 char a[33], b[33], c[33], d[33];
2440 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2442 char bytes[FORMAT_BYTES_MAX];
2446 printf("File Path: %s\n"
2450 "Sequential Number ID: %s\n"
2452 "Compatible Flags:%s%s\n"
2453 "Incompatible Flags:%s%s%s\n"
2454 "Header size: %"PRIu64"\n"
2455 "Arena size: %"PRIu64"\n"
2456 "Data Hash Table Size: %"PRIu64"\n"
2457 "Field Hash Table Size: %"PRIu64"\n"
2458 "Rotate Suggested: %s\n"
2459 "Head Sequential Number: %"PRIu64"\n"
2460 "Tail Sequential Number: %"PRIu64"\n"
2461 "Head Realtime Timestamp: %s\n"
2462 "Tail Realtime Timestamp: %s\n"
2463 "Tail Monotonic Timestamp: %s\n"
2464 "Objects: %"PRIu64"\n"
2465 "Entry Objects: %"PRIu64"\n",
2467 sd_id128_to_string(f->header->file_id, a),
2468 sd_id128_to_string(f->header->machine_id, b),
2469 sd_id128_to_string(f->header->boot_id, c),
2470 sd_id128_to_string(f->header->seqnum_id, d),
2471 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2472 f->header->state == STATE_ONLINE ? "ONLINE" :
2473 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2474 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2475 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2476 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2477 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2478 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2479 le64toh(f->header->header_size),
2480 le64toh(f->header->arena_size),
2481 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2482 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2483 yes_no(journal_file_rotate_suggested(f, 0)),
2484 le64toh(f->header->head_entry_seqnum),
2485 le64toh(f->header->tail_entry_seqnum),
2486 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2487 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2488 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2489 le64toh(f->header->n_objects),
2490 le64toh(f->header->n_entries));
/* Optional counters: only printed when the header contains the field.
 * Fill level = number of objects / number of hash table items, in percent. */
2492 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2493 printf("Data Objects: %"PRIu64"\n"
2494 "Data Hash Table Fill: %.1f%%\n",
2495 le64toh(f->header->n_data),
2496 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2498 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2499 printf("Field Objects: %"PRIu64"\n"
2500 "Field Hash Table Fill: %.1f%%\n",
2501 le64toh(f->header->n_fields),
2502 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2504 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2505 printf("Tag Objects: %"PRIu64"\n",
2506 le64toh(f->header->n_tags));
2507 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2508 printf("Entry Array Objects: %"PRIu64"\n",
2509 le64toh(f->header->n_entry_arrays));
/* st_blocks is multiplied by 512 to obtain a byte count */
2511 if (fstat(f->fd, &st) >= 0)
2512 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
/* Opens (and, for an empty writable file, initializes) a journal file.
 *
 * Visible steps, in order:
 *  - checks that the access mode is O_RDONLY or O_RDWR and that fname ends
 *    in ".journal" or ".journal~"
 *  - allocates the JournalFile, derives prot/writable from flags and sets
 *    the compression flag (LZ4 if built with it, otherwise XZ)
 *  - takes a reference on mmap_cache or creates a new cache, duplicates
 *    the path, creates the chain cache and opens the file with O_CLOEXEC
 *  - for an empty writable file: stamps the creation time, loads the FSPRG
 *    state, writes the initial header and marks the file as newly created
 *  - maps the header, then verifies it for files that already existed
 *  - applies metrics (from 'metrics', else copied from 'template')
 *  - refreshes the header, sets up HMAC, and for new files creates the
 *    field/data hash tables and appends the first tag
 *  - maps both hash tables and checks whether a SIGBUS was recorded for
 *    this fd
 * On the failure path the file is closed with journal_file_close().
 * NOTE(review): the exact conditions guarding several of these steps and
 * the returned error codes are not visible in this view. */
2515 int journal_file_open(
2521 JournalMetrics *metrics,
2522 MMapCache *mmap_cache,
2523 JournalFile *template,
2524 JournalFile **ret) {
2526 bool newly_created = false;
2534 if ((flags & O_ACCMODE) != O_RDONLY &&
2535 (flags & O_ACCMODE) != O_RDWR)
2538 if (!endswith(fname, ".journal") &&
2539 !endswith(fname, ".journal~"))
2542 f = new0(JournalFile, 1);
2550 f->prot = prot_from_flags(flags);
2551 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2552 #if defined(HAVE_LZ4)
2553 f->compress_lz4 = compress;
2554 #elif defined(HAVE_XZ)
2555 f->compress_xz = compress;
2562 f->mmap = mmap_cache_ref(mmap_cache);
2564 f->mmap = mmap_cache_new();
2571 f->path = strdup(fname);
2577 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2578 if (!f->chain_cache) {
2583 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2589 r = journal_file_fstat(f);
/* A zero-sized file opened for writing is treated as a brand new journal */
2593 if (f->last_stat.st_size == 0 && f->writable) {
2594 /* Let's attach the creation time to the journal file,
2595 * so that the vacuuming code knows the age of this
2596 * file even if the file might end up corrupted one
2597 * day... Ideally we'd just use the creation time many
2598 * file systems maintain for each file, but there is
2599 * currently no usable API to query this, hence let's
2600 * emulate this via extended attributes. If extended
2601 * attributes are not supported we'll just skip this,
2602 * and rely solely on mtime/atime/ctime of the file. */
2604 fd_setcrtime(f->fd, now(CLOCK_REALTIME));
2607 /* Try to load the FSPRG state, and if we can't, then
2608 * just don't do sealing */
2610 r = journal_file_fss_load(f);
2616 r = journal_file_init_header(f, template);
2620 r = journal_file_fstat(f);
2624 newly_created = true;
/* Anything smaller than the minimal header cannot be a valid journal */
2627 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2632 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2640 if (!newly_created) {
2641 r = journal_file_verify_header(f);
2647 if (!newly_created && f->writable) {
2648 r = journal_file_fss_load(f);
2656 journal_default_metrics(metrics, f->fd);
2657 f->metrics = *metrics;
2658 } else if (template)
2659 f->metrics = template->metrics;
2661 r = journal_file_refresh_header(f);
2667 r = journal_file_hmac_setup(f);
/* Only a freshly created file needs its hash tables and first tag written */
2672 if (newly_created) {
2673 r = journal_file_setup_field_hash_table(f);
2677 r = journal_file_setup_data_hash_table(f);
2682 r = journal_file_append_first_tag(f);
2688 r = journal_file_map_field_hash_table(f);
2692 r = journal_file_map_data_hash_table(f);
2696 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2705 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2708 journal_file_close(f);
/* Rotates the journal file *f. The file must be writable and its path
 * must end in ".journal". The archive name is the old path with the
 * 8-character ".journal" suffix replaced by
 * "@<seqnum_id>-<head seqnum>-<head realtime>.journal". The file is
 * renamed to that name (ENOENT from rename() is tolerated), its header
 * state is set to STATE_ARCHIVED and it is flagged for defragmentation on
 * close. A new file is then opened at the old path with the old file as
 * template and the same mmap cache, and the old file is closed. */
2713 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2714 _cleanup_free_ char *p = NULL;
2716 JournalFile *old_file, *new_file = NULL;
2724 if (!old_file->writable)
2727 if (!endswith(old_file->path, ".journal"))
2730 l = strlen(old_file->path);
2731 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2732 (int) l - 8, old_file->path,
2733 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2734 le64toh((*f)->header->head_entry_seqnum),
2735 le64toh((*f)->header->head_entry_realtime));
2739 /* Try to rename the file to the archived version. If the file
2740 * already was deleted, we'll get ENOENT, let's ignore that
2742 r = rename(old_file->path, p);
2743 if (r < 0 && errno != ENOENT)
2746 old_file->header->state = STATE_ARCHIVED;
2748 /* Currently, btrfs is not very good with our write patterns
2749 * and fragments heavily. Let's defrag our journal files when
2750 * we archive them */
2751 old_file->defrag_on_close = true;
2753 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2754 journal_file_close(old_file);
/* Opens a journal file and, if that fails in a recoverable way, moves the
 * broken file aside and tries once more.
 *
 * The first journal_file_open() result is examined: only the error codes
 * listed below (corruption, truncation, foreign machine, unsupported
 * feature, unclean shutdown, already archived, I/O error, deleted file)
 * lead to the recovery path. The visible guards further check that the
 * file was not opened read-only, that O_CREAT was requested and that
 * fname ends in ".journal". In the recovery path the file is renamed to a
 * "...journal~" name containing the current realtime clock, defragmented
 * on a best-effort basis (result ignored), a warning is logged and
 * journal_file_open() is called again with the same arguments. */
2760 int journal_file_open_reliably(
2766 JournalMetrics *metrics,
2767 MMapCache *mmap_cache,
2768 JournalFile *template,
2769 JournalFile **ret) {
2773 _cleanup_free_ char *p = NULL;
2775 r = journal_file_open(fname, flags, mode, compress, seal,
2776 metrics, mmap_cache, template, ret);
2777 if (r != -EBADMSG && /* corrupted */
2778 r != -ENODATA && /* truncated */
2779 r != -EHOSTDOWN && /* other machine */
2780 r != -EPROTONOSUPPORT && /* incompatible feature */
2781 r != -EBUSY && /* unclean shutdown */
2782 r != -ESHUTDOWN && /* already archived */
2783 r != -EIO && /* IO error, including SIGBUS on mmap */
2784 r != -EIDRM /* File has been deleted */)
2787 if ((flags & O_ACCMODE) == O_RDONLY)
2790 if (!(flags & O_CREAT))
2793 if (!endswith(fname, ".journal"))
2796 /* The file is corrupted. Rotate it away and try it again (but only once) */
2799 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2801 (unsigned long long) now(CLOCK_REALTIME),
2805 r = rename(fname, p);
2809 /* btrfs doesn't cope well with our write pattern and
2810 * fragments heavily. Let's defrag all files we rotate */
2811 (void) btrfs_defrag(p);
2813 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2815 return journal_file_open(fname, flags, mode, compress, seal,
2816 metrics, mmap_cache, template, ret);
/* Copies entry o, located at offset p in 'from', into journal file 'to'.
 *
 * The entry's monotonic and realtime timestamps are carried over. For each
 * of its n items the referenced data object is loaded from 'from', its
 * hash is compared with the hash stored in the item, the payload is
 * decompressed if a compression flag is set, and the data is appended to
 * 'to'. The offset and hash of each appended data object are collected in
 * 'items', and the hashes are XORed together. Finally the new entry is
 * appended with journal_file_append_entry_internal(), and 'to' is checked
 * for a recorded SIGBUS. */
2819 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2821 uint64_t q, xor_hash = 0;
2834 ts.monotonic = le64toh(o->entry.monotonic);
2835 ts.realtime = le64toh(o->entry.realtime);
2837 n = journal_file_entry_n_items(o);
2838 /* alloca() can't take 0, hence let's allocate at least one */
2839 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2841 for (i = 0; i < n; i++) {
2848 q = le64toh(o->entry.items[i].object_offset);
2849 le_hash = o->entry.items[i].hash;
/* From here on o points at the data object, no longer at the entry */
2851 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2855 if (le_hash != o->data.hash)
/* Payload length = object size minus the fixed data object header */
2858 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2861 /* We hit the limit on 32bit machines */
2862 if ((uint64_t) t != l)
2865 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2866 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2869 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2870 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2874 data = from->compress_buffer;
2877 return -EPROTONOSUPPORT;
2880 data = o->data.payload;
2882 r = journal_file_append_data(to, data, l, &u, &h);
2886 xor_hash ^= le64toh(u->data.hash);
2887 items[i].object_offset = htole64(h);
2888 items[i].hash = u->data.hash;
/* Re-load the entry at p, since o was redirected to the data object above */
2890 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2895 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2897 if (mmap_cache_got_sigbus(to->mmap, to->fd))
/* Fills in and normalizes the size limits in m. A field equal to
 * (uint64_t) -1 counts as unset and receives a default; where possible the
 * default is derived from the size of the file system that holds fd
 * (f_frsize * f_blocks from fstatvfs()).
 *
 *  max_use:   unset: 10% of the file system, clamped to
 *             [DEFAULT_MAX_USE_LOWER, DEFAULT_MAX_USE_UPPER], or
 *             DEFAULT_MAX_USE_LOWER as fallback; set: page-aligned and at
 *             least 2 * JOURNAL_FILE_SIZE_MIN.
 *  max_size:  unset: max_use / 8, capped at DEFAULT_MAX_SIZE_UPPER;
 *             set: page-aligned. At least JOURNAL_FILE_SIZE_MIN; max_use
 *             is raised to 2 * max_size if it would otherwise be smaller.
 *  min_size:  unset: JOURNAL_FILE_SIZE_MIN; set: page-aligned and at least
 *             JOURNAL_FILE_SIZE_MIN; max_size is raised to min_size if it
 *             would otherwise be smaller.
 *  keep_free: unset: 15% of the file system, capped at
 *             DEFAULT_KEEP_FREE_UPPER, or DEFAULT_KEEP_FREE as fallback.
 *
 * The resulting values are logged at debug level.
 * NOTE(review): the branch conditions that select the fallbacks are not
 * visible in this view; presumably they apply when the file system size
 * could not be determined - confirm. */
2903 void journal_default_metrics(JournalMetrics *m, int fd) {
2904 uint64_t fs_size = 0;
2906 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2911 if (fstatvfs(fd, &ss) >= 0)
2912 fs_size = ss.f_frsize * ss.f_blocks;
2914 if (m->max_use == (uint64_t) -1) {
2917 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2919 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2920 m->max_use = DEFAULT_MAX_USE_UPPER;
2922 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2923 m->max_use = DEFAULT_MAX_USE_LOWER;
2925 m->max_use = DEFAULT_MAX_USE_LOWER;
2927 m->max_use = PAGE_ALIGN(m->max_use);
2929 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2930 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2933 if (m->max_size == (uint64_t) -1) {
2934 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2936 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2937 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2939 m->max_size = PAGE_ALIGN(m->max_size);
2941 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2942 m->max_size = JOURNAL_FILE_SIZE_MIN;
/* Keep room for at least two files of maximum size */
2944 if (m->max_size*2 > m->max_use)
2945 m->max_use = m->max_size*2;
2947 if (m->min_size == (uint64_t) -1)
2948 m->min_size = JOURNAL_FILE_SIZE_MIN;
2950 m->min_size = PAGE_ALIGN(m->min_size);
2952 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2953 m->min_size = JOURNAL_FILE_SIZE_MIN;
2955 if (m->min_size > m->max_size)
2956 m->max_size = m->min_size;
2959 if (m->keep_free == (uint64_t) -1) {
2962 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2964 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2965 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2968 m->keep_free = DEFAULT_KEEP_FREE;
2971 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2972 format_bytes(a, sizeof(a), m->max_use),
2973 format_bytes(b, sizeof(b), m->max_size),
2974 format_bytes(c, sizeof(c), m->min_size),
2975 format_bytes(d, sizeof(d), m->keep_free));
/* Reports the realtime range covered by the file, taken from the header:
 * *from receives head_entry_realtime and *to receives tail_entry_realtime,
 * both converted from little-endian. A timestamp of zero in the header is
 * special-cased before the corresponding value is stored.
 * NOTE(review): the return values are not visible in this view. */
2978 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2983 if (f->header->head_entry_realtime == 0)
2986 *from = le64toh(f->header->head_entry_realtime);
2990 if (f->header->tail_entry_realtime == 0)
2993 *to = le64toh(f->header->tail_entry_realtime);
/* Reports the monotonic time range that the file covers for one boot.
 * The data object for boot_id is looked up (its offset is kept in p) and
 * its entry count is checked. *from is the monotonic timestamp of the
 * entry at the data object's entry_offset. *to is the monotonic timestamp
 * of the entry at index n_entries - 1, fetched with
 * generic_array_get_plus_one(). */
2999 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3007 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
3011 if (le64toh(o->data.n_entries) <= 0)
/* After this call o refers to the first entry, not the data object */
3015 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3019 *from = le64toh(o->entry.monotonic);
/* Go back to the data object at p, since o was redirected to an entry */
3023 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3027 r = generic_array_get_plus_one(f,
3028 le64toh(o->data.entry_offset),
3029 le64toh(o->data.entry_array_offset),
3030 le64toh(o->data.n_entries)-1,
3035 *to = le64toh(o->entry.monotonic);
3041 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3044 /* If we gained new header fields we gained new features,
3045 * hence suggest a rotation */
3046 if (le64toh(f->header->header_size) < sizeof(Header)) {
3047 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3051 /* Let's check if the hash tables grew over a certain fill
3052 * level (75%, borrowing this value from Java's hash table
3053 * implementation), and if so suggest a rotation. To calculate
3054 * the fill level we need the n_data field, which only exists
3055 * in newer versions. */
3057 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3058 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3059 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3061 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3062 le64toh(f->header->n_data),
3063 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3064 (unsigned long long) f->last_stat.st_size,
3065 f->last_stat.st_size / le64toh(f->header->n_data));
3069 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3070 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3071 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3073 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3074 le64toh(f->header->n_fields),
3075 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3079 /* Are the data objects properly indexed by field objects? */
3080 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3081 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3082 le64toh(f->header->n_data) > 0 &&
3083 le64toh(f->header->n_fields) == 0)
3086 if (max_file_usec > 0) {
3089 h = le64toh(f->header->head_entry_realtime);
3090 t = now(CLOCK_REALTIME);
3092 if (h > 0 && t > h + max_file_usec)