/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2011 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
26 #include <sys/statvfs.h>
31 #include "btrfs-util.h"
32 #include "journal-def.h"
33 #include "journal-file.h"
34 #include "journal-authenticate.h"
38 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
39 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
41 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
43 /* This is the minimum journal file size */
44 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
46 /* These are the lower and upper bounds if we deduce the max_use value
47 * from the file system size */
48 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
49 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
51 /* This is the upper bound if we deduce max_size from max_use */
52 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
54 /* This is the upper bound if we deduce the keep_free value from the
56 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
58 /* This is the keep_free value when we can't determine the system
60 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
62 /* n_data was the first entry we added after the initial file format design */
63 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
65 /* How many entries to keep in the entry array chain cache at max */
66 #define CHAIN_CACHE_MAX 20
68 /* How much to increase the journal file size at once each time we allocate something new. */
69 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
71 /* Reread fstat() of the file for detecting deletions at least this often */
72 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
74 /* The mmap context to use for the header we pick as one above the last defined typed */
75 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
77 static int journal_file_set_online(JournalFile *f) {
83 if (!(f->fd >= 0 && f->header))
86 if (mmap_cache_got_sigbus(f->mmap, f->fd))
89 switch(f->header->state) {
94 f->header->state = STATE_ONLINE;
103 int journal_file_set_offline(JournalFile *f) {
109 if (!(f->fd >= 0 && f->header))
112 if (f->header->state != STATE_ONLINE)
117 if (mmap_cache_got_sigbus(f->mmap, f->fd))
120 f->header->state = STATE_OFFLINE;
122 if (mmap_cache_got_sigbus(f->mmap, f->fd))
130 void journal_file_close(JournalFile *f) {
134 /* Write the final tag */
135 if (f->seal && f->writable)
136 journal_file_append_tag(f);
139 journal_file_set_offline(f);
141 if (f->mmap && f->fd >= 0)
142 mmap_cache_close_fd(f->mmap, f->fd);
144 if (f->fd >= 0 && f->defrag_on_close) {
146 /* Be friendly to btrfs: turn COW back on again now,
147 * and defragment the file. We won't write to the file
148 * ever again, hence remove all fragmentation, and
149 * reenable all the good bits COW usually provides
150 * (such as data checksumming). */
152 (void) chattr_fd(f->fd, false, FS_NOCOW_FL);
153 (void) btrfs_defrag_fd(f->fd);
160 mmap_cache_unref(f->mmap);
162 ordered_hashmap_free_free(f->chain_cache);
164 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
165 free(f->compress_buffer);
170 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
171 else if (f->fsprg_state)
172 free(f->fsprg_state);
177 gcry_md_close(f->hmac);
183 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
190 memcpy(h.signature, HEADER_SIGNATURE, 8);
191 h.header_size = htole64(ALIGN64(sizeof(h)));
193 h.incompatible_flags |= htole32(
194 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
195 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
197 h.compatible_flags = htole32(
198 f->seal * HEADER_COMPATIBLE_SEALED);
200 r = sd_id128_randomize(&h.file_id);
205 h.seqnum_id = template->header->seqnum_id;
206 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
208 h.seqnum_id = h.file_id;
210 k = pwrite(f->fd, &h, sizeof(h), 0);
220 static int journal_file_refresh_header(JournalFile *f) {
226 r = sd_id128_get_machine(&f->header->machine_id);
230 r = sd_id128_get_boot(&boot_id);
234 if (sd_id128_equal(boot_id, f->header->boot_id))
235 f->tail_entry_monotonic_valid = true;
237 f->header->boot_id = boot_id;
239 r = journal_file_set_online(f);
241 /* Sync the online state to disk */
247 static int journal_file_verify_header(JournalFile *f) {
252 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
255 /* In both read and write mode we refuse to open files with
256 * incompatible flags we don't know */
257 flags = le32toh(f->header->incompatible_flags);
258 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
259 if (flags & ~HEADER_INCOMPATIBLE_ANY)
260 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
261 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
262 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
264 log_debug("Journal file %s uses incompatible flags %"PRIx32
265 " disabled at compilation time.", f->path, flags);
266 return -EPROTONOSUPPORT;
269 /* When open for writing we refuse to open files with
270 * compatible flags, too */
271 flags = le32toh(f->header->compatible_flags);
272 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
273 if (flags & ~HEADER_COMPATIBLE_ANY)
274 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
275 f->path, flags & ~HEADER_COMPATIBLE_ANY);
276 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
278 log_debug("Journal file %s uses compatible flags %"PRIx32
279 " disabled at compilation time.", f->path, flags);
280 return -EPROTONOSUPPORT;
283 if (f->header->state >= _STATE_MAX)
286 /* The first addition was n_data, so check that we are at least this large */
287 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
290 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
293 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
296 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
299 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
300 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
301 !VALID64(le64toh(f->header->tail_object_offset)) ||
302 !VALID64(le64toh(f->header->entry_array_offset)))
307 sd_id128_t machine_id;
310 r = sd_id128_get_machine(&machine_id);
314 if (!sd_id128_equal(machine_id, f->header->machine_id))
317 state = f->header->state;
319 if (state == STATE_ONLINE) {
320 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
322 } else if (state == STATE_ARCHIVED)
324 else if (state != STATE_OFFLINE) {
325 log_debug("Journal file %s has unknown state %i.", f->path, state);
330 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
331 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
333 f->seal = JOURNAL_HEADER_SEALED(f->header);
338 static int journal_file_fstat(JournalFile *f) {
342 if (fstat(f->fd, &f->last_stat) < 0)
345 f->last_stat_usec = now(CLOCK_MONOTONIC);
347 /* Refuse appending to files that are already deleted */
348 if (f->last_stat.st_nlink <= 0)
354 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
355 uint64_t old_size, new_size;
360 /* We assume that this file is not sparse, and we know that
361 * for sure, since we always call posix_fallocate()
364 if (mmap_cache_got_sigbus(f->mmap, f->fd))
368 le64toh(f->header->header_size) +
369 le64toh(f->header->arena_size);
371 new_size = PAGE_ALIGN(offset + size);
372 if (new_size < le64toh(f->header->header_size))
373 new_size = le64toh(f->header->header_size);
375 if (new_size <= old_size) {
377 /* We already pre-allocated enough space, but before
378 * we write to it, let's check with fstat() if the
379 * file got deleted, in order make sure we don't throw
380 * away the data immediately. Don't check fstat() for
381 * all writes though, but only once ever 10s. */
383 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
386 return journal_file_fstat(f);
389 /* Allocate more space. */
391 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
394 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
397 if (fstatvfs(f->fd, &svfs) >= 0) {
400 available = svfs.f_bfree * svfs.f_bsize;
402 if (available >= f->metrics.keep_free)
403 available -= f->metrics.keep_free;
407 if (new_size - old_size > available)
412 /* Increase by larger blocks at once */
413 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
414 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
415 new_size = f->metrics.max_size;
417 /* Note that the glibc fallocate() fallback is very
418 inefficient, hence we try to minimize the allocation area
420 r = posix_fallocate(f->fd, old_size, new_size - old_size);
424 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
426 return journal_file_fstat(f);
429 static unsigned type_to_context(ObjectType type) {
430 /* One context for each type, plus one catch-all for the rest */
431 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
432 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
433 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
436 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
445 /* Avoid SIGBUS on invalid accesses */
446 if (offset + size > (uint64_t) f->last_stat.st_size) {
447 /* Hmm, out of range? Let's refresh the fstat() data
448 * first, before we trust that check. */
450 r = journal_file_fstat(f);
454 if (offset + size > (uint64_t) f->last_stat.st_size)
455 return -EADDRNOTAVAIL;
458 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
461 static uint64_t minimum_header_size(Object *o) {
463 static const uint64_t table[] = {
464 [OBJECT_DATA] = sizeof(DataObject),
465 [OBJECT_FIELD] = sizeof(FieldObject),
466 [OBJECT_ENTRY] = sizeof(EntryObject),
467 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
468 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
469 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
470 [OBJECT_TAG] = sizeof(TagObject),
473 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
474 return sizeof(ObjectHeader);
476 return table[o->object.type];
479 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
488 /* Objects may only be located at multiple of 64 bit */
489 if (!VALID64(offset))
492 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
497 s = le64toh(o->object.size);
499 if (s < sizeof(ObjectHeader))
502 if (o->object.type <= OBJECT_UNUSED)
505 if (s < minimum_header_size(o))
508 if (type > OBJECT_UNUSED && o->object.type != type)
511 if (s > sizeof(ObjectHeader)) {
512 r = journal_file_move_to(f, type, false, offset, s, &t);
523 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
528 r = le64toh(f->header->tail_entry_seqnum) + 1;
531 /* If an external seqnum counter was passed, we update
532 * both the local and the external one, and set it to
533 * the maximum of both */
541 f->header->tail_entry_seqnum = htole64(r);
543 if (f->header->head_entry_seqnum == 0)
544 f->header->head_entry_seqnum = htole64(r);
549 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
556 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
557 assert(size >= sizeof(ObjectHeader));
561 r = journal_file_set_online(f);
565 p = le64toh(f->header->tail_object_offset);
567 p = le64toh(f->header->header_size);
569 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
573 p += ALIGN64(le64toh(tail->object.size));
576 r = journal_file_allocate(f, p, size);
580 r = journal_file_move_to(f, type, false, p, size, &t);
587 o->object.type = type;
588 o->object.size = htole64(size);
590 f->header->tail_object_offset = htole64(p);
591 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
599 static int journal_file_setup_data_hash_table(JournalFile *f) {
606 /* We estimate that we need 1 hash table entry per 768 of
607 journal file and we want to make sure we never get beyond
608 75% fill level. Calculate the hash table size for the
609 maximum file size based on these metrics. */
611 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
612 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
613 s = DEFAULT_DATA_HASH_TABLE_SIZE;
615 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
617 r = journal_file_append_object(f,
618 OBJECT_DATA_HASH_TABLE,
619 offsetof(Object, hash_table.items) + s,
624 memzero(o->hash_table.items, s);
626 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
627 f->header->data_hash_table_size = htole64(s);
632 static int journal_file_setup_field_hash_table(JournalFile *f) {
639 /* We use a fixed size hash table for the fields as this
640 * number should grow very slowly only */
642 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
643 r = journal_file_append_object(f,
644 OBJECT_FIELD_HASH_TABLE,
645 offsetof(Object, hash_table.items) + s,
650 memzero(o->hash_table.items, s);
652 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
653 f->header->field_hash_table_size = htole64(s);
658 static int journal_file_map_data_hash_table(JournalFile *f) {
665 p = le64toh(f->header->data_hash_table_offset);
666 s = le64toh(f->header->data_hash_table_size);
668 r = journal_file_move_to(f,
669 OBJECT_DATA_HASH_TABLE,
676 f->data_hash_table = t;
680 static int journal_file_map_field_hash_table(JournalFile *f) {
687 p = le64toh(f->header->field_hash_table_offset);
688 s = le64toh(f->header->field_hash_table_size);
690 r = journal_file_move_to(f,
691 OBJECT_FIELD_HASH_TABLE,
698 f->field_hash_table = t;
702 static int journal_file_link_field(
715 if (o->object.type != OBJECT_FIELD)
718 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
722 /* This might alter the window we are looking at */
723 o->field.next_hash_offset = o->field.head_data_offset = 0;
726 p = le64toh(f->field_hash_table[h].tail_hash_offset);
728 f->field_hash_table[h].head_hash_offset = htole64(offset);
730 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
734 o->field.next_hash_offset = htole64(offset);
737 f->field_hash_table[h].tail_hash_offset = htole64(offset);
739 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
740 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
745 static int journal_file_link_data(
758 if (o->object.type != OBJECT_DATA)
761 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
765 /* This might alter the window we are looking at */
766 o->data.next_hash_offset = o->data.next_field_offset = 0;
767 o->data.entry_offset = o->data.entry_array_offset = 0;
768 o->data.n_entries = 0;
771 p = le64toh(f->data_hash_table[h].tail_hash_offset);
773 /* Only entry in the hash table is easy */
774 f->data_hash_table[h].head_hash_offset = htole64(offset);
776 /* Move back to the previous data object, to patch in
779 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
783 o->data.next_hash_offset = htole64(offset);
786 f->data_hash_table[h].tail_hash_offset = htole64(offset);
788 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
789 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
794 int journal_file_find_field_object_with_hash(
796 const void *field, uint64_t size, uint64_t hash,
797 Object **ret, uint64_t *offset) {
799 uint64_t p, osize, h, m;
803 assert(field && size > 0);
805 osize = offsetof(Object, field.payload) + size;
807 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
813 p = le64toh(f->field_hash_table[h].head_hash_offset);
818 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
822 if (le64toh(o->field.hash) == hash &&
823 le64toh(o->object.size) == osize &&
824 memcmp(o->field.payload, field, size) == 0) {
834 p = le64toh(o->field.next_hash_offset);
840 int journal_file_find_field_object(
842 const void *field, uint64_t size,
843 Object **ret, uint64_t *offset) {
848 assert(field && size > 0);
850 hash = hash64(field, size);
852 return journal_file_find_field_object_with_hash(f,
857 int journal_file_find_data_object_with_hash(
859 const void *data, uint64_t size, uint64_t hash,
860 Object **ret, uint64_t *offset) {
862 uint64_t p, osize, h, m;
866 assert(data || size == 0);
868 osize = offsetof(Object, data.payload) + size;
870 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
875 p = le64toh(f->data_hash_table[h].head_hash_offset);
880 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
884 if (le64toh(o->data.hash) != hash)
887 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
888 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
892 l = le64toh(o->object.size);
893 if (l <= offsetof(Object, data.payload))
896 l -= offsetof(Object, data.payload);
898 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
899 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
904 memcmp(f->compress_buffer, data, size) == 0) {
915 return -EPROTONOSUPPORT;
917 } else if (le64toh(o->object.size) == osize &&
918 memcmp(o->data.payload, data, size) == 0) {
930 p = le64toh(o->data.next_hash_offset);
936 int journal_file_find_data_object(
938 const void *data, uint64_t size,
939 Object **ret, uint64_t *offset) {
944 assert(data || size == 0);
946 hash = hash64(data, size);
948 return journal_file_find_data_object_with_hash(f,
953 static int journal_file_append_field(
955 const void *field, uint64_t size,
956 Object **ret, uint64_t *offset) {
964 assert(field && size > 0);
966 hash = hash64(field, size);
968 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
982 osize = offsetof(Object, field.payload) + size;
983 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
987 o->field.hash = htole64(hash);
988 memcpy(o->field.payload, field, size);
990 r = journal_file_link_field(f, o, p, hash);
994 /* The linking might have altered the window, so let's
995 * refresh our pointer */
996 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1001 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1015 static int journal_file_append_data(
1017 const void *data, uint64_t size,
1018 Object **ret, uint64_t *offset) {
1023 int r, compression = 0;
1027 assert(data || size == 0);
1029 hash = hash64(data, size);
1031 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1045 osize = offsetof(Object, data.payload) + size;
1046 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1050 o->data.hash = htole64(hash);
1052 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
1053 if (f->compress_xz &&
1054 size >= COMPRESSION_SIZE_THRESHOLD) {
1057 compression = compress_blob(data, size, o->data.payload, &rsize);
1060 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1061 o->object.flags |= compression;
1063 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1064 size, rsize, object_compressed_to_string(compression));
1069 if (!compression && size > 0)
1070 memcpy(o->data.payload, data, size);
1072 r = journal_file_link_data(f, o, p, hash);
1076 /* The linking might have altered the window, so let's
1077 * refresh our pointer */
1078 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1085 eq = memchr(data, '=', size);
1086 if (eq && eq > data) {
1090 /* Create field object ... */
1091 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1095 /* ... and link it in. */
1096 o->data.next_field_offset = fo->field.head_data_offset;
1097 fo->field.head_data_offset = le64toh(p);
1101 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1115 uint64_t journal_file_entry_n_items(Object *o) {
1118 if (o->object.type != OBJECT_ENTRY)
1121 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1124 uint64_t journal_file_entry_array_n_items(Object *o) {
1127 if (o->object.type != OBJECT_ENTRY_ARRAY)
1130 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1133 uint64_t journal_file_hash_table_n_items(Object *o) {
1136 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1137 o->object.type != OBJECT_FIELD_HASH_TABLE)
1140 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1143 static int link_entry_into_array(JournalFile *f,
1148 uint64_t n = 0, ap = 0, q, i, a, hidx;
1156 a = le64toh(*first);
1157 i = hidx = le64toh(*idx);
1160 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1164 n = journal_file_entry_array_n_items(o);
1166 o->entry_array.items[i] = htole64(p);
1167 *idx = htole64(hidx + 1);
1173 a = le64toh(o->entry_array.next_entry_array_offset);
1184 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1185 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1191 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1196 o->entry_array.items[i] = htole64(p);
1199 *first = htole64(q);
1201 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1205 o->entry_array.next_entry_array_offset = htole64(q);
1208 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1209 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1211 *idx = htole64(hidx + 1);
1216 static int link_entry_into_array_plus_one(JournalFile *f,
1231 *extra = htole64(p);
1235 i = htole64(le64toh(*idx) - 1);
1236 r = link_entry_into_array(f, first, &i, p);
1241 *idx = htole64(le64toh(*idx) + 1);
1245 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1252 p = le64toh(o->entry.items[i].object_offset);
1256 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1260 return link_entry_into_array_plus_one(f,
1261 &o->data.entry_offset,
1262 &o->data.entry_array_offset,
1267 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1275 if (o->object.type != OBJECT_ENTRY)
1278 __sync_synchronize();
1280 /* Link up the entry itself */
1281 r = link_entry_into_array(f,
1282 &f->header->entry_array_offset,
1283 &f->header->n_entries,
1288 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1290 if (f->header->head_entry_realtime == 0)
1291 f->header->head_entry_realtime = o->entry.realtime;
1293 f->header->tail_entry_realtime = o->entry.realtime;
1294 f->header->tail_entry_monotonic = o->entry.monotonic;
1296 f->tail_entry_monotonic_valid = true;
1298 /* Link up the items */
1299 n = journal_file_entry_n_items(o);
1300 for (i = 0; i < n; i++) {
1301 r = journal_file_link_entry_item(f, o, offset, i);
1309 static int journal_file_append_entry_internal(
1311 const dual_timestamp *ts,
1313 const EntryItem items[], unsigned n_items,
1315 Object **ret, uint64_t *offset) {
1322 assert(items || n_items == 0);
1325 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1327 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1331 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1332 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1333 o->entry.realtime = htole64(ts->realtime);
1334 o->entry.monotonic = htole64(ts->monotonic);
1335 o->entry.xor_hash = htole64(xor_hash);
1336 o->entry.boot_id = f->header->boot_id;
1339 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1344 r = journal_file_link_entry(f, o, np);
1357 void journal_file_post_change(JournalFile *f) {
1360 /* inotify() does not receive IN_MODIFY events from file
1361 * accesses done via mmap(). After each access we hence
1362 * trigger IN_MODIFY by truncating the journal file to its
1363 * current size which triggers IN_MODIFY. */
1365 __sync_synchronize();
1367 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1368 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1371 static int entry_item_cmp(const void *_a, const void *_b) {
1372 const EntryItem *a = _a, *b = _b;
1374 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1376 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1381 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1385 uint64_t xor_hash = 0;
1386 struct dual_timestamp _ts;
1389 assert(iovec || n_iovec == 0);
1392 dual_timestamp_get(&_ts);
1396 if (f->tail_entry_monotonic_valid &&
1397 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1401 r = journal_file_maybe_append_tag(f, ts->realtime);
1406 /* alloca() can't take 0, hence let's allocate at least one */
1407 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1409 for (i = 0; i < n_iovec; i++) {
1413 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1417 xor_hash ^= le64toh(o->data.hash);
1418 items[i].object_offset = htole64(p);
1419 items[i].hash = o->data.hash;
1422 /* Order by the position on disk, in order to improve seek
1423 * times for rotating media. */
1424 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1426 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1428 /* If the memory mapping triggered a SIGBUS then we return an
1429 * IO error and ignore the error code passed down to us, since
1430 * it is very likely just an effect of a nullified replacement
1433 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1436 journal_file_post_change(f);
1441 typedef struct ChainCacheItem {
1442 uint64_t first; /* the array at the beginning of the chain */
1443 uint64_t array; /* the cached array */
1444 uint64_t begin; /* the first item in the cached array */
1445 uint64_t total; /* the total number of items in all arrays before this one in the chain */
1446 uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
1449 static void chain_cache_put(
1456 uint64_t last_index) {
1459 /* If the chain item to cache for this chain is the
1460 * first one it's not worth caching anything */
1464 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1465 ci = ordered_hashmap_steal_first(h);
1468 ci = new(ChainCacheItem, 1);
1475 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1480 assert(ci->first == first);
1485 ci->last_index = last_index;
1488 static int generic_array_get(
1492 Object **ret, uint64_t *offset) {
1495 uint64_t p = 0, a, t = 0;
1503 /* Try the chain cache first */
1504 ci = ordered_hashmap_get(f->chain_cache, &first);
1505 if (ci && i > ci->total) {
1514 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1518 k = journal_file_entry_array_n_items(o);
1520 p = le64toh(o->entry_array.items[i]);
1526 a = le64toh(o->entry_array.next_entry_array_offset);
1532 /* Let's cache this item for the next invocation */
1533 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1535 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1548 static int generic_array_get_plus_one(
1553 Object **ret, uint64_t *offset) {
1562 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1575 return generic_array_get(f, first, i-1, ret, offset);
1584 static int generic_array_bisect(
1589 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1590 direction_t direction,
1595 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1596 bool subtract_one = false;
1597 Object *o, *array = NULL;
1602 assert(test_object);
1604 /* Start with the first array in the chain */
1607 ci = ordered_hashmap_get(f->chain_cache, &first);
1608 if (ci && n > ci->total) {
1609 /* Ah, we have iterated this bisection array chain
1610 * previously! Let's see if we can skip ahead in the
1611 * chain, as far as the last time. But we can't jump
1612 * backwards in the chain, so let's check that
1615 r = test_object(f, ci->begin, needle);
1619 if (r == TEST_LEFT) {
1620 /* OK, what we are looking for is right of the
1621 * begin of this EntryArray, so let's jump
1622 * straight to previously cached array in the
1628 last_index = ci->last_index;
1633 uint64_t left, right, k, lp;
1635 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1639 k = journal_file_entry_array_n_items(array);
1645 lp = p = le64toh(array->entry_array.items[i]);
1649 r = test_object(f, p, needle);
1653 if (r == TEST_FOUND)
1654 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1656 if (r == TEST_RIGHT) {
1660 if (last_index != (uint64_t) -1) {
1661 assert(last_index <= right);
1663 /* If we cached the last index we
1664 * looked at, let's try to not to jump
1665 * too wildly around and see if we can
1666 * limit the range to look at early to
1667 * the immediate neighbors of the last
1668 * index we looked at. */
1670 if (last_index > 0) {
1671 uint64_t x = last_index - 1;
1673 p = le64toh(array->entry_array.items[x]);
1677 r = test_object(f, p, needle);
1681 if (r == TEST_FOUND)
1682 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1684 if (r == TEST_RIGHT)
1690 if (last_index < right) {
1691 uint64_t y = last_index + 1;
1693 p = le64toh(array->entry_array.items[y]);
1697 r = test_object(f, p, needle);
1701 if (r == TEST_FOUND)
1702 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1704 if (r == TEST_RIGHT)
1712 if (left == right) {
1713 if (direction == DIRECTION_UP)
1714 subtract_one = true;
1720 assert(left < right);
1721 i = (left + right) / 2;
1723 p = le64toh(array->entry_array.items[i]);
1727 r = test_object(f, p, needle);
1731 if (r == TEST_FOUND)
1732 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1734 if (r == TEST_RIGHT)
1742 if (direction == DIRECTION_UP) {
1744 subtract_one = true;
1755 last_index = (uint64_t) -1;
1756 a = le64toh(array->entry_array.next_entry_array_offset);
1762 if (subtract_one && t == 0 && i == 0)
1765 /* Let's cache this item for the next invocation */
1766 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1768 if (subtract_one && i == 0)
1770 else if (subtract_one)
1771 p = le64toh(array->entry_array.items[i-1]);
1773 p = le64toh(array->entry_array.items[i]);
1775 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1786 *idx = t + i + (subtract_one ? -1 : 0);
/* NOTE(review): fragment of generic_array_bisect_plus_one() — intermediate
 * lines are missing from this view. Like generic_array_bisect(), but an
 * 'extra' entry offset stored outside the array chain is considered first. */
1791 static int generic_array_bisect_plus_one(
1797 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1798 direction_t direction,
1804 bool step_back = false;
1808 assert(test_object);
1813 /* This bisects the array in object 'first', but first checks
/* Probe the out-of-band 'extra' entry before bisecting the array proper. */
1815 r = test_object(f, extra, needle);
1819 if (r == TEST_FOUND)
1820 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1822 /* if we are looking with DIRECTION_UP then we need to first
1823 see if in the actual array there is a matching entry, and
1824 return the last one of that. But if there isn't any we need
1825 to return this one. Hence remember this, and return it
1828 step_back = direction == DIRECTION_UP;
1830 if (r == TEST_RIGHT) {
1831 if (direction == DIRECTION_DOWN)
/* n-1 because the 'extra' item already accounted for one of the n entries. */
1837 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1839 if (r == 0 && step_back)
/* Fall back to returning the 'extra' entry itself. */
1848 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
/* NOTE(review): fragment of test_object_offset() — intermediate lines are
 * missing from this view. Bisection predicate comparing an object offset p
 * directly against the needle offset. */
1864 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1870 else if (p < needle)
/* NOTE(review): fragment of test_object_seqnum() — intermediate lines are
 * missing from this view. Bisection predicate: maps the entry at offset p
 * and compares its (little-endian) seqnum against the needle. */
1876 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1883 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1887 if (le64toh(o->entry.seqnum) == needle)
1889 else if (le64toh(o->entry.seqnum) < needle)
/* NOTE(review): fragment of journal_file_move_to_entry_by_seqnum() —
 * intermediate lines are missing from this view. Seeks an entry by sequence
 * number by bisecting the global entry array in the header. */
1895 int journal_file_move_to_entry_by_seqnum(
1898 direction_t direction,
1902 return generic_array_bisect(f,
1903 le64toh(f->header->entry_array_offset),
1904 le64toh(f->header->n_entries),
/* NOTE(review): fragment of test_object_realtime() — intermediate lines are
 * missing from this view. Bisection predicate over the entry's realtime
 * (wallclock) timestamp. */
1911 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1918 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1922 if (le64toh(o->entry.realtime) == needle)
1924 else if (le64toh(o->entry.realtime) < needle)
/* NOTE(review): fragment of journal_file_move_to_entry_by_realtime() —
 * intermediate lines are missing from this view. Seeks an entry by realtime
 * timestamp via bisection of the global entry array. */
1930 int journal_file_move_to_entry_by_realtime(
1933 direction_t direction,
1937 return generic_array_bisect(f,
1938 le64toh(f->header->entry_array_offset),
1939 le64toh(f->header->n_entries),
1941 test_object_realtime,
/* NOTE(review): fragment of test_object_monotonic() — intermediate lines are
 * missing from this view. Bisection predicate over the entry's monotonic
 * timestamp (only meaningful within a single boot). */
1946 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1953 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1957 if (le64toh(o->entry.monotonic) == needle)
1959 else if (le64toh(o->entry.monotonic) < needle)
/* NOTE(review): fragment of find_data_object_by_boot_id() — intermediate
 * lines are missing from this view. Builds the field string
 * "_BOOT_ID=<32 hex chars>" on the stack and looks up the matching data
 * object. */
1965 static int find_data_object_by_boot_id(
/* Buffer: 9-char prefix + 32 hex digits + NUL; initializer fills the prefix. */
1971 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
/* t + 9 writes the id right after the "_BOOT_ID=" prefix (9 chars). */
1973 sd_id128_to_string(boot_id, t + 9);
/* sizeof(t) - 1 excludes the trailing NUL from the lookup length. */
1974 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
/* NOTE(review): fragment of journal_file_move_to_entry_by_monotonic() —
 * intermediate lines are missing from this view. Resolves the _BOOT_ID=
 * data object first, then bisects its per-data entry list by monotonic
 * timestamp. */
1977 int journal_file_move_to_entry_by_monotonic(
1981 direction_t direction,
1990 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1996 return generic_array_bisect_plus_one(f,
1997 le64toh(o->data.entry_offset),
1998 le64toh(o->data.entry_array_offset),
1999 le64toh(o->data.n_entries),
2001 test_object_monotonic,
2006 void journal_file_reset_location(JournalFile *f) {
2007 f->location_type = LOCATION_HEAD;
2008 f->current_offset = 0;
2009 f->current_seqnum = 0;
2010 f->current_realtime = 0;
2011 f->current_monotonic = 0;
2012 zero(f->current_boot_id);
2013 f->current_xor_hash = 0;
2016 void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
2017 f->location_type = LOCATION_SEEK;
2018 f->current_offset = offset;
2019 f->current_seqnum = le64toh(o->entry.seqnum);
2020 f->current_realtime = le64toh(o->entry.realtime);
2021 f->current_monotonic = le64toh(o->entry.monotonic);
2022 f->current_boot_id = o->entry.boot_id;
2023 f->current_xor_hash = le64toh(o->entry.xor_hash);
/* NOTE(review): fragment of journal_file_compare_locations() — intermediate
 * lines (the actual return statements) are missing from this view. Orders
 * the saved locations of two journal files for interleaved iteration,
 * comparing by identity, seqnum, monotonic time, realtime, then hash. */
2026 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
/* Both files must have a saved (SEEK) location before comparison. */
2029 assert(af->location_type == LOCATION_SEEK);
2030 assert(bf->location_type == LOCATION_SEEK);
2032 /* If contents and timestamps match, these entries are
2033 * identical, even if the seqnum does not match */
2034 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2035 af->current_monotonic == bf->current_monotonic &&
2036 af->current_realtime == bf->current_realtime &&
2037 af->current_xor_hash == bf->current_xor_hash)
/* Seqnums are only comparable if both files share the same seqnum source. */
2040 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2042 /* If this is from the same seqnum source, compare
2044 if (af->current_seqnum < bf->current_seqnum)
2046 if (af->current_seqnum > bf->current_seqnum)
2049 /* Wow! This is weird, different data but the same
2050 * seqnums? Something is borked, but let's make the
2051 * best of it and compare by time. */
2054 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2056 /* If the boot id matches, compare monotonic time */
2057 if (af->current_monotonic < bf->current_monotonic)
2059 if (af->current_monotonic > bf->current_monotonic)
2063 /* Otherwise, compare UTC time */
2064 if (af->current_realtime < bf->current_realtime)
2066 if (af->current_realtime > bf->current_realtime)
2069 /* Finally, compare by contents */
2070 if (af->current_xor_hash < bf->current_xor_hash)
2072 if (af->current_xor_hash > bf->current_xor_hash)
/* NOTE(review): fragment of journal_file_next_entry() — intermediate lines
 * are missing from this view. Steps to the next/previous entry relative to
 * the current position, with a consistency check against the entry array. */
2078 int journal_file_next_entry(
2081 direction_t direction,
2082 Object **ret, uint64_t *offset) {
2089 n = le64toh(f->header->n_entries);
/* Start at the first entry going down, at the last going up. */
2094 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2096 r = generic_array_bisect(f,
2097 le64toh(f->header->entry_array_offset),
2098 le64toh(f->header->n_entries),
2107 if (direction == DIRECTION_DOWN) {
2120 /* And jump to it */
2121 r = generic_array_get(f,
2122 le64toh(f->header->entry_array_offset),
/* Sanity check: offsets must be strictly monotonic in the walk direction,
 * otherwise the entry array is corrupted. */
2129 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2130 log_debug("%s: entry array corrupted at entry %"PRIu64,
/* NOTE(review): fragment of journal_file_next_entry_for_data() —
 * intermediate lines are missing from this view. Like
 * journal_file_next_entry(), but iterates only over entries that reference
 * the data object at 'data_offset'. */
2141 int journal_file_next_entry_for_data(
2143 Object *o, uint64_t p,
2144 uint64_t data_offset,
2145 direction_t direction,
2146 Object **ret, uint64_t *offset) {
/* An offset without an object (or vice versa) is not allowed. */
2153 assert(p > 0 || !o);
2155 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2159 n = le64toh(d->data.n_entries);
2164 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2166 if (o->object.type != OBJECT_ENTRY)
/* Locate the position of the current entry within this data object's
 * per-data entry list. */
2169 r = generic_array_bisect_plus_one(f,
2170 le64toh(d->data.entry_offset),
2171 le64toh(d->data.entry_array_offset),
2172 le64toh(d->data.n_entries),
2182 if (direction == DIRECTION_DOWN) {
2196 return generic_array_get_plus_one(f,
2197 le64toh(d->data.entry_offset),
2198 le64toh(d->data.entry_array_offset),
/* NOTE(review): fragment of journal_file_move_to_entry_by_offset_for_data()
 * — intermediate lines are missing from this view. Seeks by file offset,
 * restricted to entries referencing the given data object. */
2203 int journal_file_move_to_entry_by_offset_for_data(
2205 uint64_t data_offset,
2207 direction_t direction,
2208 Object **ret, uint64_t *offset) {
2215 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2219 return generic_array_bisect_plus_one(f,
2220 le64toh(d->data.entry_offset),
2221 le64toh(d->data.entry_array_offset),
2222 le64toh(d->data.n_entries),
/* NOTE(review): fragment of journal_file_move_to_entry_by_monotonic_for_data()
 * — intermediate lines are missing from this view. Seeks by monotonic time
 * within a boot AND within a data object's entry list, by alternating
 * bisections until an entry lies in both lists. */
2229 int journal_file_move_to_entry_by_monotonic_for_data(
2231 uint64_t data_offset,
2234 direction_t direction,
2235 Object **ret, uint64_t *offset) {
2243 /* First, seek by time */
2244 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2250 r = generic_array_bisect_plus_one(f,
2251 le64toh(o->data.entry_offset),
2252 le64toh(o->data.entry_array_offset),
2253 le64toh(o->data.n_entries),
2255 test_object_monotonic,
2261 /* And now, continue seeking until we find an entry that
2262 * exists in both bisection arrays */
/* Bisect within the requested data object's entry list... */
2268 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2272 r = generic_array_bisect_plus_one(f,
2273 le64toh(d->data.entry_offset),
2274 le64toh(d->data.entry_array_offset),
2275 le64toh(d->data.n_entries),
/* ...then re-check against the boot-id data object (offset b) and repeat. */
2283 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2287 r = generic_array_bisect_plus_one(f,
2288 le64toh(o->data.entry_offset),
2289 le64toh(o->data.entry_array_offset),
2290 le64toh(o->data.n_entries),
/* NOTE(review): fragment of journal_file_move_to_entry_by_seqnum_for_data()
 * — intermediate lines are missing from this view. Seeks by seqnum,
 * restricted to entries referencing the given data object. */
2312 int journal_file_move_to_entry_by_seqnum_for_data(
2314 uint64_t data_offset,
2316 direction_t direction,
2317 Object **ret, uint64_t *offset) {
2324 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2328 return generic_array_bisect_plus_one(f,
2329 le64toh(d->data.entry_offset),
2330 le64toh(d->data.entry_array_offset),
2331 le64toh(d->data.n_entries),
/* NOTE(review): fragment of journal_file_move_to_entry_by_realtime_for_data()
 * — intermediate lines are missing from this view. Seeks by realtime
 * timestamp, restricted to entries referencing the given data object. */
2338 int journal_file_move_to_entry_by_realtime_for_data(
2340 uint64_t data_offset,
2342 direction_t direction,
2343 Object **ret, uint64_t *offset) {
2350 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2354 return generic_array_bisect_plus_one(f,
2355 le64toh(d->data.entry_offset),
2356 le64toh(d->data.entry_array_offset),
2357 le64toh(d->data.n_entries),
2359 test_object_realtime,
/* NOTE(review): fragment of journal_file_dump() — intermediate lines are
 * missing from this view. Debug helper: prints the header, then walks every
 * object from header_size to tail_object_offset, printing its type. */
2364 void journal_file_dump(JournalFile *f) {
2371 journal_file_print_header(f);
/* The first object starts right after the header. */
2373 p = le64toh(f->header->header_size);
2375 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2379 switch (o->object.type) {
2382 printf("Type: OBJECT_UNUSED\n");
2386 printf("Type: OBJECT_DATA\n");
2390 printf("Type: OBJECT_FIELD\n");
2394 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2395 le64toh(o->entry.seqnum),
2396 le64toh(o->entry.monotonic),
2397 le64toh(o->entry.realtime));
2400 case OBJECT_FIELD_HASH_TABLE:
2401 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2404 case OBJECT_DATA_HASH_TABLE:
2405 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2408 case OBJECT_ENTRY_ARRAY:
2409 printf("Type: OBJECT_ENTRY_ARRAY\n");
2413 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2414 le64toh(o->tag.seqnum),
2415 le64toh(o->tag.epoch));
2419 printf("Type: unknown (%i)\n", o->object.type);
2423 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2424 printf("Flags: %s\n",
2425 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
/* Stop once the last object has been printed. */
2427 if (p == le64toh(f->header->tail_object_offset))
/* Objects are 64-bit aligned on disk, hence the ALIGN64 when advancing. */
2430 p = p + ALIGN64(le64toh(o->object.size));
2435 log_error("File corrupt");
/* NOTE(review): fragment of format_timestamp_safe() — intermediate lines are
 * missing from this view. Wraps format_timestamp(); presumably substitutes a
 * placeholder when formatting fails — confirm against the full source. */
2438 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2441 x = format_timestamp(buf, l, t);
/* NOTE(review): fragment of journal_file_print_header() — intermediate lines
 * are missing from this view. Pretty-prints the journal header fields,
 * converting all on-disk little-endian values to host order first. */
2447 void journal_file_print_header(JournalFile *f) {
/* a-d: 33 bytes each for an sd_id128 rendered as 32 hex chars + NUL. */
2448 char a[33], b[33], c[33], d[33];
2449 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2451 char bytes[FORMAT_BYTES_MAX];
2455 printf("File Path: %s\n"
2459 "Sequential Number ID: %s\n"
2461 "Compatible Flags:%s%s\n"
2462 "Incompatible Flags:%s%s%s\n"
2463 "Header size: %"PRIu64"\n"
2464 "Arena size: %"PRIu64"\n"
2465 "Data Hash Table Size: %"PRIu64"\n"
2466 "Field Hash Table Size: %"PRIu64"\n"
2467 "Rotate Suggested: %s\n"
2468 "Head Sequential Number: %"PRIu64"\n"
2469 "Tail Sequential Number: %"PRIu64"\n"
2470 "Head Realtime Timestamp: %s\n"
2471 "Tail Realtime Timestamp: %s\n"
2472 "Tail Monotonic Timestamp: %s\n"
2473 "Objects: %"PRIu64"\n"
2474 "Entry Objects: %"PRIu64"\n",
2476 sd_id128_to_string(f->header->file_id, a),
2477 sd_id128_to_string(f->header->machine_id, b),
2478 sd_id128_to_string(f->header->boot_id, c),
2479 sd_id128_to_string(f->header->seqnum_id, d),
2480 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2481 f->header->state == STATE_ONLINE ? "ONLINE" :
2482 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2483 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
/* " ???" marks flag bits this build does not know about. */
2484 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2485 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2486 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2487 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2488 le64toh(f->header->header_size),
2489 le64toh(f->header->arena_size),
/* Hash table sizes are stored in bytes; convert to item counts here. */
2490 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2491 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2492 yes_no(journal_file_rotate_suggested(f, 0)),
2493 le64toh(f->header->head_entry_seqnum),
2494 le64toh(f->header->tail_entry_seqnum),
2495 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2496 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2497 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2498 le64toh(f->header->n_objects),
2499 le64toh(f->header->n_entries));
/* The following counters only exist in newer header revisions, hence the
 * JOURNAL_HEADER_CONTAINS() guards. */
2501 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2502 printf("Data Objects: %"PRIu64"\n"
2503 "Data Hash Table Fill: %.1f%%\n",
2504 le64toh(f->header->n_data),
2505 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2507 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2508 printf("Field Objects: %"PRIu64"\n"
2509 "Field Hash Table Fill: %.1f%%\n",
2510 le64toh(f->header->n_fields),
2511 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2513 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2514 printf("Tag Objects: %"PRIu64"\n",
2515 le64toh(f->header->n_tags));
2516 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2517 printf("Entry Array Objects: %"PRIu64"\n",
2518 le64toh(f->header->n_entry_arrays));
/* st_blocks is in 512-byte units, hence the multiplication. */
2520 if (fstat(f->fd, &st) >= 0)
2521 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
/* NOTE(review): fragment of journal_file_open() — intermediate lines
 * (including error-path cleanup) are missing from this view. Opens or
 * creates a journal file, validates its header, maps the hash tables, and
 * returns a JournalFile in *ret. */
2524 int journal_file_open(
2530 JournalMetrics *metrics,
2531 MMapCache *mmap_cache,
/* 'template' supplies metrics/header defaults for newly created files. */
2532 JournalFile *template,
2533 JournalFile **ret) {
2535 bool newly_created = false;
/* Only read-only and read-write opens make sense for journal files. */
2543 if ((flags & O_ACCMODE) != O_RDONLY &&
2544 (flags & O_ACCMODE) != O_RDWR)
/* ".journal~" is the suffix used for corrupted/rotated-away files. */
2547 if (!endswith(fname, ".journal") &&
2548 !endswith(fname, ".journal~"))
2551 f = new0(JournalFile, 1);
2559 f->prot = prot_from_flags(flags);
2560 f->writable = (flags & O_ACCMODE) != O_RDONLY;
/* Pick the compile-time available compressor; LZ4 is preferred over XZ. */
2561 #if defined(HAVE_LZ4)
2562 f->compress_lz4 = compress;
2563 #elif defined(HAVE_XZ)
2564 f->compress_xz = compress;
/* Reuse the caller's mmap cache if given, otherwise create a private one. */
2571 f->mmap = mmap_cache_ref(mmap_cache);
2573 f->mmap = mmap_cache_new();
2580 f->path = strdup(fname);
2586 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2587 if (!f->chain_cache) {
2592 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2598 r = journal_file_fstat(f);
/* A zero-sized writable file means we are creating it from scratch. */
2602 if (f->last_stat.st_size == 0 && f->writable) {
2604 /* Before we write anything, turn off COW logic. Given
2605 * our write pattern that is quite unfriendly to COW
2606 * file systems this should greatly improve
2607 * performance on COW file systems, such as btrfs, at
2608 * the expense of data integrity features (which
2609 * shouldn't be too bad, given that we do our own
/* -ENOTTY just means the fs has no chattr support; not worth warning. */
2611 r = chattr_fd(f->fd, true, FS_NOCOW_FL);
2612 if (r < 0 && r != -ENOTTY)
2613 log_warning_errno(r, "Failed to set file attributes: %m");
2615 /* Let's attach the creation time to the journal file,
2616 * so that the vacuuming code knows the age of this
2617 * file even if the file might end up corrupted one
2618 * day... Ideally we'd just use the creation time many
2619 * file systems maintain for each file, but there is
2620 * currently no usable API to query this, hence let's
2621 * emulate this via extended attributes. If extended
2622 * attributes are not supported we'll just skip this,
2623 * and rely solely on mtime/atime/ctime of the file. */
2625 fd_setcrtime(f->fd, 0);
2628 /* Try to load the FSPRG state, and if we can't, then
2629 * just don't do sealing */
2631 r = journal_file_fss_load(f);
2637 r = journal_file_init_header(f, template);
/* Re-stat after writing the header so last_stat reflects the new size. */
2641 r = journal_file_fstat(f);
2645 newly_created = true;
/* Anything smaller than the minimal header cannot be a valid journal. */
2648 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2653 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2659 if (!newly_created) {
2660 r = journal_file_verify_header(f);
2666 if (!newly_created && f->writable) {
2667 r = journal_file_fss_load(f);
/* Derive metrics from the backing file system if the caller passed any. */
2675 journal_default_metrics(metrics, f->fd);
2676 f->metrics = *metrics;
2677 } else if (template)
2678 f->metrics = template->metrics;
2680 r = journal_file_refresh_header(f);
2686 r = journal_file_hmac_setup(f);
/* A fresh file needs its hash tables created and (if sealing) a first tag. */
2691 if (newly_created) {
2692 r = journal_file_setup_field_hash_table(f);
2696 r = journal_file_setup_data_hash_table(f);
2701 r = journal_file_append_first_tag(f);
2707 r = journal_file_map_field_hash_table(f);
2711 r = journal_file_map_data_hash_table(f);
/* A SIGBUS during setup means the file was truncated under us. */
2715 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2724 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2727 journal_file_close(f);
/* NOTE(review): fragment of journal_file_rotate() — intermediate lines are
 * missing from this view. Renames the current file to an archive name
 * ("<base>@<seqnum-id>-<head-seqnum>-<head-realtime>.journal"), marks it
 * archived, and opens a fresh file under the original path. */
2732 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2733 _cleanup_free_ char *p = NULL;
2735 JournalFile *old_file, *new_file = NULL;
/* Read-only files cannot be rotated. */
2743 if (!old_file->writable)
2746 if (!endswith(old_file->path, ".journal"))
2749 l = strlen(old_file->path);
/* "l - 8" strips the ".journal" suffix before appending the archive tag. */
2750 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2751 (int) l - 8, old_file->path,
2752 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2753 le64toh((*f)->header->head_entry_seqnum),
2754 le64toh((*f)->header->head_entry_realtime));
2758 /* Try to rename the file to the archived version. If the file
2759 * already was deleted, we'll get ENOENT, let's ignore that
2761 r = rename(old_file->path, p);
2762 if (r < 0 && errno != ENOENT)
2765 old_file->header->state = STATE_ARCHIVED;
2767 /* Currently, btrfs is not very good with out write patterns
2768 * and fragments heavily. Let's defrag our journal files when
2769 * we archive them */
2770 old_file->defrag_on_close = true;
/* Open the replacement with the old file as template, then drop the old. */
2772 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2773 journal_file_close(old_file);
/* NOTE(review): fragment of journal_file_open_reliably() — intermediate
 * lines are missing from this view. Wraps journal_file_open(); if the open
 * fails with a "file is damaged/incompatible" error, the file is renamed
 * aside (".journal~") and a fresh one is created in its place. */
2779 int journal_file_open_reliably(
2785 JournalMetrics *metrics,
2786 MMapCache *mmap_cache,
2787 JournalFile *template,
2788 JournalFile **ret) {
2792 _cleanup_free_ char *p = NULL;
2794 r = journal_file_open(fname, flags, mode, compress, seal,
2795 metrics, mmap_cache, template, ret);
/* Only these error codes indicate a damaged/foreign file worth replacing. */
2797 -EBADMSG, /* corrupted */
2798 -ENODATA, /* truncated */
2799 -EHOSTDOWN, /* other machine */
2800 -EPROTONOSUPPORT, /* incompatible feature */
2801 -EBUSY, /* unclean shutdown */
2802 -ESHUTDOWN, /* already archived */
2803 -EIO, /* IO error, including SIGBUS on mmap */
2804 -EIDRM /* File has been deleted */))
/* Replacing is only possible when we are allowed to create a new file. */
2807 if ((flags & O_ACCMODE) == O_RDONLY)
2810 if (!(flags & O_CREAT))
2813 if (!endswith(fname, ".journal"))
2816 /* The file is corrupted. Rotate it away and try it again (but only once) */
2819 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
2821 now(CLOCK_REALTIME),
2825 r = rename(fname, p);
2829 /* btrfs doesn't cope well with our write pattern and
2830 * fragments heavily. Let's defrag all files we rotate */
/* Best-effort: errors from chattr/defrag are deliberately ignored. */
2832 (void) chattr_path(p, false, FS_NOCOW_FL);
2833 (void) btrfs_defrag(p);
2835 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2837 return journal_file_open(fname, flags, mode, compress, seal,
2838 metrics, mmap_cache, template, ret);
/* NOTE(review): fragment of journal_file_copy_entry() — intermediate lines
 * are missing from this view. Copies the entry at offset p from file 'from'
 * into file 'to', re-appending each referenced data object (decompressing
 * if needed) and rebuilding the entry's item list and XOR hash. */
2841 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2843 uint64_t q, xor_hash = 0;
2856 ts.monotonic = le64toh(o->entry.monotonic);
2857 ts.realtime = le64toh(o->entry.realtime);
2859 n = journal_file_entry_n_items(o);
2860 /* alloca() can't take 0, hence let's allocate at least one */
2861 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2863 for (i = 0; i < n; i++) {
2870 q = le64toh(o->entry.items[i].object_offset);
2871 le_hash = o->entry.items[i].hash;
/* Note: this re-maps 'o' to the data object, clobbering the entry pointer;
 * it is re-mapped back to the entry after the loop (line 2912). */
2873 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
/* The hash stored in the entry item must match the data object's own. */
2877 if (le_hash != o->data.hash)
2880 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2883 /* We hit the limit on 32bit machines */
2884 if ((uint64_t) t != l)
2887 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2888 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2891 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2892 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2896 data = from->compress_buffer;
/* Compressed data but no decompressor compiled in: cannot copy. */
2899 return -EPROTONOSUPPORT;
2902 data = o->data.payload;
2904 r = journal_file_append_data(to, data, l, &u, &h);
/* Accumulate the entry's payload hash and record the new item location. */
2908 xor_hash ^= le64toh(u->data.hash);
2909 items[i].object_offset = htole64(h);
2910 items[i].hash = u->data.hash;
2912 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2917 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
/* Propagate truncation-under-us (SIGBUS) of the destination as an error. */
2919 if (mmap_cache_got_sigbus(to->mmap, to->fd))
/* NOTE(review): fragment of journal_default_metrics() — intermediate lines
 * are missing from this view. Fills in any unset ((uint64_t) -1) fields of
 * *m with defaults derived from the size of the file system backing 'fd',
 * then clamps everything into consistent ranges. */
2925 void journal_default_metrics(JournalMetrics *m, int fd) {
2926 uint64_t fs_size = 0;
2928 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
/* Best effort: fs_size stays 0 if statvfs fails, triggering fallbacks. */
2933 if (fstatvfs(fd, &ss) >= 0)
2934 fs_size = ss.f_frsize * ss.f_blocks;
2936 if (m->max_use == (uint64_t) -1) {
2939 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2941 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2942 m->max_use = DEFAULT_MAX_USE_UPPER;
2944 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2945 m->max_use = DEFAULT_MAX_USE_LOWER;
2947 m->max_use = DEFAULT_MAX_USE_LOWER;
2949 m->max_use = PAGE_ALIGN(m->max_use);
/* max_use must hold at least two minimum-sized files (rotation headroom). */
2951 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2952 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2955 if (m->max_size == (uint64_t) -1) {
2956 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2958 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2959 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2961 m->max_size = PAGE_ALIGN(m->max_size);
2963 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2964 m->max_size = JOURNAL_FILE_SIZE_MIN;
/* Keep the invariant max_use >= 2 * max_size. */
2966 if (m->max_size*2 > m->max_use)
2967 m->max_use = m->max_size*2;
2969 if (m->min_size == (uint64_t) -1)
2970 m->min_size = JOURNAL_FILE_SIZE_MIN;
2972 m->min_size = PAGE_ALIGN(m->min_size);
2974 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2975 m->min_size = JOURNAL_FILE_SIZE_MIN;
2977 if (m->min_size > m->max_size)
2978 m->max_size = m->min_size;
2981 if (m->keep_free == (uint64_t) -1) {
2984 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2986 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2987 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
/* Fallback when the file system size could not be determined. */
2990 m->keep_free = DEFAULT_KEEP_FREE;
2993 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2994 format_bytes(a, sizeof(a), m->max_use),
2995 format_bytes(b, sizeof(b), m->max_size),
2996 format_bytes(c, sizeof(c), m->min_size),
2997 format_bytes(d, sizeof(d), m->keep_free));
/* NOTE(review): fragment of journal_file_get_cutoff_realtime_usec() —
 * intermediate lines are missing from this view. Reports the realtime
 * range [*from, *to] covered by this file, straight from the header. */
3000 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
/* A zero timestamp means the file contains no entries yet. */
3005 if (f->header->head_entry_realtime == 0)
3008 *from = le64toh(f->header->head_entry_realtime);
3012 if (f->header->tail_entry_realtime == 0)
3015 *to = le64toh(f->header->tail_entry_realtime);
/* NOTE(review): fragment of journal_file_get_cutoff_monotonic_usec() —
 * intermediate lines are missing from this view. Reports the monotonic
 * range covered by this file for the given boot id, using the first and
 * last entry of the matching _BOOT_ID= data object. */
3021 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
3029 r = find_data_object_by_boot_id(f, boot_id, &o, &p)
3033 if (le64toh(o->data.n_entries) <= 0)
/* First entry of this boot gives the lower bound... */
3037 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3041 *from = le64toh(o->entry.monotonic);
/* ...re-map the data object (o was clobbered above), then fetch the last
 * entry for the upper bound. */
3045 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3049 r = generic_array_get_plus_one(f,
3050 le64toh(o->data.entry_offset),
3051 le64toh(o->data.entry_array_offset),
3052 le64toh(o->data.n_entries)-1,
3057 *to = le64toh(o->entry.monotonic);
/* NOTE(review): fragment of journal_file_rotate_suggested() — it continues
 * past the end of this view and intermediate lines are missing. Heuristics
 * deciding whether this file should be rotated: outdated header, overfull
 * hash tables, missing field index, or simply old age. */
3063 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3066 /* If we gained new header fields we gained new features,
3067 * hence suggest a rotation */
3068 if (le64toh(f->header->header_size) < sizeof(Header)) {
3069 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3073 /* Let's check if the hash tables grew over a certain fill
3074 * level (75%, borrowing this value from Java's hash table
3075 * implementation), and if so suggest a rotation. To calculate
3076 * the fill level we need the n_data field, which only exists
3077 * in newer versions. */
/* n * 4 > size * 3 is the integer form of the 75% fill-level test. */
3079 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3080 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3081 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3083 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3084 le64toh(f->header->n_data),
3085 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3086 (unsigned long long) f->last_stat.st_size,
3087 f->last_stat.st_size / le64toh(f->header->n_data));
3091 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3092 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3093 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3095 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3096 le64toh(f->header->n_fields),
3097 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3101 /* Are the data objects properly indexed by field objects? */
3102 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3103 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3104 le64toh(f->header->n_data) > 0 &&
3105 le64toh(f->header->n_fields) == 0)
/* Age-based rotation: compare the head entry's realtime against now. */
3108 if (max_file_usec > 0) {
3111 h = le64toh(f->header->head_entry_realtime);
3112 t = now(CLOCK_REALTIME);
3114 if (h > 0 && t > h + max_file_usec)