1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "journal-authenticate.h"
37 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
38 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
40 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
42 /* This is the minimum journal file size */
43 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
45 /* These are the lower and upper bounds if we deduce the max_use value
46 * from the file system size */
47 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
48 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
50 /* This is the upper bound if we deduce max_size from max_use */
51 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
53 /* This is the upper bound if we deduce the keep_free value from the
55 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
57 /* This is the keep_free value when we can't determine the system
59 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
61 /* n_data was the first entry we added after the initial file format design */
62 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
64 /* How many entries to keep in the entry array chain cache at max */
65 #define CHAIN_CACHE_MAX 20
67 /* How much to increase the journal file size at once each time we allocate something new. */
68 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
70 /* The mmap context to use for the header: we pick the one just above the last defined object type */
71 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
/* Switches the header state of a journal file to STATE_ONLINE. Before the
 * header is touched, the function checks that an fd and a header are present
 * and asks the mmap cache whether a SIGBUS was recorded for this fd. */
73 static int journal_file_set_online(JournalFile *f) {
/* Guard: both an open fd and a header are required. */
79 if (!(f->fd >= 0 && f->header))
82 if (mmap_cache_got_sigbus(f->mmap, f->fd))
/* The action taken depends on the state currently stored in the header. */
85 switch(f->header->state) {
90 f->header->state = STATE_ONLINE;
/* Switches the header state of a journal file to STATE_OFFLINE. The mmap
 * cache is queried for a recorded SIGBUS both before and after the state is
 * written. */
99 int journal_file_set_offline(JournalFile *f) {
/* Guard: both an open fd and a header are required. */
105 if (!(f->fd >= 0 && f->header))
/* Guard on the header currently being in STATE_ONLINE. */
108 if (f->header->state != STATE_ONLINE)
113 if (mmap_cache_got_sigbus(f->mmap, f->fd))
116 f->header->state = STATE_OFFLINE;
/* Second SIGBUS check, after the header was written to. */
118 if (mmap_cache_got_sigbus(f->mmap, f->fd))
/* Tears down a JournalFile: appends a final tag when the file is sealed and
 * writable, sets the file offline, detaches the fd from the mmap cache and
 * releases the resources referenced below (mmap cache reference, chain cache,
 * compression buffer, fss_file/fsprg_state, HMAC handle). */
126 void journal_file_close(JournalFile *f) {
130 /* Write the final tag */
131 if (f->seal && f->writable)
132 journal_file_append_tag(f);
/* The return value of journal_file_set_offline() is not used here. */
135 journal_file_set_offline(f);
137 if (f->mmap && f->fd >= 0)
138 mmap_cache_close_fd(f->mmap, f->fd);
144 mmap_cache_unref(f->mmap);
146 ordered_hashmap_free_free(f->chain_cache);
148 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
149 free(f->compress_buffer);
/* fss_file is released with munmap(); otherwise fsprg_state, when set, is
 * released with free(). */
154 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
155 else if (f->fsprg_state)
156 free(f->fsprg_state);
161 gcry_md_close(f->hmac);
/* Builds a fresh header in the local variable h and writes it to offset 0 of
 * f->fd with pwrite(). The fields filled in here are the signature, the
 * header size, the compression and seal flags, a random file id and the
 * sequence number id. */
167 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
174 memcpy(h.signature, HEADER_SIGNATURE, 8);
175 h.header_size = htole64(ALIGN64(sizeof(h)));
/* The boolean settings are multiplied with the flag constants, so a flag is
 * set exactly when the corresponding setting is enabled. */
177 h.incompatible_flags |= htole32(
178 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
179 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
181 h.compatible_flags = htole32(
182 f->seal * HEADER_COMPATIBLE_SEALED);
184 r = sd_id128_randomize(&h.file_id);
/* The sequence number id and tail sequence number are taken over from the
 * template's header; the other assignment uses this file's own id as the
 * sequence number id. */
189 h.seqnum_id = template->header->seqnum_id;
190 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
192 h.seqnum_id = h.file_id;
194 k = pwrite(f->fd, &h, sizeof(h), 0);
/* Refreshes the machine id and the boot id stored in the header and then sets
 * the file online. */
204 static int journal_file_refresh_header(JournalFile *f) {
/* The machine id is written straight into the header. */
210 r = sd_id128_get_machine(&f->header->machine_id);
214 r = sd_id128_get_boot(&boot_id);
/* If the header already carries the current boot id,
 * tail_entry_monotonic_valid is set. */
218 if (sd_id128_equal(boot_id, f->header->boot_id))
219 f->tail_entry_monotonic_valid = true;
221 f->header->boot_id = boot_id;
223 r = journal_file_set_online(f);
225 /* Sync the online state to disk */
/* Validates the header of an opened journal file. The checks cover the
 * signature, the incompatible and compatible flag sets, the state range, the
 * header size, the sealed-header layout, the arena and tail object bounds
 * against the stat()ed file size, 64-bit alignment of the stored offsets, the
 * machine id and the stored state. Finally the compression and seal settings
 * are copied from the header into f. Unsupported flags yield
 * -EPROTONOSUPPORT. */
231 static int journal_file_verify_header(JournalFile *f) {
236 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
239 /* In both read and write mode we refuse to open files with
240 * incompatible flags we don't know */
241 flags = le32toh(f->header->incompatible_flags);
242 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
/* Two cases are logged separately: bits outside HEADER_INCOMPATIBLE_ANY,
 * and bits inside it that are not in HEADER_INCOMPATIBLE_SUPPORTED. */
243 if (flags & ~HEADER_INCOMPATIBLE_ANY)
244 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
245 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
246 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
248 log_debug("Journal file %s uses incompatible flags %"PRIx32
249 " disabled at compilation time.", f->path, flags);
250 return -EPROTONOSUPPORT;
253 /* When open for writing we refuse to open files with
254 * unsupported compatible flags, too */
255 flags = le32toh(f->header->compatible_flags);
256 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
257 if (flags & ~HEADER_COMPATIBLE_ANY)
258 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
259 f->path, flags & ~HEADER_COMPATIBLE_ANY);
260 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
262 log_debug("Journal file %s uses compatible flags %"PRIx32
263 " disabled at compilation time.", f->path, flags);
264 return -EPROTONOSUPPORT;
/* The stored state must be below _STATE_MAX. */
267 if (f->header->state >= _STATE_MAX)
270 /* The first addition was n_data, so check that we are at least this large */
271 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
/* A sealed header is expected to contain the n_entry_arrays field. */
274 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
/* Header plus arena must fit into the file size from the last stat. */
277 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
/* The tail object offset must lie within header plus arena. */
280 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
/* All offsets stored in the header must pass VALID64(). */
283 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
284 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
285 !VALID64(le64toh(f->header->tail_object_offset)) ||
286 !VALID64(le64toh(f->header->entry_array_offset)))
291 sd_id128_t machine_id;
/* The local machine id is compared with the one stored in the header. */
294 r = sd_id128_get_machine(&machine_id);
298 if (!sd_id128_equal(machine_id, f->header->machine_id))
301 state = f->header->state;
/* A file found in STATE_ONLINE is treated as not having been closed
 * cleanly; any state other than online, archived or offline is logged as
 * unknown. */
303 if (state == STATE_ONLINE) {
304 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
306 } else if (state == STATE_ARCHIVED)
308 else if (state != STATE_OFFLINE) {
309 log_debug("Journal file %s has unknown state %u.", f->path, state);
/* Take over the compression and seal settings recorded in the header. */
314 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
315 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
317 f->seal = JOURNAL_HEADER_SEALED(f->header);
/* Makes sure the file is large enough to hold 'size' bytes at 'offset'.
 * The current size is derived from header_size + arena_size in the header.
 * The requested size is page-aligned and is at least header_size; it is
 * compared against metrics.max_size and, via fstatvfs(), against the free
 * space on the file system minus metrics.keep_free. The size is then rounded
 * up to a multiple of FILE_SIZE_INCREASE (capped at metrics.max_size), the
 * range [old_size, new_size) is allocated with posix_fallocate(), the cached
 * stat data is refreshed and arena_size in the header is updated.
 *
 * NOTE(review): 'offset + size' is a plain uint64_t addition; no overflow
 * check is visible here - confirm callers cannot pass values that wrap.
 * NOTE(review): free space is computed as f_bfree * f_bsize; POSIX describes
 * f_bfree in units of f_frsize and f_bavail as the blocks available to
 * unprivileged processes - confirm this is the intended quantity. */
322 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
323 uint64_t old_size, new_size;
328 /* We assume that this file is not sparse, and we know that
329 * for sure, since we always call posix_fallocate()
332 if (mmap_cache_got_sigbus(f->mmap, f->fd))
336 le64toh(f->header->header_size) +
337 le64toh(f->header->arena_size);
339 new_size = PAGE_ALIGN(offset + size);
340 if (new_size < le64toh(f->header->header_size))
341 new_size = le64toh(f->header->header_size);
343 if (new_size <= old_size)
346 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
349 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
352 if (fstatvfs(f->fd, &svfs) >= 0) {
355 available = svfs.f_bfree * svfs.f_bsize;
357 if (available >= f->metrics.keep_free)
358 available -= f->metrics.keep_free;
362 if (new_size - old_size > available)
367 /* Increase by larger blocks at once */
/* Round up to the next multiple of FILE_SIZE_INCREASE, but never beyond
 * metrics.max_size when a maximum is configured. */
368 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
369 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
370 new_size = f->metrics.max_size;
372 /* Note that the glibc fallocate() fallback is very
373 inefficient, hence we try to minimize the allocation area
375 r = posix_fallocate(f->fd, old_size, new_size - old_size);
379 if (fstat(f->fd, &f->last_stat) < 0)
382 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
/* Maps an object type to an mmap cache context number: types strictly between
 * OBJECT_UNUSED and _OBJECT_TYPE_MAX map to themselves, everything else maps
 * to context 0. */
387 static unsigned type_to_context(ObjectType type) {
388 /* One context for each type, plus one catch-all for the rest */
/* Compile-time checks that the type range and CONTEXT_HEADER fit within
 * MMAP_CACHE_MAX_CONTEXTS. */
389 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
390 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
391 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
/* Obtains a mapping of the file range [offset, offset + size) from the mmap
 * cache and hands it back through 'ret'. The context passed to the cache is
 * derived from 'type'. The range is first checked against the cached file
 * size; if it appears to be out of range the stat data is refreshed once, and
 * -EADDRNOTAVAIL is returned if fstat() fails or the range is still beyond the
 * end of the file.
 *
 * NOTE(review): 'offset + size' is an unsigned addition that can wrap for
 * very large inputs, which would let the range check pass - confirm callers
 * validate these values. */
394 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
401 /* Avoid SIGBUS on invalid accesses */
402 if (offset + size > (uint64_t) f->last_stat.st_size) {
403 /* Hmm, out of range? Let's refresh the fstat() data
404 * first, before we trust that check. */
406 if (fstat(f->fd, &f->last_stat) < 0 ||
407 offset + size > (uint64_t) f->last_stat.st_size)
408 return -EADDRNOTAVAIL;
411 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
/* Returns the minimum size an object must have according to its type: the
 * size of the type-specific structure from the table below, or
 * sizeof(ObjectHeader) if the type is outside the table or has no entry. */
414 static uint64_t minimum_header_size(Object *o) {
/* Indexed by object type; types without an initializer are zero. */
416 static const uint64_t table[] = {
417 [OBJECT_DATA] = sizeof(DataObject),
418 [OBJECT_FIELD] = sizeof(FieldObject),
419 [OBJECT_ENTRY] = sizeof(EntryObject),
420 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
421 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
422 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
423 [OBJECT_TAG] = sizeof(TagObject),
/* Fall back to the generic header size for unknown or unlisted types. */
426 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
427 return sizeof(ObjectHeader);
429 return table[o->object.type];
/* Maps the object located at 'offset'. The object header is mapped first to
 * learn the object's size and type; these are validated (size at least an
 * ObjectHeader and at least the per-type minimum, type above OBJECT_UNUSED,
 * type matching the requested one when a specific type is requested). If the
 * object is larger than its header, the mapping is redone for the full size. */
432 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
441 /* Objects may only be located at multiple of 64 bit */
442 if (!VALID64(offset))
/* First pass: map only the generic object header. */
445 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
450 s = le64toh(o->object.size);
452 if (s < sizeof(ObjectHeader))
455 if (o->object.type <= OBJECT_UNUSED)
458 if (s < minimum_header_size(o))
/* The type check only applies when a specific type was requested. */
461 if (type > OBJECT_UNUSED && o->object.type != type)
/* Second pass: map the complete object. */
464 if (s > sizeof(ObjectHeader)) {
465 r = journal_file_move_to(f, type, false, offset, s, &t);
/* Determines the sequence number for the next entry, starting from the tail
 * sequence number in the header plus one, and records it in the header as the
 * new tail (and as the head, if no head sequence number is set yet). */
476 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
481 r = le64toh(f->header->tail_entry_seqnum) + 1;
484 /* If an external seqnum counter was passed, we update
485 * both the local and the external one, and set it to
486 * the maximum of both */
494 f->header->tail_entry_seqnum = htole64(r);
/* A head sequence number of 0 is treated as "not set yet". */
496 if (f->header->head_entry_seqnum == 0)
497 f->header->head_entry_seqnum = htole64(r);
/* Appends a new object of the given type and size to the file. The file is
 * set online, the position for the new object is derived from the tail
 * object offset in the header (the position after the current tail object,
 * aligned with ALIGN64, or header_size), space is allocated, the new object is
 * mapped and its type and size are filled in. The header's tail object offset
 * and object count are updated.
 *
 * 'type' must be a real object type and 'size' must be at least
 * sizeof(ObjectHeader); both are enforced with assert(). */
502 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
509 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
510 assert(size >= sizeof(ObjectHeader));
514 r = journal_file_set_online(f);
518 p = le64toh(f->header->tail_object_offset);
520 p = le64toh(f->header->header_size);
/* Look up the current tail object to find where it ends. */
522 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
526 p += ALIGN64(le64toh(tail->object.size));
529 r = journal_file_allocate(f, p, size);
533 r = journal_file_move_to(f, type, false, p, size, &t);
540 o->object.type = type;
541 o->object.size = htole64(size);
543 f->header->tail_object_offset = htole64(p);
544 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
/* Creates the data hash table object. Its size is derived from
 * metrics.max_size with DEFAULT_DATA_HASH_TABLE_SIZE as the lower bound. The
 * table is appended as an OBJECT_DATA_HASH_TABLE object, zeroed, and its item
 * offset and size are recorded in the header. */
552 static int journal_file_setup_data_hash_table(JournalFile *f) {
559 /* We estimate that we need 1 hash table entry per 768 of
560 journal file and we want to make sure we never get beyond
561 75% fill level. Calculate the hash table size for the
562 maximum file size based on these metrics. */
564 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
565 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
566 s = DEFAULT_DATA_HASH_TABLE_SIZE;
568 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
570 r = journal_file_append_object(f,
571 OBJECT_DATA_HASH_TABLE,
572 offsetof(Object, hash_table.items) + s,
577 memzero(o->hash_table.items, s);
/* The header stores the offset of the items array, not of the object. */
579 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
580 f->header->data_hash_table_size = htole64(s);
/* Creates the field hash table object with the fixed size
 * DEFAULT_FIELD_HASH_TABLE_SIZE, zeroes it and records the offset of its
 * items array and its size in the header. */
585 static int journal_file_setup_field_hash_table(JournalFile *f) {
592 /* We use a fixed size hash table for the fields as this
593 * number should grow very slowly only */
595 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
596 r = journal_file_append_object(f,
597 OBJECT_FIELD_HASH_TABLE,
598 offsetof(Object, hash_table.items) + s,
603 memzero(o->hash_table.items, s);
/* The header stores the offset of the items array, not of the object. */
605 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
606 f->header->field_hash_table_size = htole64(s);
/* Maps the data hash table using the offset and size recorded in the header
 * and stores the resulting pointer in f->data_hash_table. */
611 static int journal_file_map_data_hash_table(JournalFile *f) {
618 p = le64toh(f->header->data_hash_table_offset);
619 s = le64toh(f->header->data_hash_table_size);
621 r = journal_file_move_to(f,
622 OBJECT_DATA_HASH_TABLE,
629 f->data_hash_table = t;
/* Maps the field hash table using the offset and size recorded in the header
 * and stores the resulting pointer in f->field_hash_table. */
633 static int journal_file_map_field_hash_table(JournalFile *f) {
640 p = le64toh(f->header->field_hash_table_offset);
641 s = le64toh(f->header->field_hash_table_size);
643 r = journal_file_move_to(f,
644 OBJECT_FIELD_HASH_TABLE,
651 f->field_hash_table = t;
/* Links a field object into the field hash table. The bucket is selected as
 * hash modulo the number of HashItems in the table. The object's offset is
 * stored as the bucket's head and/or as the next_hash_offset of the previous
 * tail object, and then becomes the bucket's new tail. The n_fields counter in
 * the header is incremented when the header contains that field. */
655 static int journal_file_link_field(
668 if (o->object.type != OBJECT_FIELD)
671 /* This might alter the window we are looking at */
/* Reset the link fields of the object being inserted. */
673 o->field.next_hash_offset = o->field.head_data_offset = 0;
675 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
676 p = le64toh(f->field_hash_table[h].tail_hash_offset);
678 f->field_hash_table[h].head_hash_offset = htole64(offset);
/* Load the previous tail of the bucket and chain the new object to it. */
680 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
684 o->field.next_hash_offset = htole64(offset);
687 f->field_hash_table[h].tail_hash_offset = htole64(offset);
689 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
690 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
/* Links a data object into the data hash table. The object's link fields and
 * entry bookkeeping are reset, the bucket is selected as hash modulo the
 * number of HashItems in the table, the object's offset is stored as the
 * bucket's head and/or as the next_hash_offset of the previous tail object,
 * and then becomes the bucket's new tail. The n_data counter in the header is
 * incremented when the header contains that field. */
695 static int journal_file_link_data(
708 if (o->object.type != OBJECT_DATA)
711 /* This might alter the window we are looking at */
713 o->data.next_hash_offset = o->data.next_field_offset = 0;
714 o->data.entry_offset = o->data.entry_array_offset = 0;
715 o->data.n_entries = 0;
717 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
718 p = le64toh(f->data_hash_table[h].tail_hash_offset);
720 /* Only entry in the hash table is easy */
721 f->data_hash_table[h].head_hash_offset = htole64(offset);
723 /* Move back to the previous data object, to patch in
726 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
730 o->data.next_hash_offset = htole64(offset);
733 f->data_hash_table[h].tail_hash_offset = htole64(offset);
735 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
736 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
/* Looks up a field object by name, given a precomputed hash. The bucket chain
 * selected by the hash is walked via next_hash_offset; an object matches if
 * its stored hash, its total size and its payload bytes all agree with the
 * requested field. 'field' must be non-NULL and 'size' non-zero. */
741 int journal_file_find_field_object_with_hash(
743 const void *field, uint64_t size, uint64_t hash,
744 Object **ret, uint64_t *offset) {
746 uint64_t p, osize, h;
750 assert(field && size > 0);
/* Expected on-disk size of a field object carrying this payload. */
752 osize = offsetof(Object, field.payload) + size;
/* Guard against an empty hash table before using its size as a divisor. */
754 if (f->header->field_hash_table_size == 0)
757 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
758 p = le64toh(f->field_hash_table[h].head_hash_offset);
763 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
767 if (le64toh(o->field.hash) == hash &&
768 le64toh(o->object.size) == osize &&
769 memcmp(o->field.payload, field, size) == 0) {
/* No match: continue with the next object in the bucket chain. */
779 p = le64toh(o->field.next_hash_offset);
/* Convenience wrapper: hashes the field name with hash64() and forwards to
 * journal_file_find_field_object_with_hash(). 'field' must be non-NULL and
 * 'size' non-zero. */
785 int journal_file_find_field_object(
787 const void *field, uint64_t size,
788 Object **ret, uint64_t *offset) {
793 assert(field && size > 0);
795 hash = hash64(field, size);
797 return journal_file_find_field_object_with_hash(f,
/* Looks up a data object by content, given a precomputed hash. The bucket
 * chain selected by the hash is walked via next_hash_offset. For objects
 * flagged as compressed the payload is decompressed into f->compress_buffer
 * before comparing (only when built with XZ or LZ4 support; -EPROTONOSUPPORT
 * is returned on that path as well); uncompressed objects are compared by
 * total size and payload bytes. 'data' may be NULL only if 'size' is 0. */
802 int journal_file_find_data_object_with_hash(
804 const void *data, uint64_t size, uint64_t hash,
805 Object **ret, uint64_t *offset) {
807 uint64_t p, osize, h;
811 assert(data || size == 0);
/* Expected on-disk size of an uncompressed data object with this payload. */
813 osize = offsetof(Object, data.payload) + size;
/* Guard against an empty hash table before using its size as a divisor. */
815 if (f->header->data_hash_table_size == 0)
818 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
819 p = le64toh(f->data_hash_table[h].head_hash_offset);
824 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* Cheap rejection on the stored hash before looking at the payload. */
828 if (le64toh(o->data.hash) != hash)
831 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
832 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
/* l becomes the length of the compressed payload; objects without any
 * payload bytes are caught by the check below. */
836 l = le64toh(o->object.size);
837 if (l <= offsetof(Object, data.payload))
840 l -= offsetof(Object, data.payload);
842 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
843 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
848 memcmp(f->compress_buffer, data, size) == 0) {
859 return -EPROTONOSUPPORT;
861 } else if (le64toh(o->object.size) == osize &&
862 memcmp(o->data.payload, data, size) == 0) {
/* No match: continue with the next object in the bucket chain. */
874 p = le64toh(o->data.next_hash_offset);
/* Convenience wrapper: hashes the data with hash64() and forwards to
 * journal_file_find_data_object_with_hash(). 'data' may be NULL only if
 * 'size' is 0. */
880 int journal_file_find_data_object(
882 const void *data, uint64_t size,
883 Object **ret, uint64_t *offset) {
888 assert(data || size == 0);
890 hash = hash64(data, size);
892 return journal_file_find_data_object_with_hash(f,
/* Looks up the field object for the given field name and, for the append
 * path shown below, creates a new OBJECT_FIELD object: the hash and payload
 * are stored, the object is linked into the field hash table, re-mapped, and
 * passed to journal_file_hmac_put_object(). 'field' must be non-NULL and
 * 'size' non-zero. */
897 static int journal_file_append_field(
899 const void *field, uint64_t size,
900 Object **ret, uint64_t *offset) {
908 assert(field && size > 0);
910 hash = hash64(field, size);
/* Check for an already existing object first. */
912 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
926 osize = offsetof(Object, field.payload) + size;
927 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
931 o->field.hash = htole64(hash);
932 memcpy(o->field.payload, field, size);
934 r = journal_file_link_field(f, o, p, hash);
938 /* The linking might have altered the window, so let's
939 * refresh our pointer */
940 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
945 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
/* Looks up the data object for the given payload and, for the append path
 * shown below, creates a new OBJECT_DATA object. The payload is stored
 * compressed if compression applies and the payload is at least
 * COMPRESSION_SIZE_THRESHOLD bytes, otherwise verbatim. The object is linked
 * into the data hash table and re-mapped. If the payload contains '=' after at
 * least one leading byte, the part before it is used as the field name: the
 * matching field object is looked up or created and the data object is put at
 * the head of that field's data list. Finally the object is passed to
 * journal_file_hmac_put_object(). 'data' may be NULL only if 'size' is 0. */
959 static int journal_file_append_data(
961 const void *data, uint64_t size,
962 Object **ret, uint64_t *offset) {
967 int r, compression = 0;
971 assert(data || size == 0);
973 hash = hash64(data, size);
/* Check for an already existing object first. */
975 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
/* The object is allocated for the uncompressed size; the size field is
 * reduced below if compression is applied. */
989 osize = offsetof(Object, data.payload) + size;
990 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
994 o->data.hash = htole64(hash);
996 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
/* NOTE(review): only compress_xz is tested here although this section is
 * compiled for XZ or LZ4 - confirm that files with only compress_lz4 set are
 * meant to skip compression. */
997 if (f->compress_xz &&
998 size >= COMPRESSION_SIZE_THRESHOLD) {
1001 compression = compress_blob(data, size, o->data.payload, &rsize);
1004 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1005 o->object.flags |= compression;
1007 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1008 size, rsize, object_compressed_to_string(compression));
/* Store the payload verbatim when no compression was applied. */
1013 if (!compression && size > 0)
1014 memcpy(o->data.payload, data, size);
1016 r = journal_file_link_data(f, o, p, hash);
1020 /* The linking might have altered the window, so let's
1021 * refresh our pointer */
1022 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* Split off the field name: everything before the first '='. A leading '='
 * (empty field name) is ignored. */
1029 eq = memchr(data, '=', size);
1030 if (eq && eq > data) {
1034 /* Create field object ... */
1035 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1039 /* ... and link it in. */
/* NOTE(review): p is a host-order offset stored into an on-disk field, so
 * htole64() would express the intent; le64toh() performs the same byte
 * swap, so the stored value is presumably identical - confirm. */
1040 o->data.next_field_offset = fo->field.head_data_offset;
1041 fo->field.head_data_offset = le64toh(p);
1045 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
/* Returns the number of EntryItems stored in an entry object, computed from
 * the object size minus the offset of the items array. Objects of another
 * type are handled separately by the type check. */
1059 uint64_t journal_file_entry_n_items(Object *o) {
1062 if (o->object.type != OBJECT_ENTRY)
1065 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
/* Returns the number of 64-bit slots in an entry array object, computed from
 * the object size minus the offset of the items array. Objects of another
 * type are handled separately by the type check. */
1068 uint64_t journal_file_entry_array_n_items(Object *o) {
1071 if (o->object.type != OBJECT_ENTRY_ARRAY)
1074 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
/* Returns the number of HashItems in a data or field hash table object,
 * computed from the object size minus the offset of the items array. Objects
 * of any other type are handled separately by the type check. */
1077 uint64_t journal_file_hash_table_n_items(Object *o) {
1080 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1081 o->object.type != OBJECT_FIELD_HASH_TABLE)
1084 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1087 static int link_entry_into_array(JournalFile *f,
1092 uint64_t n = 0, ap = 0, q, i, a, hidx;
1100 a = le64toh(*first);
1101 i = hidx = le64toh(*idx);
1104 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1108 n = journal_file_entry_array_n_items(o);
1110 o->entry_array.items[i] = htole64(p);
1111 *idx = htole64(hidx + 1);
1117 a = le64toh(o->entry_array.next_entry_array_offset);
1128 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1129 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1135 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1140 o->entry_array.items[i] = htole64(p);
1143 *first = htole64(q);
1145 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1149 o->entry_array.next_entry_array_offset = htole64(q);
1152 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1153 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1155 *idx = htole64(hidx + 1);
1160 static int link_entry_into_array_plus_one(JournalFile *f,
1175 *extra = htole64(p);
1179 i = htole64(le64toh(*idx) - 1);
1180 r = link_entry_into_array(f, first, &i, p);
1185 *idx = htole64(le64toh(*idx) + 1);
/* Links an entry into the entry list of the data object referenced by item i
 * of the entry: the data object is loaded and the entry is added through
 * link_entry_into_array_plus_one() using the data object's entry_offset and
 * entry_array_offset fields. */
1189 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1196 p = le64toh(o->entry.items[i].object_offset);
/* From here on, o refers to the data object rather than the entry. */
1200 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1204 return link_entry_into_array_plus_one(f,
1205 &o->data.entry_offset,
1206 &o->data.entry_array_offset,
/* Links a newly written entry object into the file: the entry is added to
 * the global entry array chain anchored in the header, the header's
 * head/tail realtime and tail monotonic timestamps are updated, and the entry
 * is linked into each data object it references. */
1211 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1219 if (o->object.type != OBJECT_ENTRY)
/* Full memory barrier before the entry is linked. */
1222 __sync_synchronize();
1224 /* Link up the entry itself */
1225 r = link_entry_into_array(f,
1226 &f->header->entry_array_offset,
1227 &f->header->n_entries,
1232 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
/* A head realtime of 0 is treated as "not set yet". */
1234 if (f->header->head_entry_realtime == 0)
1235 f->header->head_entry_realtime = o->entry.realtime;
1237 f->header->tail_entry_realtime = o->entry.realtime;
1238 f->header->tail_entry_monotonic = o->entry.monotonic;
1240 f->tail_entry_monotonic_valid = true;
1242 /* Link up the items */
1243 n = journal_file_entry_n_items(o);
1244 for (i = 0; i < n; i++) {
1245 r = journal_file_link_entry_item(f, o, offset, i);
/* Appends an entry object made of the given items: the object is allocated
 * with room for n_items EntryItems, its sequence number, items, timestamps,
 * xor hash and boot id are filled in, it is passed to
 * journal_file_hmac_put_object() and finally linked via
 * journal_file_link_entry(). 'items' may be NULL only if n_items is 0. */
1253 static int journal_file_append_entry_internal(
1255 const dual_timestamp *ts,
1257 const EntryItem items[], unsigned n_items,
1259 Object **ret, uint64_t *offset) {
1266 assert(items || n_items == 0);
1269 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1271 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1275 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1276 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1277 o->entry.realtime = htole64(ts->realtime);
1278 o->entry.monotonic = htole64(ts->monotonic);
1279 o->entry.xor_hash = htole64(xor_hash);
/* The boot id is copied from the file header. */
1280 o->entry.boot_id = f->header->boot_id;
1283 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1288 r = journal_file_link_entry(f, o, np);
/* Signals a change of the journal file to inotify watchers by truncating the
 * file to the size recorded in f->last_stat. A failing ftruncate() is only
 * logged. */
1301 void journal_file_post_change(JournalFile *f) {
1304 /* inotify() does not receive IN_MODIFY events from file
1305 * accesses done via mmap(). After each access we hence
1306 * trigger IN_MODIFY by truncating the journal file to its
1307 * current size which triggers IN_MODIFY. */
/* Full memory barrier before the truncate call. */
1309 __sync_synchronize();
1311 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1312 log_error_errno(errno, "Failed to truncate file to its own size: %m");
/* Comparison function with the qsort() signature that orders EntryItems by
 * their object_offset, converted from little-endian before comparing. */
1315 static int entry_item_cmp(const void *_a, const void *_b) {
1316 const EntryItem *a = _a, *b = _b;
1318 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1320 if (le64toh(a->object_offset) > le64toh(b->object_offset))
/* Appends one journal entry consisting of the given iovec fields. Each iovec
 * is stored (or found) as a data object; the offsets and hashes of these
 * objects form the entry's item list, which is sorted by offset before the
 * entry object is written. The XOR of all data hashes is passed along as the
 * entry's xor_hash. After writing, the mmap cache is checked for a recorded
 * SIGBUS and journal_file_post_change() is called. 'iovec' may be NULL only
 * if n_iovec is 0. */
1325 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1329 uint64_t xor_hash = 0;
1330 struct dual_timestamp _ts;
1333 assert(iovec || n_iovec == 0);
/* _ts is filled with the current time by dual_timestamp_get(). */
1336 dual_timestamp_get(&_ts);
/* Check against an entry whose monotonic timestamp is older than the tail
 * entry's, provided the tail timestamp is known to be valid. */
1340 if (f->tail_entry_monotonic_valid &&
1341 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1345 r = journal_file_maybe_append_tag(f, ts->realtime);
1350 /* alloca() can't take 0, hence let's allocate at least one */
1351 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1353 for (i = 0; i < n_iovec; i++) {
1357 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1361 xor_hash ^= le64toh(o->data.hash);
1362 items[i].object_offset = htole64(p);
1363 items[i].hash = o->data.hash;
1366 /* Order by the position on disk, in order to improve seek
1367 * times for rotating media. */
1368 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1370 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1372 /* If the memory mapping triggered a SIGBUS then we return an
1373 * IO error and ignore the error code passed down to us, since
1374 * it is very likely just an effect of a nullified replacement
1377 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1380 journal_file_post_change(f);
/* One cached position within an entry array chain. Items are stored in
 * f->chain_cache keyed by 'first', so that later lookups on the same chain
 * can resume from the cached array. */
1385 typedef struct ChainCacheItem {
1386 uint64_t first; /* the array at the beginning of the chain */
1387 uint64_t array; /* the cached array */
1388 uint64_t begin; /* the first item in the cached array */
1389 uint64_t total; /* the total number of items in all arrays before this one in the chain */
1390 uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
/* Inserts or updates the chain cache item for the chain starting at 'first'.
 * When the cache already holds CHAIN_CACHE_MAX items, the first item of the
 * ordered hashmap is taken out for reuse; otherwise a new item is allocated.
 * The item is registered under its 'first' key and its fields, including
 * last_index, are updated. */
1393 static void chain_cache_put(
1400 uint64_t last_index) {
1403 /* If the chain item to cache for this chain is the
1404 * first one it's not worth caching anything */
/* Recycle the oldest item once the cache is full. */
1408 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1409 ci = ordered_hashmap_steal_first(h);
1412 ci = new(ChainCacheItem, 1);
1419 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
/* An item that was passed in must belong to this chain. */
1424 assert(ci->first == first);
1429 ci->last_index = last_index;
/* Retrieves the entry with index i from the entry array chain starting at
 * 'first'. The chain cache is consulted first so that the walk can start at a
 * previously visited array; otherwise arrays are followed via
 * next_entry_array_offset until the one containing index i is reached. The
 * position found is cached and the entry object at the stored offset is
 * loaded. */
1432 static int generic_array_get(
1436 Object **ret, uint64_t *offset) {
1439 uint64_t p = 0, a, t = 0;
1447 /* Try the chain cache first */
1448 ci = ordered_hashmap_get(f->chain_cache, &first);
/* The cached position is only usable if the wanted index lies behind it. */
1449 if (ci && i > ci->total) {
1458 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1462 k = journal_file_entry_array_n_items(o);
1464 p = le64toh(o->entry_array.items[i]);
1470 a = le64toh(o->entry_array.next_entry_array_offset);
1476 /* Let's cache this item for the next invocation */
1477 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1479 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Variant of generic_array_get() for sequences whose first element is stored
 * separately at offset 'extra': that element is loaded directly as an entry
 * object, while other indexes are forwarded to generic_array_get() shifted
 * down by one. */
1492 static int generic_array_get_plus_one(
1497 Object **ret, uint64_t *offset) {
1506 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1519 return generic_array_get(f, first, i-1, ret, offset);
/* Searches the entry array chain starting at 'first' for 'needle', using the
 * test_object callback to compare the entry at a given offset against the
 * needle (results TEST_FOUND, TEST_LEFT, TEST_RIGHT). Within each array the
 * search range [left, right] is narrowed by bisection; a TEST_FOUND result is
 * mapped to TEST_RIGHT or TEST_LEFT depending on 'direction'. The chain cache
 * is used to skip ahead to a previously visited array and to probe the
 * neighbors of the last index looked at before bisecting. The entry found is
 * loaded and its index in the whole chain is reported via 'idx'. */
1528 static int generic_array_bisect(
1533 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1534 direction_t direction,
1539 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1540 bool subtract_one = false;
1541 Object *o, *array = NULL;
1546 assert(test_object);
1548 /* Start with the first array in the chain */
1551 ci = ordered_hashmap_get(f->chain_cache, &first);
1552 if (ci && n > ci->total) {
1553 /* Ah, we have iterated this bisection array chain
1554 * previously! Let's see if we can skip ahead in the
1555 * chain, as far as the last time. But we can't jump
1556 * backwards in the chain, so let's check that
1559 r = test_object(f, ci->begin, needle);
1563 if (r == TEST_LEFT) {
1564 /* OK, what we are looking for is right of the
1565 * begin of this EntryArray, so let's jump
1566 * straight to previously cached array in the
1572 last_index = ci->last_index;
1577 uint64_t left, right, k, lp;
1579 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1583 k = journal_file_entry_array_n_items(array);
1589 lp = p = le64toh(array->entry_array.items[i]);
1593 r = test_object(f, p, needle);
1597 if (r == TEST_FOUND)
1598 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1600 if (r == TEST_RIGHT) {
1604 if (last_index != (uint64_t) -1) {
1605 assert(last_index <= right);
1607 /* If we cached the last index we
1608 * looked at, let's try not to jump
1609 * too wildly around and see if we can
1610 * limit the range to look at early to
1611 * the immediate neighbors of the last
1612 * index we looked at. */
/* Probe the element just before the cached index. */
1614 if (last_index > 0) {
1615 uint64_t x = last_index - 1;
1617 p = le64toh(array->entry_array.items[x]);
1621 r = test_object(f, p, needle);
1625 if (r == TEST_FOUND)
1626 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1628 if (r == TEST_RIGHT)
/* Probe the element just after the cached index. */
1634 if (last_index < right) {
1635 uint64_t y = last_index + 1;
1637 p = le64toh(array->entry_array.items[y]);
1641 r = test_object(f, p, needle);
1645 if (r == TEST_FOUND)
1646 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1648 if (r == TEST_RIGHT)
/* Regular bisection of the remaining range. */
1656 if (left == right) {
1657 if (direction == DIRECTION_UP)
1658 subtract_one = true;
1664 assert(left < right);
1665 i = (left + right) / 2;
1667 p = le64toh(array->entry_array.items[i]);
1671 r = test_object(f, p, needle);
1675 if (r == TEST_FOUND)
1676 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1678 if (r == TEST_RIGHT)
1686 if (direction == DIRECTION_UP) {
1688 subtract_one = true;
/* Moving on to the next array: the cached index no longer applies. */
1699 last_index = (uint64_t) -1;
1700 a = le64toh(array->entry_array.next_entry_array_offset);
/* Stepping back from the very first element of the chain is a special
 * case. */
1706 if (subtract_one && t == 0 && i == 0)
1709 /* Let's cache this item for the next invocation */
1710 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
/* Select the offset of the result, honoring subtract_one. */
1712 if (subtract_one && i == 0)
1714 else if (subtract_one)
1715 p = le64toh(array->entry_array.items[i-1]);
1717 p = le64toh(array->entry_array.items[i]);
1719 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Index of the result within the whole chain. */
1730 *idx = t + i + (subtract_one ? -1 : 0);
/* Variant of generic_array_bisect() for sequences whose first element is
 * stored separately at offset 'extra'. The extra element is tested first;
 * the array chain is then bisected with n-1 elements. With DIRECTION_UP the
 * extra element is remembered (step_back) and used as the result when the
 * array search finds nothing (r == 0). */
1735 static int generic_array_bisect_plus_one(
1741 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1742 direction_t direction,
1748 bool step_back = false;
1752 assert(test_object);
1757 /* This bisects the array in object 'first', but first checks
1759 r = test_object(f, extra, needle);
1763 if (r == TEST_FOUND)
1764 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1766 /* if we are looking with DIRECTION_UP then we need to first
1767 see if in the actual array there is a matching entry, and
1768 return the last one of that. But if there isn't any we need
1769 to return this one. Hence remember this, and return it
1772 step_back = direction == DIRECTION_UP;
1774 if (r == TEST_RIGHT) {
1775 if (direction == DIRECTION_DOWN)
1781 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1783 if (r == 0 && step_back)
1792 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
/* Test function matching the test_object callback signature of
 * generic_array_bisect(): compares the offset p itself against needle. */
1808 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1814 else if (p < needle)
/* Test function matching the test_object callback signature of
 * generic_array_bisect(): loads the entry object at offset p and compares its
 * sequence number against needle. */
1820 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1827 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1831 if (le64toh(o->entry.seqnum) == needle)
1833 else if (le64toh(o->entry.seqnum) < needle)
/* Locates an entry by bisecting the global entry array chain anchored in the
 * header (entry_array_offset, n_entries) in the given direction. */
1839 int journal_file_move_to_entry_by_seqnum(
1842 direction_t direction,
1846 return generic_array_bisect(f,
1847 le64toh(f->header->entry_array_offset),
1848 le64toh(f->header->n_entries),
/* Test function matching the test_object callback signature of
 * generic_array_bisect(): loads the entry object at offset p and compares its
 * realtime timestamp against needle. */
1855 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1862 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1866 if (le64toh(o->entry.realtime) == needle)
1868 else if (le64toh(o->entry.realtime) < needle)
/* Locates an entry by realtime timestamp: bisects the global entry array
 * chain anchored in the header (entry_array_offset, n_entries) using
 * test_object_realtime() in the given direction. */
1874 int journal_file_move_to_entry_by_realtime(
1877 direction_t direction,
1881 return generic_array_bisect(f,
1882 le64toh(f->header->entry_array_offset),
1883 le64toh(f->header->n_entries),
1885 test_object_realtime,
/* Bisection test callback keyed on the monotonic timestamp: maps the
 * entry object at offset p and compares its monotonic field, converted
 * from little-endian with le64toh(), against needle. */
1890 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1897 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1901 if (le64toh(o->entry.monotonic) == needle)
1903 else if (le64toh(o->entry.monotonic) < needle)
/* Looks up the data object for the field "_BOOT_ID=<boot_id>". The key is
 * assembled in a stack buffer that holds the prefix, 32 bytes for the ID
 * string and a terminating NUL, and is passed to
 * journal_file_find_data_object() with the NUL excluded from the length.
 * The o and b arguments are forwarded to that lookup unchanged. */
1909 static inline int find_data_object_by_boot_id(
1914 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
/* 9 is the length of "_BOOT_ID=": the ID text is written right after the prefix */
1916 sd_id128_to_string(boot_id, t + 9);
1917 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
/* Seeks by monotonic timestamp within one boot. The data object for the
 * boot ID is located first; the entries referencing it (entry_offset,
 * entry_array_offset, n_entries of that data object) are then bisected
 * with generic_array_bisect_plus_one() using test_object_monotonic(). */
1920 int journal_file_move_to_entry_by_monotonic(
1924 direction_t direction,
1933 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1939 return generic_array_bisect_plus_one(f,
1940 le64toh(o->data.entry_offset),
1941 le64toh(o->data.entry_array_offset),
1942 le64toh(o->data.n_entries),
1944 test_object_monotonic,
/* Resets the cached read position of f: the location type becomes
 * LOCATION_HEAD and the cached offset, seqnum, realtime, monotonic,
 * boot ID and xor hash of the current entry are all cleared. */
1949 void journal_file_reset_location(JournalFile *f) {
1950 f->location_type = LOCATION_HEAD;
1951 f->current_offset = 0;
1952 f->current_seqnum = 0;
1953 f->current_realtime = 0;
1954 f->current_monotonic = 0;
1955 zero(f->current_boot_id);
1956 f->current_xor_hash = 0;
/* Remembers entry object o, located at 'offset', as the current position
 * of f. Stores the direction of travel, switches the location type to
 * LOCATION_SEEK and caches the entry's seqnum, realtime, monotonic and
 * xor hash in host byte order, plus its boot ID. */
1959 void journal_file_save_location(JournalFile *f, direction_t direction, Object *o, uint64_t offset) {
1960 f->last_direction = direction;
1961 f->location_type = LOCATION_SEEK;
1962 f->current_offset = offset;
/* Entry fields are stored little-endian in the file; cache them converted */
1963 f->current_seqnum = le64toh(o->entry.seqnum);
1964 f->current_realtime = le64toh(o->entry.realtime);
1965 f->current_monotonic = le64toh(o->entry.monotonic);
1966 f->current_boot_id = o->entry.boot_id;
1967 f->current_xor_hash = le64toh(o->entry.xor_hash);
/* Compares the cached current locations of two journal files. Both files
 * must be in LOCATION_SEEK state (asserted).
 *
 * Comparison order, as visible below:
 *   1. same boot ID, monotonic, realtime and xor hash: same entry;
 *   2. same seqnum_id in both headers: order by cached seqnum;
 *   3. same boot ID: order by monotonic timestamp;
 *   4. otherwise order by realtime timestamp;
 *   5. finally order by xor hash.
 *
 * NOTE(review): presumably returns <0 / 0 / >0 like a comparator; confirm
 * against the return statements. */
1970 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
1973 assert(af->location_type == LOCATION_SEEK);
1974 assert(bf->location_type == LOCATION_SEEK);
1976 /* If contents and timestamps match, these entries are
1977 * identical, even if the seqnum does not match */
1978 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
1979 af->current_monotonic == bf->current_monotonic &&
1980 af->current_realtime == bf->current_realtime &&
1981 af->current_xor_hash == bf->current_xor_hash)
1984 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
1986 /* If this is from the same seqnum source, compare
1988 if (af->current_seqnum < bf->current_seqnum)
1990 if (af->current_seqnum > bf->current_seqnum)
1993 /* Wow! This is weird, different data but the same
1994 * seqnums? Something is borked, but let's make the
1995 * best of it and compare by time. */
1998 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2000 /* If the boot id matches, compare monotonic time */
2001 if (af->current_monotonic < bf->current_monotonic)
2003 if (af->current_monotonic > bf->current_monotonic)
2007 /* Otherwise, compare UTC time */
2008 if (af->current_realtime < bf->current_realtime)
2010 if (af->current_realtime > bf->current_realtime)
2013 /* Finally, compare by contents */
2014 if (af->current_xor_hash < bf->current_xor_hash)
2016 if (af->current_xor_hash > bf->current_xor_hash)
/* Steps through the file-wide entry array in the given direction.
 *
 * The number of entries n comes from the header. The index i starts at
 * the first slot (DIRECTION_DOWN) or the last slot n - 1 (otherwise), or
 * is derived from a bisection of the entry array. The entry at that index
 * is fetched with generic_array_get(). If the fetched offset does not lie
 * beyond p in the direction of travel, an entry array corruption message
 * is logged. */
2022 int journal_file_next_entry(
2025 direction_t direction,
2026 Object **ret, uint64_t *offset) {
2033 n = le64toh(f->header->n_entries);
2038 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2040 r = generic_array_bisect(f,
2041 le64toh(f->header->entry_array_offset),
2042 le64toh(f->header->n_entries),
2051 if (direction == DIRECTION_DOWN) {
2064 /* And jump to it */
2065 r = generic_array_get(f,
2066 le64toh(f->header->entry_array_offset),
/* Sanity check: the new offset must lie strictly beyond p in the direction of travel */
2073 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2074 log_debug("%s: entry array corrupted at entry %"PRIu64,
/* Steps through the entries that reference the data object at
 * data_offset, in the given direction.
 *
 * o/p describe the current entry and its offset; the assertion requires
 * p > 0 whenever o is given. The data object is mapped and its entry
 * count n is read. The index i starts at the first (DIRECTION_DOWN) or
 * last slot, or is derived from a bisection of the data object's entry
 * list; o is checked to be of type OBJECT_ENTRY on that path. The result
 * is fetched with generic_array_get_plus_one(). */
2085 int journal_file_next_entry_for_data(
2087 Object *o, uint64_t p,
2088 uint64_t data_offset,
2089 direction_t direction,
2090 Object **ret, uint64_t *offset) {
2097 assert(p > 0 || !o);
2099 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2103 n = le64toh(d->data.n_entries);
2108 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2110 if (o->object.type != OBJECT_ENTRY)
2113 r = generic_array_bisect_plus_one(f,
2114 le64toh(d->data.entry_offset),
2115 le64toh(d->data.entry_array_offset),
2116 le64toh(d->data.n_entries),
2126 if (direction == DIRECTION_DOWN) {
2140 return generic_array_get_plus_one(f,
2141 le64toh(d->data.entry_offset),
2142 le64toh(d->data.entry_array_offset),
/* Seeks among the entries that reference the data object at data_offset.
 * The data object is mapped and its entry list (entry_offset,
 * entry_array_offset, n_entries) is bisected with
 * generic_array_bisect_plus_one(), whose result is returned. */
2147 int journal_file_move_to_entry_by_offset_for_data(
2149 uint64_t data_offset,
2151 direction_t direction,
2152 Object **ret, uint64_t *offset) {
2159 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2163 return generic_array_bisect_plus_one(f,
2164 le64toh(d->data.entry_offset),
2165 le64toh(d->data.entry_array_offset),
2166 le64toh(d->data.n_entries),
/* Seeks by monotonic timestamp among the entries that reference the data
 * object at data_offset.
 *
 * Two entry lists are involved: the one of the "_BOOT_ID=" data object
 * for boot_id (object o, offset b) and the one of the data object at
 * data_offset (object d). The boot ID list is first bisected by monotonic
 * time. Both lists are then bisected in turn, re-mapping each data object
 * before its list is used, until an entry present in both is found. */
2173 int journal_file_move_to_entry_by_monotonic_for_data(
2175 uint64_t data_offset,
2178 direction_t direction,
2179 Object **ret, uint64_t *offset) {
2187 /* First, seek by time */
2188 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2194 r = generic_array_bisect_plus_one(f,
2195 le64toh(o->data.entry_offset),
2196 le64toh(o->data.entry_array_offset),
2197 le64toh(o->data.n_entries),
2199 test_object_monotonic,
2205 /* And now, continue seeking until we find an entry that
2206 * exists in both bisection arrays */
2212 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2216 r = generic_array_bisect_plus_one(f,
2217 le64toh(d->data.entry_offset),
2218 le64toh(d->data.entry_array_offset),
2219 le64toh(d->data.n_entries),
/* Re-map the boot ID data object via its saved offset b before using o again */
2227 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2231 r = generic_array_bisect_plus_one(f,
2232 le64toh(o->data.entry_offset),
2233 le64toh(o->data.entry_array_offset),
2234 le64toh(o->data.n_entries),
/* Seeks among the entries that reference the data object at data_offset.
 * The data object is mapped and its entry list (entry_offset,
 * entry_array_offset, n_entries) is bisected with
 * generic_array_bisect_plus_one(), whose result is returned. */
2256 int journal_file_move_to_entry_by_seqnum_for_data(
2258 uint64_t data_offset,
2260 direction_t direction,
2261 Object **ret, uint64_t *offset) {
2268 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2272 return generic_array_bisect_plus_one(f,
2273 le64toh(d->data.entry_offset),
2274 le64toh(d->data.entry_array_offset),
2275 le64toh(d->data.n_entries),
/* Seeks by realtime timestamp among the entries that reference the data
 * object at data_offset. The data object is mapped and its entry list is
 * bisected with generic_array_bisect_plus_one() using
 * test_object_realtime() as the comparison callback. */
2282 int journal_file_move_to_entry_by_realtime_for_data(
2284 uint64_t data_offset,
2286 direction_t direction,
2287 Object **ret, uint64_t *offset) {
2294 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2298 return generic_array_bisect_plus_one(f,
2299 le64toh(d->data.entry_offset),
2300 le64toh(d->data.entry_array_offset),
2301 le64toh(d->data.n_entries),
2303 test_object_realtime,
/* Debugging aid: prints the header and then one line per object to
 * stdout. The walk starts right after the header (header_size) and moves
 * from object to object by adding the 64-bit-aligned object size. It
 * stops once the object at tail_object_offset has been printed. Entry and
 * tag objects get extra fields printed, and compressed objects get a
 * "Flags:" line. "File corrupt" is logged on the error path. */
2308 void journal_file_dump(JournalFile *f) {
2315 journal_file_print_header(f);
2317 p = le64toh(f->header->header_size);
/* OBJECT_UNUSED is passed as the type here, as the type at p is not known yet */
2319 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2323 switch (o->object.type) {
2326 printf("Type: OBJECT_UNUSED\n");
2330 printf("Type: OBJECT_DATA\n");
2334 printf("Type: OBJECT_FIELD\n");
2338 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2339 le64toh(o->entry.seqnum),
2340 le64toh(o->entry.monotonic),
2341 le64toh(o->entry.realtime));
2344 case OBJECT_FIELD_HASH_TABLE:
2345 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2348 case OBJECT_DATA_HASH_TABLE:
2349 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2352 case OBJECT_ENTRY_ARRAY:
2353 printf("Type: OBJECT_ENTRY_ARRAY\n");
2357 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2358 le64toh(o->tag.seqnum),
2359 le64toh(o->tag.epoch));
2363 printf("Type: unknown (%u)\n", o->object.type);
2367 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2368 printf("Flags: %s\n",
2369 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
/* The tail object is the last one in the file */
2371 if (p == le64toh(f->header->tail_object_offset))
/* Advance to the next object; the stride is the object size rounded up with ALIGN64 */
2374 p = p + ALIGN64(le64toh(o->object.size));
2379 log_error("File corrupt");
/* Wrapper around format_timestamp() that formats t into buf (size l) and
 * returns a const string.
 * NOTE(review): presumably substitutes a placeholder when
 * format_timestamp() yields no result; confirm against the full body. */
2382 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2385 x = format_timestamp(buf, l, t);
/* Prints a human-readable summary of the file header to stdout.
 *
 * The main block covers the IDs, the file state, compatible and
 * incompatible flags, sizes, the rotation hint, head and tail seqnums and
 * timestamps, and object counts. Multi-byte header fields are converted
 * with le32toh()/le64toh(), and hash table sizes are shown as item counts
 * (bytes / sizeof(HashItem)). Counters added in later header versions
 * (n_data, n_fields, n_tags, n_entry_arrays) are printed only when
 * JOURNAL_HEADER_CONTAINS() reports them. Disk usage is printed only if
 * fstat() succeeds. */
2391 void journal_file_print_header(JournalFile *f) {
2392 char a[33], b[33], c[33], d[33];
2393 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2395 char bytes[FORMAT_BYTES_MAX];
2399 printf("File Path: %s\n"
2403 "Sequential Number ID: %s\n"
2405 "Compatible Flags:%s%s\n"
2406 "Incompatible Flags:%s%s%s\n"
2407 "Header size: %"PRIu64"\n"
2408 "Arena size: %"PRIu64"\n"
2409 "Data Hash Table Size: %"PRIu64"\n"
2410 "Field Hash Table Size: %"PRIu64"\n"
2411 "Rotate Suggested: %s\n"
2412 "Head Sequential Number: %"PRIu64"\n"
2413 "Tail Sequential Number: %"PRIu64"\n"
2414 "Head Realtime Timestamp: %s\n"
2415 "Tail Realtime Timestamp: %s\n"
2416 "Tail Monotonic Timestamp: %s\n"
2417 "Objects: %"PRIu64"\n"
2418 "Entry Objects: %"PRIu64"\n",
2420 sd_id128_to_string(f->header->file_id, a),
2421 sd_id128_to_string(f->header->machine_id, b),
2422 sd_id128_to_string(f->header->boot_id, c),
2423 sd_id128_to_string(f->header->seqnum_id, d),
2424 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2425 f->header->state == STATE_ONLINE ? "ONLINE" :
2426 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2427 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2428 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2429 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2430 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2431 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2432 le64toh(f->header->header_size),
2433 le64toh(f->header->arena_size),
2434 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2435 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2436 yes_no(journal_file_rotate_suggested(f, 0)),
2437 le64toh(f->header->head_entry_seqnum),
2438 le64toh(f->header->tail_entry_seqnum),
2439 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2440 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2441 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2442 le64toh(f->header->n_objects),
2443 le64toh(f->header->n_entries));
/* Fill level in percent: number of objects relative to the number of hash table items */
2445 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2446 printf("Data Objects: %"PRIu64"\n"
2447 "Data Hash Table Fill: %.1f%%\n",
2448 le64toh(f->header->n_data),
2449 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2451 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2452 printf("Field Objects: %"PRIu64"\n"
2453 "Field Hash Table Fill: %.1f%%\n",
2454 le64toh(f->header->n_fields),
2455 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2457 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2458 printf("Tag Objects: %"PRIu64"\n",
2459 le64toh(f->header->n_tags));
2460 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2461 printf("Entry Array Objects: %"PRIu64"\n",
2462 le64toh(f->header->n_entry_arrays));
/* Disk usage is computed as st_blocks times 512 bytes */
2464 if (fstat(f->fd, &st) >= 0)
2465 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
/* Opens a journal file and, for an empty writable file, initializes it.
 *
 * Visible steps:
 *  - the access mode must be O_RDONLY or O_RDWR, and fname must end in
 *    ".journal" or ".journal~";
 *  - a JournalFile is allocated; protection and writability are derived
 *    from flags, and 'compress' selects LZ4 or XZ depending on what was
 *    compiled in;
 *  - an mmap cache is referenced or newly created, the path is duplicated
 *    and the chain cache hashmap is allocated;
 *  - the file is opened with O_CLOEXEC and stat'ed;
 *  - a zero-sized writable file gets a creation time attached, FSS state
 *    loaded and a header initialized (optionally from 'template'), and is
 *    flagged newly_created;
 *  - the file size is checked against HEADER_SIZE_MIN, then the header is
 *    mapped;
 *  - pre-existing files have their header verified and, if writable,
 *    their FSS state loaded;
 *  - metrics come from 'metrics' (after journal_default_metrics()) or
 *    else from 'template';
 *  - the header is refreshed and HMAC set up; for new files the hash
 *    tables and first tag are created; the hash tables are then mapped;
 *  - a SIGBUS recorded by the mmap cache for this fd is checked before
 *    finishing and again next to the journal_file_close() cleanup call. */
2468 int journal_file_open(
2474 JournalMetrics *metrics,
2475 MMapCache *mmap_cache,
2476 JournalFile *template,
2477 JournalFile **ret) {
2479 bool newly_created = false;
2487 if ((flags & O_ACCMODE) != O_RDONLY &&
2488 (flags & O_ACCMODE) != O_RDWR)
2491 if (!endswith(fname, ".journal") &&
2492 !endswith(fname, ".journal~"))
2495 f = new0(JournalFile, 1);
2503 f->prot = prot_from_flags(flags);
2504 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2505 #if defined(HAVE_LZ4)
2506 f->compress_lz4 = compress;
2507 #elif defined(HAVE_XZ)
2508 f->compress_xz = compress;
2515 f->mmap = mmap_cache_ref(mmap_cache);
2517 f->mmap = mmap_cache_new();
2524 f->path = strdup(fname);
2530 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2531 if (!f->chain_cache) {
2536 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2542 if (fstat(f->fd, &f->last_stat) < 0) {
/* An empty file opened for writing is treated as a new journal file */
2547 if (f->last_stat.st_size == 0 && f->writable) {
2548 /* Let's attach the creation time to the journal file,
2549 * so that the vacuuming code knows the age of this
2550 * file even if the file might end up corrupted one
2551 * day... Ideally we'd just use the creation time many
2552 * file systems maintain for each file, but there is
2553 * currently no usable API to query this, hence let's
2554 * emulate this via extended attributes. If extended
2555 * attributes are not supported we'll just skip this,
2556 * and rely solely on mtime/atime/ctime of the file. */
2558 fd_setcrtime(f->fd, now(CLOCK_REALTIME));
2561 /* Try to load the FSPRG state, and if we can't, then
2562 * just don't do sealing */
2564 r = journal_file_fss_load(f);
2570 r = journal_file_init_header(f, template);
/* Re-stat after initializing the header */
2574 if (fstat(f->fd, &f->last_stat) < 0) {
2579 newly_created = true;
2582 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2587 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2595 if (!newly_created) {
2596 r = journal_file_verify_header(f);
2602 if (!newly_created && f->writable) {
2603 r = journal_file_fss_load(f);
2611 journal_default_metrics(metrics, f->fd);
2612 f->metrics = *metrics;
2613 } else if (template)
2614 f->metrics = template->metrics;
2616 r = journal_file_refresh_header(f);
2622 r = journal_file_hmac_setup(f);
2627 if (newly_created) {
2628 r = journal_file_setup_field_hash_table(f);
2632 r = journal_file_setup_data_hash_table(f);
2637 r = journal_file_append_first_tag(f);
2643 r = journal_file_map_field_hash_table(f);
2647 r = journal_file_map_data_hash_table(f);
2651 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2660 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2663 journal_file_close(f);
/* Rotates a journal file.
 *
 * The old file must be writable and its path must end in ".journal". It
 * is renamed to
 *   <path without ".journal">@<seqnum_id>-<head seqnum>-<head realtime>.journal
 * and its header state is set to STATE_ARCHIVED. A new file is then
 * opened at the original path, reusing the old file's flags, mode and
 * mmap cache and passing the old file as template. The old file is closed
 * afterwards.
 *
 * NOTE(review): the name is built from both old_file->header and
 * (*f)->header; presumably old_file == *f here, confirm. */
2668 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2669 _cleanup_free_ char *p = NULL;
2671 JournalFile *old_file, *new_file = NULL;
2679 if (!old_file->writable)
2682 if (!endswith(old_file->path, ".journal"))
2685 l = strlen(old_file->path);
/* l - 8 cuts the 8-character ".journal" suffix off the path */
2686 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2687 (int) l - 8, old_file->path,
2688 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2689 le64toh((*f)->header->head_entry_seqnum),
2690 le64toh((*f)->header->head_entry_realtime));
2694 r = rename(old_file->path, p);
2698 old_file->header->state = STATE_ARCHIVED;
2700 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2701 journal_file_close(old_file);
/* Like journal_file_open(), but replaces a broken file.
 *
 * A first open is attempted. The recovery path is considered only for the
 * error codes listed below (corruption, truncation, foreign machine,
 * unsupported feature, unclean shutdown, already archived, I/O error).
 * It also requires that the open is not read-only, that O_CREAT is set
 * and that fname ends in ".journal". In that case the file is renamed to
 * a ".journal~" name containing the current realtime clock, a warning is
 * logged, and the open is retried exactly once. */
2707 int journal_file_open_reliably(
2713 JournalMetrics *metrics,
2714 MMapCache *mmap_cache,
2715 JournalFile *template,
2716 JournalFile **ret) {
2720 _cleanup_free_ char *p = NULL;
2722 r = journal_file_open(fname, flags, mode, compress, seal,
2723 metrics, mmap_cache, template, ret);
2724 if (r != -EBADMSG && /* corrupted */
2725 r != -ENODATA && /* truncated */
2726 r != -EHOSTDOWN && /* other machine */
2727 r != -EPROTONOSUPPORT && /* incompatible feature */
2728 r != -EBUSY && /* unclean shutdown */
2729 r != -ESHUTDOWN && /* already archived */
2730 r != -EIO /* IO error, including SIGBUS on mmap */)
2733 if ((flags & O_ACCMODE) == O_RDONLY)
2736 if (!(flags & O_CREAT))
2739 if (!endswith(fname, ".journal"))
2742 /* The file is corrupted. Rotate it away and try it again (but only once) */
2745 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2747 (unsigned long long) now(CLOCK_REALTIME),
2751 r = rename(fname, p);
2755 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2757 return journal_file_open(fname, flags, mode, compress, seal,
2758 metrics, mmap_cache, template, ret);
/* Copies entry object o (located at offset p in 'from') into 'to'.
 *
 * The entry's monotonic and realtime timestamps are carried over. For
 * each item of the entry:
 *  - the referenced data object is mapped from 'from' and its hash is
 *    checked against the hash stored in the entry item;
 *  - the payload length is derived from the object size;
 *  - a compressed payload is decompressed into from->compress_buffer, or
 *    -EPROTONOSUPPORT is returned when no compression support is
 *    compiled in;
 *  - the payload is appended to 'to' with journal_file_append_data().
 * The hashes of the appended data objects are XORed into xor_hash and a
 * new item array is built. The entry is finally appended with
 * journal_file_append_entry_internal(), and SIGBUS on the destination
 * mapping is checked.
 *
 * NOTE(review): the item array lives on the stack via alloca() and is
 * sized by the item count read from the entry object, so a very large
 * count means a very large stack allocation; confirm it is bounded. */
2761 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2763 uint64_t q, xor_hash = 0;
2776 ts.monotonic = le64toh(o->entry.monotonic);
2777 ts.realtime = le64toh(o->entry.realtime);
2779 n = journal_file_entry_n_items(o);
2780 /* alloca() can't take 0, hence let's allocate at least one */
2781 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2783 for (i = 0; i < n; i++) {
2790 q = le64toh(o->entry.items[i].object_offset);
2791 le_hash = o->entry.items[i].hash;
/* From here on o refers to the data object, no longer to the entry */
2793 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2797 if (le_hash != o->data.hash)
2800 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2803 /* We hit the limit on 32bit machines */
2804 if ((uint64_t) t != l)
2807 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2808 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2811 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2812 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2816 data = from->compress_buffer;
2819 return -EPROTONOSUPPORT;
2822 data = o->data.payload;
2824 r = journal_file_append_data(to, data, l, &u, &h);
2828 xor_hash ^= le64toh(u->data.hash);
2829 items[i].object_offset = htole64(h);
2830 items[i].hash = u->data.hash;
/* Map the source entry again, since o was reused for the data object above */
2832 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2837 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2839 if (mmap_cache_got_sigbus(to->mmap, to->fd))
/* Fills in and sanitizes the size limits in m, using the file system
 * behind fd as reference. A field equal to (uint64_t) -1 counts as unset.
 *
 *  - max_use: if unset, 10% of the file system size, clamped to
 *    [DEFAULT_MAX_USE_LOWER, DEFAULT_MAX_USE_UPPER], with
 *    DEFAULT_MAX_USE_LOWER as fallback. If set, it is page-aligned and
 *    raised to at least 2 * JOURNAL_FILE_SIZE_MIN.
 *  - max_size: if unset, max_use / 8, capped at DEFAULT_MAX_SIZE_UPPER.
 *    It is page-aligned, raised to at least JOURNAL_FILE_SIZE_MIN, and
 *    max_use is raised to at least 2 * max_size.
 *  - min_size: defaults to JOURNAL_FILE_SIZE_MIN. It is page-aligned and
 *    never below JOURNAL_FILE_SIZE_MIN; max_size is raised to min_size if
 *    it is smaller.
 *  - keep_free: if unset, 15% of the file system size, capped at
 *    DEFAULT_KEEP_FREE_UPPER, with DEFAULT_KEEP_FREE as fallback.
 *
 * The resulting values are logged at debug level.
 *
 * NOTE(review): fs_size is the product of f_frsize and f_blocks, computed
 * in the operands' own types before being stored in a uint64_t. Confirm
 * this cannot wrap where those types are narrower than 64 bits. */
2845 void journal_default_metrics(JournalMetrics *m, int fd) {
2846 uint64_t fs_size = 0;
2848 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
/* fs_size stays 0 if the file system cannot be queried */
2853 if (fstatvfs(fd, &ss) >= 0)
2854 fs_size = ss.f_frsize * ss.f_blocks;
2856 if (m->max_use == (uint64_t) -1) {
2859 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2861 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2862 m->max_use = DEFAULT_MAX_USE_UPPER;
2864 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2865 m->max_use = DEFAULT_MAX_USE_LOWER;
2867 m->max_use = DEFAULT_MAX_USE_LOWER;
2869 m->max_use = PAGE_ALIGN(m->max_use);
2871 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2872 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2875 if (m->max_size == (uint64_t) -1) {
2876 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2878 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2879 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2881 m->max_size = PAGE_ALIGN(m->max_size);
2883 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2884 m->max_size = JOURNAL_FILE_SIZE_MIN;
/* Keep room for at least two files of maximum size */
2886 if (m->max_size*2 > m->max_use)
2887 m->max_use = m->max_size*2;
2889 if (m->min_size == (uint64_t) -1)
2890 m->min_size = JOURNAL_FILE_SIZE_MIN;
2892 m->min_size = PAGE_ALIGN(m->min_size);
2894 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2895 m->min_size = JOURNAL_FILE_SIZE_MIN;
2897 if (m->min_size > m->max_size)
2898 m->max_size = m->min_size;
2901 if (m->keep_free == (uint64_t) -1) {
2904 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2906 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2907 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2910 m->keep_free = DEFAULT_KEEP_FREE;
2913 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2914 format_bytes(a, sizeof(a), m->max_use),
2915 format_bytes(b, sizeof(b), m->max_size),
2916 format_bytes(c, sizeof(c), m->min_size),
2917 format_bytes(d, sizeof(d), m->keep_free));
/* Reports the realtime range covered by the file. *from is taken from the
 * header's head_entry_realtime and *to from tail_entry_realtime, both
 * converted with le64toh(). A zero value in either header field is
 * handled separately before the corresponding output is written. */
2920 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
/* Comparing the raw little-endian field against 0 needs no byte swap */
2925 if (f->header->head_entry_realtime == 0)
2928 *from = le64toh(f->header->head_entry_realtime);
2932 if (f->header->tail_entry_realtime == 0)
2935 *to = le64toh(f->header->tail_entry_realtime);
/* Reports the monotonic time range covered by the file for one boot.
 *
 * The "_BOOT_ID=" data object for boot_id is looked up (object o, offset
 * p); a data object without entries is handled separately. *from is the
 * monotonic timestamp of the first entry referencing it (entry_offset).
 * *to is the monotonic timestamp of the last one, fetched with
 * generic_array_get_plus_one() at index n_entries - 1. */
2941 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2949 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2953 if (le64toh(o->data.n_entries) <= 0)
/* o is reused: after this call it refers to the first entry, not the data object */
2957 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2961 *from = le64toh(o->entry.monotonic);
/* Map the data object again through its saved offset p */
2965 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2969 r = generic_array_get_plus_one(f,
2970 le64toh(o->data.entry_offset),
2971 le64toh(o->data.entry_array_offset),
2972 le64toh(o->data.n_entries)-1,
2977 *to = le64toh(o->entry.monotonic);
2983 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2986 /* If we gained new header fields we gained new features,
2987 * hence suggest a rotation */
2988 if (le64toh(f->header->header_size) < sizeof(Header)) {
2989 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2993 /* Let's check if the hash tables grew over a certain fill
2994 * level (75%, borrowing this value from Java's hash table
2995 * implementation), and if so suggest a rotation. To calculate
2996 * the fill level we need the n_data field, which only exists
2997 * in newer versions. */
2999 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3000 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3001 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3003 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3004 le64toh(f->header->n_data),
3005 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3006 (unsigned long long) f->last_stat.st_size,
3007 f->last_stat.st_size / le64toh(f->header->n_data));
3011 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3012 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3013 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3015 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3016 le64toh(f->header->n_fields),
3017 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3021 /* Are the data objects properly indexed by field objects? */
3022 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3023 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3024 le64toh(f->header->n_data) > 0 &&
3025 le64toh(f->header->n_fields) == 0)
3028 if (max_file_usec > 0) {
3031 h = le64toh(f->header->head_entry_realtime);
3032 t = now(CLOCK_REALTIME);
3034 if (h > 0 && t > h + max_file_usec)