1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "journal-authenticate.h"
37 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
38 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
40 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
42 /* This is the minimum journal file size */
43 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
45 /* These are the lower and upper bounds if we deduce the max_use value
46 * from the file system size */
47 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
48 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
50 /* This is the upper bound if we deduce max_size from max_use */
51 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
53 /* This is the upper bound if we deduce the keep_free value from the
55 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
57 /* This is the keep_free value when we can't determine the system
59 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MiB */
61 /* n_data was the first entry we added after the initial file format design */
62 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
64 /* How many entries to keep in the entry array chain cache at max */
65 #define CHAIN_CACHE_MAX 20
67 /* How much to increase the journal file size at once each time we allocate something new. */
68 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8 MiB */
70 /* For the header's mmap context we pick the value one above the last defined object type */
71 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
73 static int journal_file_set_online(JournalFile *f) {
79 if (!(f->fd >= 0 && f->header))
82 if (mmap_cache_got_sigbus(f->mmap, f->fd))
85 switch(f->header->state) {
90 f->header->state = STATE_ONLINE;
99 int journal_file_set_offline(JournalFile *f) {
105 if (!(f->fd >= 0 && f->header))
108 if (f->header->state != STATE_ONLINE)
113 if (mmap_cache_got_sigbus(f->mmap, f->fd))
116 f->header->state = STATE_OFFLINE;
118 if (mmap_cache_got_sigbus(f->mmap, f->fd))
126 void journal_file_close(JournalFile *f) {
130 /* Write the final tag */
131 if (f->seal && f->writable)
132 journal_file_append_tag(f);
135 journal_file_set_offline(f);
137 if (f->mmap && f->fd >= 0)
138 mmap_cache_close_fd(f->mmap, f->fd);
144 mmap_cache_unref(f->mmap);
146 ordered_hashmap_free_free(f->chain_cache);
148 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
149 free(f->compress_buffer);
154 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
155 else if (f->fsprg_state)
156 free(f->fsprg_state);
161 gcry_md_close(f->hmac);
167 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
174 memcpy(h.signature, HEADER_SIGNATURE, 8);
175 h.header_size = htole64(ALIGN64(sizeof(h)));
177 h.incompatible_flags |= htole32(
178 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
179 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
181 h.compatible_flags = htole32(
182 f->seal * HEADER_COMPATIBLE_SEALED);
184 r = sd_id128_randomize(&h.file_id);
189 h.seqnum_id = template->header->seqnum_id;
190 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
192 h.seqnum_id = h.file_id;
194 k = pwrite(f->fd, &h, sizeof(h), 0);
204 static int journal_file_refresh_header(JournalFile *f) {
210 r = sd_id128_get_machine(&f->header->machine_id);
214 r = sd_id128_get_boot(&boot_id);
218 if (sd_id128_equal(boot_id, f->header->boot_id))
219 f->tail_entry_monotonic_valid = true;
221 f->header->boot_id = boot_id;
223 r = journal_file_set_online(f);
225 /* Sync the online state to disk */
231 static int journal_file_verify_header(JournalFile *f) {
236 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
239 /* In both read and write mode we refuse to open files with
240 * incompatible flags we don't know */
241 flags = le32toh(f->header->incompatible_flags);
242 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
243 if (flags & ~HEADER_INCOMPATIBLE_ANY)
244 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
245 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
246 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
248 log_debug("Journal file %s uses incompatible flags %"PRIx32
249 " disabled at compilation time.", f->path, flags);
250 return -EPROTONOSUPPORT;
253 /* When open for writing we also refuse to open files with
254 * compatible flags we don't know */
255 flags = le32toh(f->header->compatible_flags);
256 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
257 if (flags & ~HEADER_COMPATIBLE_ANY)
258 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
259 f->path, flags & ~HEADER_COMPATIBLE_ANY);
260 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
262 log_debug("Journal file %s uses compatible flags %"PRIx32
263 " disabled at compilation time.", f->path, flags);
264 return -EPROTONOSUPPORT;
267 if (f->header->state >= _STATE_MAX)
270 /* The first addition was n_data, so check that we are at least this large */
271 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
274 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
277 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
280 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
283 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
284 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
285 !VALID64(le64toh(f->header->tail_object_offset)) ||
286 !VALID64(le64toh(f->header->entry_array_offset)))
291 sd_id128_t machine_id;
294 r = sd_id128_get_machine(&machine_id);
298 if (!sd_id128_equal(machine_id, f->header->machine_id))
301 state = f->header->state;
303 if (state == STATE_ONLINE) {
304 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
306 } else if (state == STATE_ARCHIVED)
308 else if (state != STATE_OFFLINE) {
309 log_debug("Journal file %s has unknown state %u.", f->path, state);
314 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
315 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
317 f->seal = JOURNAL_HEADER_SEALED(f->header);
322 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
323 uint64_t old_size, new_size;
328 /* We assume that this file is not sparse, and we know that
329 * for sure, since we always call posix_fallocate()
332 if (mmap_cache_got_sigbus(f->mmap, f->fd))
336 le64toh(f->header->header_size) +
337 le64toh(f->header->arena_size);
339 new_size = PAGE_ALIGN(offset + size);
340 if (new_size < le64toh(f->header->header_size))
341 new_size = le64toh(f->header->header_size);
343 if (new_size <= old_size)
346 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
349 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
352 if (fstatvfs(f->fd, &svfs) >= 0) {
355 available = svfs.f_bfree * svfs.f_bsize;
357 if (available >= f->metrics.keep_free)
358 available -= f->metrics.keep_free;
362 if (new_size - old_size > available)
367 /* Increase by larger blocks at once */
368 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
369 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
370 new_size = f->metrics.max_size;
372 /* Note that the glibc fallocate() fallback is very
373 inefficient, hence we try to minimize the allocation area
375 r = posix_fallocate(f->fd, old_size, new_size - old_size);
379 if (fstat(f->fd, &f->last_stat) < 0)
382 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
387 static unsigned type_to_context(ObjectType type) {
388 /* One context for each type, plus one catch-all for the rest */
389 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
390 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
391 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
394 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
401 /* Avoid SIGBUS on invalid accesses */
402 if (offset + size > (uint64_t) f->last_stat.st_size) {
403 /* Hmm, out of range? Let's refresh the fstat() data
404 * first, before we trust that check. */
406 if (fstat(f->fd, &f->last_stat) < 0 ||
407 offset + size > (uint64_t) f->last_stat.st_size)
408 return -EADDRNOTAVAIL;
411 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
414 static uint64_t minimum_header_size(Object *o) {
416 static const uint64_t table[] = {
417 [OBJECT_DATA] = sizeof(DataObject),
418 [OBJECT_FIELD] = sizeof(FieldObject),
419 [OBJECT_ENTRY] = sizeof(EntryObject),
420 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
421 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
422 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
423 [OBJECT_TAG] = sizeof(TagObject),
426 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
427 return sizeof(ObjectHeader);
429 return table[o->object.type];
432 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
441 /* Objects may only be located at multiple of 64 bit */
442 if (!VALID64(offset))
445 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
450 s = le64toh(o->object.size);
452 if (s < sizeof(ObjectHeader))
455 if (o->object.type <= OBJECT_UNUSED)
458 if (s < minimum_header_size(o))
461 if (type > OBJECT_UNUSED && o->object.type != type)
464 if (s > sizeof(ObjectHeader)) {
465 r = journal_file_move_to(f, type, false, offset, s, &t);
476 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
481 r = le64toh(f->header->tail_entry_seqnum) + 1;
484 /* If an external seqnum counter was passed, we update
485 * both the local and the external one, and set it to
486 * the maximum of both */
494 f->header->tail_entry_seqnum = htole64(r);
496 if (f->header->head_entry_seqnum == 0)
497 f->header->head_entry_seqnum = htole64(r);
502 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
509 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
510 assert(size >= sizeof(ObjectHeader));
514 r = journal_file_set_online(f);
518 p = le64toh(f->header->tail_object_offset);
520 p = le64toh(f->header->header_size);
522 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
526 p += ALIGN64(le64toh(tail->object.size));
529 r = journal_file_allocate(f, p, size);
533 r = journal_file_move_to(f, type, false, p, size, &t);
540 o->object.type = type;
541 o->object.size = htole64(size);
543 f->header->tail_object_offset = htole64(p);
544 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
552 static int journal_file_setup_data_hash_table(JournalFile *f) {
559 /* We estimate that we need 1 hash table entry per 768 bytes of
560 journal file and we want to make sure we never get beyond
561 75% fill level. Calculate the hash table size for the
562 maximum file size based on these metrics. */
564 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
565 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
566 s = DEFAULT_DATA_HASH_TABLE_SIZE;
568 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
570 r = journal_file_append_object(f,
571 OBJECT_DATA_HASH_TABLE,
572 offsetof(Object, hash_table.items) + s,
577 memzero(o->hash_table.items, s);
579 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
580 f->header->data_hash_table_size = htole64(s);
585 static int journal_file_setup_field_hash_table(JournalFile *f) {
592 /* We use a fixed size hash table for the fields as this
593 * number should grow very slowly only */
595 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
596 r = journal_file_append_object(f,
597 OBJECT_FIELD_HASH_TABLE,
598 offsetof(Object, hash_table.items) + s,
603 memzero(o->hash_table.items, s);
605 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
606 f->header->field_hash_table_size = htole64(s);
611 static int journal_file_map_data_hash_table(JournalFile *f) {
618 p = le64toh(f->header->data_hash_table_offset);
619 s = le64toh(f->header->data_hash_table_size);
621 r = journal_file_move_to(f,
622 OBJECT_DATA_HASH_TABLE,
629 f->data_hash_table = t;
633 static int journal_file_map_field_hash_table(JournalFile *f) {
640 p = le64toh(f->header->field_hash_table_offset);
641 s = le64toh(f->header->field_hash_table_size);
643 r = journal_file_move_to(f,
644 OBJECT_FIELD_HASH_TABLE,
651 f->field_hash_table = t;
655 static int journal_file_link_field(
668 if (o->object.type != OBJECT_FIELD)
671 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
675 /* This might alter the window we are looking at */
676 o->field.next_hash_offset = o->field.head_data_offset = 0;
679 p = le64toh(f->field_hash_table[h].tail_hash_offset);
681 f->field_hash_table[h].head_hash_offset = htole64(offset);
683 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
687 o->field.next_hash_offset = htole64(offset);
690 f->field_hash_table[h].tail_hash_offset = htole64(offset);
692 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
693 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
698 static int journal_file_link_data(
711 if (o->object.type != OBJECT_DATA)
714 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
718 /* This might alter the window we are looking at */
719 o->data.next_hash_offset = o->data.next_field_offset = 0;
720 o->data.entry_offset = o->data.entry_array_offset = 0;
721 o->data.n_entries = 0;
724 p = le64toh(f->data_hash_table[h].tail_hash_offset);
726 /* Only entry in the hash table is easy */
727 f->data_hash_table[h].head_hash_offset = htole64(offset);
729 /* Move back to the previous data object, to patch in
732 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
736 o->data.next_hash_offset = htole64(offset);
739 f->data_hash_table[h].tail_hash_offset = htole64(offset);
741 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
742 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
747 int journal_file_find_field_object_with_hash(
749 const void *field, uint64_t size, uint64_t hash,
750 Object **ret, uint64_t *offset) {
752 uint64_t p, osize, h, m;
756 assert(field && size > 0);
758 osize = offsetof(Object, field.payload) + size;
760 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
766 p = le64toh(f->field_hash_table[h].head_hash_offset);
771 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
775 if (le64toh(o->field.hash) == hash &&
776 le64toh(o->object.size) == osize &&
777 memcmp(o->field.payload, field, size) == 0) {
787 p = le64toh(o->field.next_hash_offset);
793 int journal_file_find_field_object(
795 const void *field, uint64_t size,
796 Object **ret, uint64_t *offset) {
801 assert(field && size > 0);
803 hash = hash64(field, size);
805 return journal_file_find_field_object_with_hash(f,
810 int journal_file_find_data_object_with_hash(
812 const void *data, uint64_t size, uint64_t hash,
813 Object **ret, uint64_t *offset) {
815 uint64_t p, osize, h, m;
819 assert(data || size == 0);
821 osize = offsetof(Object, data.payload) + size;
823 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
828 p = le64toh(f->data_hash_table[h].head_hash_offset);
833 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
837 if (le64toh(o->data.hash) != hash)
840 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
841 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
845 l = le64toh(o->object.size);
846 if (l <= offsetof(Object, data.payload))
849 l -= offsetof(Object, data.payload);
851 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
852 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
857 memcmp(f->compress_buffer, data, size) == 0) {
868 return -EPROTONOSUPPORT;
870 } else if (le64toh(o->object.size) == osize &&
871 memcmp(o->data.payload, data, size) == 0) {
883 p = le64toh(o->data.next_hash_offset);
889 int journal_file_find_data_object(
891 const void *data, uint64_t size,
892 Object **ret, uint64_t *offset) {
897 assert(data || size == 0);
899 hash = hash64(data, size);
901 return journal_file_find_data_object_with_hash(f,
906 static int journal_file_append_field(
908 const void *field, uint64_t size,
909 Object **ret, uint64_t *offset) {
917 assert(field && size > 0);
919 hash = hash64(field, size);
921 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
935 osize = offsetof(Object, field.payload) + size;
936 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
940 o->field.hash = htole64(hash);
941 memcpy(o->field.payload, field, size);
943 r = journal_file_link_field(f, o, p, hash);
947 /* The linking might have altered the window, so let's
948 * refresh our pointer */
949 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
954 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
/* Appends a data object carrying the payload data[0..size) to the journal
 * file. The payload is hashed and looked up first via
 * journal_file_find_data_object_with_hash(); presumably an existing match is
 * reused (that branch is not visible here -- confirm). Otherwise a new
 * OBJECT_DATA is allocated, optionally compressed, linked into the data hash
 * table, and -- if the payload has the form "NAME=..." with a non-empty
 * NAME -- chained onto the matching field object. Finally the object is
 * fed to journal_file_hmac_put_object(). */
968 static int journal_file_append_data(
970 const void *data, uint64_t size,
971 Object **ret, uint64_t *offset) {
976 int r, compression = 0;
980 assert(data || size == 0);
982 hash = hash64(data, size);
984 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
998 osize = offsetof(Object, data.payload) + size;
999 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1003 o->data.hash = htole64(hash);
1005 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
/* Only payloads of at least COMPRESSION_SIZE_THRESHOLD bytes are handed
 * to compress_blob(); the stored object size is then rewritten to the
 * compressed length. */
1006 if (f->compress_xz &&
1007 size >= COMPRESSION_SIZE_THRESHOLD) {
1010 compression = compress_blob(data, size, o->data.payload, &rsize);
1013 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1014 o->object.flags |= compression;
1016 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1017 size, rsize, object_compressed_to_string(compression));
/* Uncompressed path: copy the payload verbatim (nothing to copy for an
 * empty payload). */
1022 if (!compression && size > 0)
1023 memcpy(o->data.payload, data, size);
1025 r = journal_file_link_data(f, o, p, hash);
1029 /* The linking might have altered the window, so let's
1030 * refresh our pointer */
1031 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* The field name is everything before the first '='; a payload that
 * starts with '=' or contains none gets no field object. */
1038 eq = memchr(data, '=', size);
1039 if (eq && eq > data) {
1043 /* Create field object ... */
1044 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1048 /* ... and link it in. */
1049 o->data.next_field_offset = fo->field.head_data_offset;
/* p is a host-order offset being stored into an on-disk (little-endian)
 * field, so the conversion is host -> LE, as for every other offset
 * store in this file. */
1050 fo->field.head_data_offset = htole64(p);
1054 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1068 uint64_t journal_file_entry_n_items(Object *o) {
1071 if (o->object.type != OBJECT_ENTRY)
1074 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1077 uint64_t journal_file_entry_array_n_items(Object *o) {
1080 if (o->object.type != OBJECT_ENTRY_ARRAY)
1083 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1086 uint64_t journal_file_hash_table_n_items(Object *o) {
1089 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1090 o->object.type != OBJECT_FIELD_HASH_TABLE)
1093 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1096 static int link_entry_into_array(JournalFile *f,
1101 uint64_t n = 0, ap = 0, q, i, a, hidx;
1109 a = le64toh(*first);
1110 i = hidx = le64toh(*idx);
1113 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1117 n = journal_file_entry_array_n_items(o);
1119 o->entry_array.items[i] = htole64(p);
1120 *idx = htole64(hidx + 1);
1126 a = le64toh(o->entry_array.next_entry_array_offset);
1137 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1138 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1144 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1149 o->entry_array.items[i] = htole64(p);
1152 *first = htole64(q);
1154 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1158 o->entry_array.next_entry_array_offset = htole64(q);
1161 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1162 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1164 *idx = htole64(hidx + 1);
1169 static int link_entry_into_array_plus_one(JournalFile *f,
1184 *extra = htole64(p);
1188 i = htole64(le64toh(*idx) - 1);
1189 r = link_entry_into_array(f, first, &i, p);
1194 *idx = htole64(le64toh(*idx) + 1);
1198 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1205 p = le64toh(o->entry.items[i].object_offset);
1209 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1213 return link_entry_into_array_plus_one(f,
1214 &o->data.entry_offset,
1215 &o->data.entry_array_offset,
1220 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1228 if (o->object.type != OBJECT_ENTRY)
1231 __sync_synchronize();
1233 /* Link up the entry itself */
1234 r = link_entry_into_array(f,
1235 &f->header->entry_array_offset,
1236 &f->header->n_entries,
1241 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1243 if (f->header->head_entry_realtime == 0)
1244 f->header->head_entry_realtime = o->entry.realtime;
1246 f->header->tail_entry_realtime = o->entry.realtime;
1247 f->header->tail_entry_monotonic = o->entry.monotonic;
1249 f->tail_entry_monotonic_valid = true;
1251 /* Link up the items */
1252 n = journal_file_entry_n_items(o);
1253 for (i = 0; i < n; i++) {
1254 r = journal_file_link_entry_item(f, o, offset, i);
1262 static int journal_file_append_entry_internal(
1264 const dual_timestamp *ts,
1266 const EntryItem items[], unsigned n_items,
1268 Object **ret, uint64_t *offset) {
1275 assert(items || n_items == 0);
1278 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1280 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1284 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1285 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1286 o->entry.realtime = htole64(ts->realtime);
1287 o->entry.monotonic = htole64(ts->monotonic);
1288 o->entry.xor_hash = htole64(xor_hash);
1289 o->entry.boot_id = f->header->boot_id;
1292 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1297 r = journal_file_link_entry(f, o, np);
1310 void journal_file_post_change(JournalFile *f) {
1313 /* inotify() does not receive IN_MODIFY events from file
1314 * accesses done via mmap(). After each access we hence
1315 * truncate the journal file to its current size, which
1316 * triggers IN_MODIFY. */
1318 __sync_synchronize();
1320 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1321 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1324 static int entry_item_cmp(const void *_a, const void *_b) {
1325 const EntryItem *a = _a, *b = _b;
1327 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1329 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1334 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1338 uint64_t xor_hash = 0;
1339 struct dual_timestamp _ts;
1342 assert(iovec || n_iovec == 0);
1345 dual_timestamp_get(&_ts);
1349 if (f->tail_entry_monotonic_valid &&
1350 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1354 r = journal_file_maybe_append_tag(f, ts->realtime);
1359 /* alloca() can't take 0, hence let's allocate at least one */
1360 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1362 for (i = 0; i < n_iovec; i++) {
1366 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1370 xor_hash ^= le64toh(o->data.hash);
1371 items[i].object_offset = htole64(p);
1372 items[i].hash = o->data.hash;
1375 /* Order by the position on disk, in order to improve seek
1376 * times for rotating media. */
1377 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1379 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1381 /* If the memory mapping triggered a SIGBUS then we return an
1382 * IO error and ignore the error code passed down to us, since
1383 * it is very likely just an effect of a nullified replacement
1386 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1389 journal_file_post_change(f);
1394 typedef struct ChainCacheItem {
1395 uint64_t first; /* the array at the beginning of the chain */
1396 uint64_t array; /* the cached array */
1397 uint64_t begin; /* the first item in the cached array */
1398 uint64_t total; /* the total number of items in all arrays before this one in the chain */
1399 uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
1402 static void chain_cache_put(
1409 uint64_t last_index) {
1412 /* If the chain item to cache for this chain is the
1413 * first one it's not worth caching anything */
1417 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1418 ci = ordered_hashmap_steal_first(h);
1421 ci = new(ChainCacheItem, 1);
1428 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1433 assert(ci->first == first);
1438 ci->last_index = last_index;
1441 static int generic_array_get(
1445 Object **ret, uint64_t *offset) {
1448 uint64_t p = 0, a, t = 0;
1456 /* Try the chain cache first */
1457 ci = ordered_hashmap_get(f->chain_cache, &first);
1458 if (ci && i > ci->total) {
1467 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1471 k = journal_file_entry_array_n_items(o);
1473 p = le64toh(o->entry_array.items[i]);
1479 a = le64toh(o->entry_array.next_entry_array_offset);
1485 /* Let's cache this item for the next invocation */
1486 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1488 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1501 static int generic_array_get_plus_one(
1506 Object **ret, uint64_t *offset) {
1515 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1528 return generic_array_get(f, first, i-1, ret, offset);
1537 static int generic_array_bisect(
1542 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1543 direction_t direction,
1548 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1549 bool subtract_one = false;
1550 Object *o, *array = NULL;
1555 assert(test_object);
1557 /* Start with the first array in the chain */
1560 ci = ordered_hashmap_get(f->chain_cache, &first);
1561 if (ci && n > ci->total) {
1562 /* Ah, we have iterated this bisection array chain
1563 * previously! Let's see if we can skip ahead in the
1564 * chain, as far as the last time. But we can't jump
1565 * backwards in the chain, so let's check that
1568 r = test_object(f, ci->begin, needle);
1572 if (r == TEST_LEFT) {
1573 /* OK, what we are looking for is right of the
1574 * begin of this EntryArray, so let's jump
1575 * straight to previously cached array in the
1581 last_index = ci->last_index;
1586 uint64_t left, right, k, lp;
1588 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1592 k = journal_file_entry_array_n_items(array);
1598 lp = p = le64toh(array->entry_array.items[i]);
1602 r = test_object(f, p, needle);
1606 if (r == TEST_FOUND)
1607 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1609 if (r == TEST_RIGHT) {
1613 if (last_index != (uint64_t) -1) {
1614 assert(last_index <= right);
1616 /* If we cached the last index we
1617 * looked at, let's try not to jump
1618 * around too wildly and see if we can
1619 * limit the range to look at early to
1620 * the immediate neighbors of the last
1621 * index we looked at. */
1623 if (last_index > 0) {
1624 uint64_t x = last_index - 1;
1626 p = le64toh(array->entry_array.items[x]);
1630 r = test_object(f, p, needle);
1634 if (r == TEST_FOUND)
1635 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1637 if (r == TEST_RIGHT)
1643 if (last_index < right) {
1644 uint64_t y = last_index + 1;
1646 p = le64toh(array->entry_array.items[y]);
1650 r = test_object(f, p, needle);
1654 if (r == TEST_FOUND)
1655 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1657 if (r == TEST_RIGHT)
1665 if (left == right) {
1666 if (direction == DIRECTION_UP)
1667 subtract_one = true;
1673 assert(left < right);
1674 i = (left + right) / 2;
1676 p = le64toh(array->entry_array.items[i]);
1680 r = test_object(f, p, needle);
1684 if (r == TEST_FOUND)
1685 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1687 if (r == TEST_RIGHT)
1695 if (direction == DIRECTION_UP) {
1697 subtract_one = true;
1708 last_index = (uint64_t) -1;
1709 a = le64toh(array->entry_array.next_entry_array_offset);
1715 if (subtract_one && t == 0 && i == 0)
1718 /* Let's cache this item for the next invocation */
1719 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1721 if (subtract_one && i == 0)
1723 else if (subtract_one)
1724 p = le64toh(array->entry_array.items[i-1]);
1726 p = le64toh(array->entry_array.items[i]);
1728 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1739 *idx = t + i + (subtract_one ? -1 : 0);
1744 static int generic_array_bisect_plus_one(
1750 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1751 direction_t direction,
1757 bool step_back = false;
1761 assert(test_object);
1766 /* This bisects the array in object 'first', but first checks
1768 r = test_object(f, extra, needle);
1772 if (r == TEST_FOUND)
1773 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1775 /* if we are looking with DIRECTION_UP then we need to first
1776 see if in the actual array there is a matching entry, and
1777 return the last one of that. But if there isn't any we need
1778 to return this one. Hence remember this, and return it
1781 step_back = direction == DIRECTION_UP;
1783 if (r == TEST_RIGHT) {
1784 if (direction == DIRECTION_DOWN)
1790 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1792 if (r == 0 && step_back)
1801 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1817 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1823 else if (p < needle)
1829 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1836 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1840 if (le64toh(o->entry.seqnum) == needle)
1842 else if (le64toh(o->entry.seqnum) < needle)
1848 int journal_file_move_to_entry_by_seqnum(
1851 direction_t direction,
1855 return generic_array_bisect(f,
1856 le64toh(f->header->entry_array_offset),
1857 le64toh(f->header->n_entries),
1864 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1871 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1875 if (le64toh(o->entry.realtime) == needle)
1877 else if (le64toh(o->entry.realtime) < needle)
1883 int journal_file_move_to_entry_by_realtime(
1886 direction_t direction,
1890 return generic_array_bisect(f,
1891 le64toh(f->header->entry_array_offset),
1892 le64toh(f->header->n_entries),
1894 test_object_realtime,
// Monotonic comparator for the array bisection helpers: maps the entry object
// at offset p and compares its monotonic field (converted with le64toh) to
// needle.
// NOTE(review): the error check after the move and the value returned by each
// branch are on lines missing from this excerpt; presumably TEST_FOUND,
// TEST_LEFT or TEST_RIGHT - confirm.
1899 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1906 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1910 if (le64toh(o->entry.monotonic) == needle)
1912 else if (le64toh(o->entry.monotonic) < needle)
// Looks up the data object for the field _BOOT_ID=<id>. The match string is
// built in a stack buffer sized for the prefix without its NUL, 32 bytes for
// the output of sd_id128_to_string and one terminating NUL; the lookup itself
// is delegated to journal_file_find_data_object, whose result is returned.
1918 static inline int find_data_object_by_boot_id(
1923 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
// The formatted id is written directly behind the 9-character prefix.
1925 sd_id128_to_string(boot_id, t + 9);
// sizeof(t) - 1 leaves the terminating NUL out of the length passed on.
1926 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
// Seeks to an entry by monotonic timestamp within one boot. First resolves the
// _BOOT_ID data object for boot_id, then bisects the entries linked from that
// data object (entry_offset, entry_array_offset, n_entries) with
// test_object_monotonic as comparator.
// NOTE(review): the handling of the lookup result r and the needle and output
// arguments are on lines missing from this excerpt.
1929 int journal_file_move_to_entry_by_monotonic(
1933 direction_t direction,
1942 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1948 return generic_array_bisect_plus_one(f,
1949 le64toh(o->data.entry_offset),
1950 le64toh(o->data.entry_array_offset),
1951 le64toh(o->data.n_entries),
1953 test_object_monotonic,
// Resets the cursor state kept in f: sets the location type to LOCATION_HEAD
// and clears the cached offset, seqnum, realtime and monotonic timestamps,
// boot id and xor hash of the current entry.
1958 void journal_file_reset_location(JournalFile *f) {
1959 f->location_type = LOCATION_HEAD;
1960 f->current_offset = 0;
1961 f->current_seqnum = 0;
1962 f->current_realtime = 0;
1963 f->current_monotonic = 0;
1964 zero(f->current_boot_id);
1965 f->current_xor_hash = 0;
// Records entry object o, located at offset, as the current cursor position of
// f. Stores the direction of travel, switches the location type to
// LOCATION_SEEK and caches the seqnum, realtime, monotonic, boot id and xor
// hash of the entry. Integer fields are converted with le64toh; the boot id is
// copied as is.
1968 void journal_file_save_location(JournalFile *f, direction_t direction, Object *o, uint64_t offset) {
1969 f->last_direction = direction;
1970 f->location_type = LOCATION_SEEK;
1971 f->current_offset = offset;
1972 f->current_seqnum = le64toh(o->entry.seqnum);
1973 f->current_realtime = le64toh(o->entry.realtime);
1974 f->current_monotonic = le64toh(o->entry.monotonic);
1975 f->current_boot_id = o->entry.boot_id;
1976 f->current_xor_hash = le64toh(o->entry.xor_hash);
// Orders the cached cursor positions of two journal files. Both files must be
// in LOCATION_SEEK state (asserted). The keys are tried in this order:
//   1. boot id, monotonic, realtime and xor hash all equal: same entry;
//   2. same seqnum_id in both headers: order by seqnum;
//   3. same boot id: order by monotonic timestamp;
//   4. order by realtime timestamp;
//   5. order by xor hash.
// NOTE(review): the return statements are on lines missing from this excerpt;
// presumably negative, zero or positive in the usual comparator convention.
1979 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
1982 assert(af->location_type == LOCATION_SEEK);
1983 assert(bf->location_type == LOCATION_SEEK);
1985 /* If contents and timestamps match, these entries are
1986 * identical, even if the seqnum does not match */
1987 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
1988 af->current_monotonic == bf->current_monotonic &&
1989 af->current_realtime == bf->current_realtime &&
1990 af->current_xor_hash == bf->current_xor_hash)
1993 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
1995 /* If this is from the same seqnum source, compare
1997 if (af->current_seqnum < bf->current_seqnum)
1999 if (af->current_seqnum > bf->current_seqnum)
2002 /* Wow! This is weird, different data but the same
2003 * seqnums? Something is borked, but let's make the
2004 * best of it and compare by time. */
2007 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2009 /* If the boot id matches, compare monotonic time */
2010 if (af->current_monotonic < bf->current_monotonic)
2012 if (af->current_monotonic > bf->current_monotonic)
2016 /* Otherwise, compare UTC time */
2017 if (af->current_realtime < bf->current_realtime)
2019 if (af->current_realtime > bf->current_realtime)
2022 /* Finally, compare by contents */
2023 if (af->current_xor_hash < bf->current_xor_hash)
2025 if (af->current_xor_hash > bf->current_xor_hash)
/* Steps to the neighbouring entry of the global entry array in the given
 * direction and hands it back through ret and offset.
 *
 * Visible flow: read the number of entries from the header; pick the first
 * index (DIRECTION_DOWN) or the last index n - 1 (other direction) as the
 * starting point, or bisect the entry array to find the index to start from;
 * then fetch the entry at the chosen index with generic_array_get.
 *
 * NOTE(review): the conditions selecting between these paths, the index
 * adjustment inside the DIRECTION_DOWN branch and all return statements are on
 * lines missing from this excerpt. */
2031 int journal_file_next_entry(
2034 direction_t direction,
2035 Object **ret, uint64_t *offset) {
2042 n = le64toh(f->header->n_entries);
2047 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2049 r = generic_array_bisect(f,
2050 le64toh(f->header->entry_array_offset),
2051 le64toh(f->header->n_entries),
2060 if (direction == DIRECTION_DOWN) {
2073 /* And jump to it */
2074 r = generic_array_get(f,
2075 le64toh(f->header->entry_array_offset),
/* Sanity check: an offset ofs that does not lie strictly beyond p in the
 * direction of travel is reported as entry array corruption. */
2082 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2083 log_debug("%s: entry array corrupted at entry %"PRIu64,
/* Like stepping through the global entry array, but restricted to the entries
 * that reference the data object at data_offset.
 *
 * o and p describe the entry to continue from; the assertion requires a
 * non-zero p whenever o is given. The data object is mapped, its n_entries is
 * read, and the start index is either the first or last linked entry or the
 * result of bisecting the linked entries. The selected entry is finally fetched
 * with generic_array_get_plus_one, whose result is returned.
 *
 * NOTE(review): the conditions selecting between these paths, the result of the
 * object type check and the index adjustment are on lines missing from this
 * excerpt. */
2094 int journal_file_next_entry_for_data(
2096 Object *o, uint64_t p,
2097 uint64_t data_offset,
2098 direction_t direction,
2099 Object **ret, uint64_t *offset) {
2106 assert(p > 0 || !o);
2108 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2112 n = le64toh(d->data.n_entries);
2117 i = direction == DIRECTION_DOWN ? 0 : n - 1;
/* Only entry objects are accepted as the position to continue from. */
2119 if (o->object.type != OBJECT_ENTRY)
2122 r = generic_array_bisect_plus_one(f,
2123 le64toh(d->data.entry_offset),
2124 le64toh(d->data.entry_array_offset),
2125 le64toh(d->data.n_entries),
2135 if (direction == DIRECTION_DOWN) {
2149 return generic_array_get_plus_one(f,
2150 le64toh(d->data.entry_offset),
2151 le64toh(d->data.entry_array_offset),
/* Seeks among the entries linked from the data object at data_offset: maps the
 * data object and bisects its entries (entry_offset, entry_array_offset,
 * n_entries), returning the result of generic_array_bisect_plus_one.
 *
 * NOTE(review): the needle and comparator are on lines missing from this
 * excerpt; presumably the entry offset and test_object_offset - confirm. */
2156 int journal_file_move_to_entry_by_offset_for_data(
2158 uint64_t data_offset,
2160 direction_t direction,
2161 Object **ret, uint64_t *offset) {
2168 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2172 return generic_array_bisect_plus_one(f,
2173 le64toh(d->data.entry_offset),
2174 le64toh(d->data.entry_array_offset),
2175 le64toh(d->data.n_entries),
/* Seeks to an entry by monotonic timestamp within one boot, restricted to the
 * entries that reference the data object at data_offset.
 *
 * Two entry lists are involved: the one linked from the _BOOT_ID data object
 * (found via find_data_object_by_boot_id, its offset kept in b) and the one
 * linked from the data object at data_offset. The first bisection seeks by
 * time in the boot id list; the following bisections alternate between the two
 * lists until an entry present in both is found.
 *
 * NOTE(review): the loop construct, the needles and comparators of the later
 * bisections and the termination condition are on lines missing from this
 * excerpt. */
2182 int journal_file_move_to_entry_by_monotonic_for_data(
2184 uint64_t data_offset,
2187 direction_t direction,
2188 Object **ret, uint64_t *offset) {
2196 /* First, seek by time */
2197 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2203 r = generic_array_bisect_plus_one(f,
2204 le64toh(o->data.entry_offset),
2205 le64toh(o->data.entry_array_offset),
2206 le64toh(o->data.n_entries),
2208 test_object_monotonic,
2214 /* And now, continue seeking until we find an entry that
2215 * exists in both bisection arrays */
/* Map the requested data object and bisect its list of entries. */
2221 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2225 r = generic_array_bisect_plus_one(f,
2226 le64toh(d->data.entry_offset),
2227 le64toh(d->data.entry_array_offset),
2228 le64toh(d->data.n_entries),
/* Map the boot id data object again through its saved offset b and bisect
 * its list of entries. */
2236 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2240 r = generic_array_bisect_plus_one(f,
2241 le64toh(o->data.entry_offset),
2242 le64toh(o->data.entry_array_offset),
2243 le64toh(o->data.n_entries),
/* Seeks among the entries linked from the data object at data_offset: maps the
 * data object and bisects its entries (entry_offset, entry_array_offset,
 * n_entries), returning the result of generic_array_bisect_plus_one.
 *
 * NOTE(review): the needle and comparator are on lines missing from this
 * excerpt; presumably the seqnum and test_object_seqnum - confirm. */
2265 int journal_file_move_to_entry_by_seqnum_for_data(
2267 uint64_t data_offset,
2269 direction_t direction,
2270 Object **ret, uint64_t *offset) {
2277 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2281 return generic_array_bisect_plus_one(f,
2282 le64toh(d->data.entry_offset),
2283 le64toh(d->data.entry_array_offset),
2284 le64toh(d->data.n_entries),
/* Seeks by realtime timestamp among the entries linked from the data object at
 * data_offset: maps the data object and bisects its entries (entry_offset,
 * entry_array_offset, n_entries) with test_object_realtime as comparator,
 * returning the result of generic_array_bisect_plus_one.
 *
 * NOTE(review): the needle and output arguments are on lines missing from this
 * excerpt. */
2291 int journal_file_move_to_entry_by_realtime_for_data(
2293 uint64_t data_offset,
2295 direction_t direction,
2296 Object **ret, uint64_t *offset) {
2303 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2307 return generic_array_bisect_plus_one(f,
2308 le64toh(d->data.entry_offset),
2309 le64toh(d->data.entry_array_offset),
2310 le64toh(d->data.n_entries),
2312 test_object_realtime,
/* Debugging aid: prints the file header followed by one line per object found
 * in the file, written to stdout with printf.
 *
 * The walk starts right behind the header (at header_size), maps each object
 * without restricting its type (OBJECT_UNUSED), prints the type name and, for
 * entry and tag objects, a few of their fields. It stops once the object at
 * tail_object_offset has been printed; otherwise it advances by the object
 * size rounded up with ALIGN64.
 *
 * NOTE(review): the loop construct, the case labels of the first branches and
 * the jump to the error message are on lines missing from this excerpt. */
2317 void journal_file_dump(JournalFile *f) {
2324 journal_file_print_header(f);
2326 p = le64toh(f->header->header_size);
2328 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2332 switch (o->object.type) {
2335 printf("Type: OBJECT_UNUSED\n");
2339 printf("Type: OBJECT_DATA\n");
2343 printf("Type: OBJECT_FIELD\n");
2347 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2348 le64toh(o->entry.seqnum),
2349 le64toh(o->entry.monotonic),
2350 le64toh(o->entry.realtime));
2353 case OBJECT_FIELD_HASH_TABLE:
2354 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2357 case OBJECT_DATA_HASH_TABLE:
2358 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2361 case OBJECT_ENTRY_ARRAY:
2362 printf("Type: OBJECT_ENTRY_ARRAY\n");
2366 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2367 le64toh(o->tag.seqnum),
2368 le64toh(o->tag.epoch));
2372 printf("Type: unknown (%u)\n", o->object.type);
/* Report the compression scheme when any compression flag bit is set. */
2376 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2377 printf("Flags: %s\n",
2378 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
/* The tail object is the last one to visit. */
2380 if (p == le64toh(f->header->tail_object_offset))
2383 p = p + ALIGN64(le64toh(o->object.size));
2388 log_error("File corrupt");
/* Wrapper around format_timestamp that formats t into buf of size l.
 *
 * NOTE(review): what happens with the result x is on lines missing from this
 * excerpt; presumably a placeholder string is returned when formatting fails,
 * so that the result can be passed to printf without a NULL check - confirm. */
2391 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2394 x = format_timestamp(buf, l, t);
/* Prints a human readable summary of the header of f to stdout.
 *
 * The first printf covers the fields visible in its format string and argument
 * list: path, the 128-bit ids, state, flag sets, sizes, seqnum range,
 * timestamps and object counts. Fields added to the header format later
 * (n_data, n_fields, n_tags, n_entry_arrays) are printed only when
 * JOURNAL_HEADER_CONTAINS reports them present. Finally the disk usage is
 * printed if fstat on the file descriptor succeeds.
 *
 * NOTE(review): z is declared with FORMAT_TIMESTAMP_MAX but is filled by
 * format_timespan; confirm that this size is adequate for a time span. */
2400 void journal_file_print_header(JournalFile *f) {
/* Scratch buffers: a to d receive the formatted 128-bit ids, x to z the
 * formatted timestamps. */
2401 char a[33], b[33], c[33], d[33];
2402 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2404 char bytes[FORMAT_BYTES_MAX];
2408 printf("File Path: %s\n"
2412 "Sequential Number ID: %s\n"
2414 "Compatible Flags:%s%s\n"
2415 "Incompatible Flags:%s%s%s\n"
2416 "Header size: %"PRIu64"\n"
2417 "Arena size: %"PRIu64"\n"
2418 "Data Hash Table Size: %"PRIu64"\n"
2419 "Field Hash Table Size: %"PRIu64"\n"
2420 "Rotate Suggested: %s\n"
2421 "Head Sequential Number: %"PRIu64"\n"
2422 "Tail Sequential Number: %"PRIu64"\n"
2423 "Head Realtime Timestamp: %s\n"
2424 "Tail Realtime Timestamp: %s\n"
2425 "Tail Monotonic Timestamp: %s\n"
2426 "Objects: %"PRIu64"\n"
2427 "Entry Objects: %"PRIu64"\n",
2429 sd_id128_to_string(f->header->file_id, a),
2430 sd_id128_to_string(f->header->machine_id, b),
2431 sd_id128_to_string(f->header->boot_id, c),
2432 sd_id128_to_string(f->header->seqnum_id, d),
2433 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2434 f->header->state == STATE_ONLINE ? "ONLINE" :
2435 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2436 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2437 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2438 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2439 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2440 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2441 le64toh(f->header->header_size),
2442 le64toh(f->header->arena_size),
2443 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2444 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2445 yes_no(journal_file_rotate_suggested(f, 0)),
2446 le64toh(f->header->head_entry_seqnum),
2447 le64toh(f->header->tail_entry_seqnum),
2448 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2449 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2450 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2451 le64toh(f->header->n_objects),
2452 le64toh(f->header->n_entries));
/* Fill level is the object count relative to the number of hash table
 * slots, where the slot count is the table size divided by
 * sizeof(HashItem). */
2454 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2455 printf("Data Objects: %"PRIu64"\n"
2456 "Data Hash Table Fill: %.1f%%\n",
2457 le64toh(f->header->n_data),
2458 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2460 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2461 printf("Field Objects: %"PRIu64"\n"
2462 "Field Hash Table Fill: %.1f%%\n",
2463 le64toh(f->header->n_fields),
2464 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2466 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2467 printf("Tag Objects: %"PRIu64"\n",
2468 le64toh(f->header->n_tags));
2469 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2470 printf("Entry Array Objects: %"PRIu64"\n",
2471 le64toh(f->header->n_entry_arrays));
/* Disk usage is derived from the allocated block count times 512. */
2473 if (fstat(f->fd, &st) >= 0)
2474 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
/* Opens, and if needed initialises, a journal file and hands the new
 * JournalFile object back through ret.
 *
 * Visible flow:
 *   - accept only O_RDONLY or O_RDWR access modes and only file names ending
 *     in .journal or .journal~;
 *   - allocate the object, derive the mapping protection and the writable
 *     flag from flags, record the compression setting, take a reference on
 *     the supplied mmap cache or create a new one, copy the path and create
 *     the chain cache;
 *   - open the file and stat it; an empty writable file is treated as new:
 *     it gets a creation time attribute, the sealing state is loaded and a
 *     fresh header is written, taking values from template;
 *   - reject files smaller than HEADER_SIZE_MIN, map the header and, for
 *     existing files, verify it;
 *   - settle the metrics, refresh the header, set up the HMAC state, create
 *     the hash tables and the first tag for new files, map the hash tables;
 *   - check whether a SIGBUS was recorded for this file by the mmap cache;
 *   - on the failure path close the partially set up object.
 *
 * NOTE(review): the leading parameters, the conditions guarding several of
 * these steps, the error codes and the labels are on lines missing from this
 * excerpt. */
2477 int journal_file_open(
2483 JournalMetrics *metrics,
2484 MMapCache *mmap_cache,
2485 JournalFile *template,
2486 JournalFile **ret) {
2488 bool newly_created = false;
2496 if ((flags & O_ACCMODE) != O_RDONLY &&
2497 (flags & O_ACCMODE) != O_RDWR)
2500 if (!endswith(fname, ".journal") &&
2501 !endswith(fname, ".journal~"))
2504 f = new0(JournalFile, 1);
2512 f->prot = prot_from_flags(flags);
2513 f->writable = (flags & O_ACCMODE) != O_RDONLY;
/* Only one compression flag is set: LZ4 is used when built with HAVE_LZ4,
 * XZ otherwise when built with HAVE_XZ. */
2514 #if defined(HAVE_LZ4)
2515 f->compress_lz4 = compress;
2516 #elif defined(HAVE_XZ)
2517 f->compress_xz = compress;
2524 f->mmap = mmap_cache_ref(mmap_cache);
2526 f->mmap = mmap_cache_new();
2533 f->path = strdup(fname);
2539 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2540 if (!f->chain_cache) {
2545 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2551 if (fstat(f->fd, &f->last_stat) < 0) {
2556 if (f->last_stat.st_size == 0 && f->writable) {
2557 /* Let's attach the creation time to the journal file,
2558 * so that the vacuuming code knows the age of this
2559 * file even if the file might end up corrupted one
2560 * day... Ideally we'd just use the creation time many
2561 * file systems maintain for each file, but there is
2562 * currently no usable API to query this, hence let's
2563 * emulate this via extended attributes. If extended
2564 * attributes are not supported we'll just skip this,
2565 * and rely solely on mtime/atime/ctime of the file. */
2567 fd_setcrtime(f->fd, now(CLOCK_REALTIME));
2570 /* Try to load the FSPRG state, and if we can't, then
2571 * just don't do sealing */
2573 r = journal_file_fss_load(f);
2579 r = journal_file_init_header(f, template);
/* Stat again so that last_stat reflects the file after the header has been
 * written. */
2583 if (fstat(f->fd, &f->last_stat) < 0) {
2588 newly_created = true;
2591 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2596 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2604 if (!newly_created) {
2605 r = journal_file_verify_header(f);
/* For new files the sealing state was already loaded above. */
2611 if (!newly_created && f->writable) {
2612 r = journal_file_fss_load(f);
/* Caller supplied metrics are completed with defaults and copied; failing
 * that, the metrics of the template file are inherited. */
2620 journal_default_metrics(metrics, f->fd);
2621 f->metrics = *metrics;
2622 } else if (template)
2623 f->metrics = template->metrics;
2625 r = journal_file_refresh_header(f);
2631 r = journal_file_hmac_setup(f);
2636 if (newly_created) {
2637 r = journal_file_setup_field_hash_table(f);
2641 r = journal_file_setup_data_hash_table(f);
2646 r = journal_file_append_first_tag(f);
2652 r = journal_file_map_field_hash_table(f);
2656 r = journal_file_map_data_hash_table(f);
2660 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2669 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2672 journal_file_close(f);
/* Rotates the journal file referenced by *f: the current file is renamed to an
 * archive name, marked as archived and closed, and a new file is opened under
 * the original path.
 *
 * Only writable files whose path ends in .journal are handled. The archive
 * name is the original path without that suffix, followed by
 * @<seqnum id>-<head seqnum>-<head realtime>.journal. The new file is opened
 * with the flags, mode and mmap cache of the old file, and the old file is
 * passed as template.
 *
 * NOTE(review): the assignment of old_file, the error returns and the update
 * of *f are on lines missing from this excerpt; presumably old_file is *f. */
2677 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2678 _cleanup_free_ char *p = NULL;
2680 JournalFile *old_file, *new_file = NULL;
2688 if (!old_file->writable)
2691 if (!endswith(old_file->path, ".journal"))
2694 l = strlen(old_file->path);
/* l - 8 cuts the 8 characters of the .journal suffix off the path. */
2695 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2696 (int) l - 8, old_file->path,
2697 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2698 le64toh((*f)->header->head_entry_seqnum),
2699 le64toh((*f)->header->head_entry_realtime));
2703 r = rename(old_file->path, p);
2707 old_file->header->state = STATE_ARCHIVED;
/* old_file->path still holds the original name, so the new file takes the
 * place of the one that was just renamed. */
2709 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2710 journal_file_close(old_file);
/* Opens a journal file like journal_file_open, but when the first attempt
 * fails with one of the error codes listed below, moves the offending file out
 * of the way and tries once more.
 *
 * The retry is considered only for writable opens with O_CREAT set on files
 * named *.journal. The broken file is renamed to a name ending in .journal~
 * that embeds the current realtime clock value, a warning is logged and
 * journal_file_open is called again with the same arguments.
 *
 * NOTE(review): the leading parameters, the return statements of the early
 * exits and the remaining asprintf arguments are on lines missing from this
 * excerpt. */
2716 int journal_file_open_reliably(
2722 JournalMetrics *metrics,
2723 MMapCache *mmap_cache,
2724 JournalFile *template,
2725 JournalFile **ret) {
2729 _cleanup_free_ char *p = NULL;
2731 r = journal_file_open(fname, flags, mode, compress, seal,
2732 metrics, mmap_cache, template, ret);
2733 if (r != -EBADMSG && /* corrupted */
2734 r != -ENODATA && /* truncated */
2735 r != -EHOSTDOWN && /* other machine */
2736 r != -EPROTONOSUPPORT && /* incompatible feature */
2737 r != -EBUSY && /* unclean shutdown */
2738 r != -ESHUTDOWN && /* already archived */
2739 r != -EIO /* IO error, including SIGBUS on mmap */)
2742 if ((flags & O_ACCMODE) == O_RDONLY)
2745 if (!(flags & O_CREAT))
2748 if (!endswith(fname, ".journal"))
2751 /* The file is corrupted. Rotate it away and try it again (but only once) */
2754 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2756 (unsigned long long) now(CLOCK_REALTIME),
2760 r = rename(fname, p);
2764 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2766 return journal_file_open(fname, flags, mode, compress, seal,
2767 metrics, mmap_cache, template, ret);
/* Copies the entry object o, located at offset p in file from, into file to.
 *
 * The timestamps of the entry are carried over. For every item of the entry
 * the referenced data object is mapped in the source file, its stored hash is
 * checked against the hash recorded in the entry item, its payload is
 * decompressed if a compression flag is set, and the payload is appended to the
 * destination file. The offsets and hashes of the appended data objects are
 * collected in items and their hashes are folded into xor_hash. The new entry
 * is then appended to the destination with
 * journal_file_append_entry_internal.
 *
 * seqnum, ret and offset are passed through to the append call.
 *
 * NOTE(review): the error returns, the declaration of t and the code following
 * the final SIGBUS check are on lines missing from this excerpt.
 * NOTE(review): n is read from the source entry and sizes an alloca; confirm
 * that callers bound the item count so the stack cannot be exhausted. */
2770 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2772 uint64_t q, xor_hash = 0;
2785 ts.monotonic = le64toh(o->entry.monotonic);
2786 ts.realtime = le64toh(o->entry.realtime);
2788 n = journal_file_entry_n_items(o);
2789 /* alloca() can't take 0, hence let's allocate at least one */
2790 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2792 for (i = 0; i < n; i++) {
2799 q = le64toh(o->entry.items[i].object_offset);
2800 le_hash = o->entry.items[i].hash;
/* From here on o refers to the data object, no longer to the entry. */
2802 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2806 if (le_hash != o->data.hash)
/* Payload length is the object size minus the fixed part of a data
 * object. */
2809 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2812 /* We hit the limit on 32bit machines */
2813 if ((uint64_t) t != l)
2816 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2817 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2820 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2821 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2825 data = from->compress_buffer;
2828 return -EPROTONOSUPPORT;
2831 data = o->data.payload;
2833 r = journal_file_append_data(to, data, l, &u, &h);
2837 xor_hash ^= le64toh(u->data.hash);
2838 items[i].object_offset = htole64(h);
2839 items[i].hash = u->data.hash;
/* Map the source entry again, since o was pointed at a data object
 * above. */
2841 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2846 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2848 if (mmap_cache_got_sigbus(to->mmap, to->fd))
/* Completes the size limits in m, replacing every field that is still set to
 * (uint64_t) -1 by a default and enforcing consistency between the fields.
 *
 *   max_use:   10% of the file system holding fd, clamped to the range
 *              DEFAULT_MAX_USE_LOWER to DEFAULT_MAX_USE_UPPER, or
 *              DEFAULT_MAX_USE_LOWER on the fallback path; an explicit value
 *              is page aligned and raised to at least two minimum sized files.
 *   max_size:  one eighth of max_use, capped at DEFAULT_MAX_SIZE_UPPER; an
 *              explicit value is page aligned. It is never below
 *              JOURNAL_FILE_SIZE_MIN, and max_use is raised so that it holds
 *              at least two files of max_size.
 *   min_size:  JOURNAL_FILE_SIZE_MIN unless given; an explicit value is page
 *              aligned and raised to that minimum. max_size is raised to
 *              min_size if it is smaller.
 *   keep_free: 15% of the file system size, capped at
 *              DEFAULT_KEEP_FREE_UPPER, or DEFAULT_KEEP_FREE on the fallback
 *              path.
 *
 * The resulting values are logged at debug level.
 *
 * NOTE(review): the conditions selecting the fallback paths are on lines
 * missing from this excerpt; presumably they apply when fs_size is 0 because
 * fstatvfs failed - confirm.
 * NOTE(review): f_frsize * f_blocks is evaluated in the type of the statvfs
 * fields before it is stored in the uint64_t; confirm this cannot wrap where
 * those fields are narrower than 64 bits. */
2854 void journal_default_metrics(JournalMetrics *m, int fd) {
2855 uint64_t fs_size = 0;
2857 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2862 if (fstatvfs(fd, &ss) >= 0)
2863 fs_size = ss.f_frsize * ss.f_blocks;
2865 if (m->max_use == (uint64_t) -1) {
2868 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2870 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2871 m->max_use = DEFAULT_MAX_USE_UPPER;
2873 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2874 m->max_use = DEFAULT_MAX_USE_LOWER;
2876 m->max_use = DEFAULT_MAX_USE_LOWER;
2878 m->max_use = PAGE_ALIGN(m->max_use);
2880 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2881 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2884 if (m->max_size == (uint64_t) -1) {
2885 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2887 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2888 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2890 m->max_size = PAGE_ALIGN(m->max_size);
2892 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2893 m->max_size = JOURNAL_FILE_SIZE_MIN;
/* The total budget must leave room for at least two files of maximum
 * size. */
2895 if (m->max_size*2 > m->max_use)
2896 m->max_use = m->max_size*2;
2898 if (m->min_size == (uint64_t) -1)
2899 m->min_size = JOURNAL_FILE_SIZE_MIN;
2901 m->min_size = PAGE_ALIGN(m->min_size);
2903 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2904 m->min_size = JOURNAL_FILE_SIZE_MIN;
2906 if (m->min_size > m->max_size)
2907 m->max_size = m->min_size;
2910 if (m->keep_free == (uint64_t) -1) {
2913 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2915 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2916 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2919 m->keep_free = DEFAULT_KEEP_FREE;
2922 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2923 format_bytes(a, sizeof(a), m->max_use),
2924 format_bytes(b, sizeof(b), m->max_size),
2925 format_bytes(c, sizeof(c), m->min_size),
2926 format_bytes(d, sizeof(d), m->keep_free));
/* Reports the realtime range covered by f: *from receives the realtime
 * timestamp of the head entry and *to that of the tail entry, both read from
 * the file header and converted with le64toh. A zero head or tail timestamp is
 * treated as a special case before the respective value is stored.
 *
 * NOTE(review): the checks whether from and to were supplied and all return
 * statements are on lines missing from this excerpt; presumably a zero
 * timestamp results in an error return - confirm. */
2929 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2934 if (f->header->head_entry_realtime == 0)
2937 *from = le64toh(f->header->head_entry_realtime);
2941 if (f->header->tail_entry_realtime == 0)
2944 *to = le64toh(f->header->tail_entry_realtime);
/* Reports the monotonic time range that f covers for the boot boot_id.
 *
 * The _BOOT_ID data object is looked up and its offset kept in p. *from is the
 * monotonic timestamp of the entry at data.entry_offset, the first entry linked
 * from that data object. For *to the data object is mapped again through p,
 * because o has been pointed at an entry in between, and the entry at index
 * n_entries - 1 of its list is fetched.
 *
 * NOTE(review): the return statements, the outcome of the n_entries check and
 * the checks whether from and to were supplied are on lines missing from this
 * excerpt. */
2950 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2958 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
/* n_entries is unsigned, so this tests for a data object without entries. */
2962 if (le64toh(o->data.n_entries) <= 0)
2966 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2970 *from = le64toh(o->entry.monotonic);
2974 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2978 r = generic_array_get_plus_one(f,
2979 le64toh(o->data.entry_offset),
2980 le64toh(o->data.entry_array_offset),
2981 le64toh(o->data.n_entries)-1,
2986 *to = le64toh(o->entry.monotonic);
2992 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2995 /* If we gained new header fields we gained new features,
2996 * hence suggest a rotation */
2997 if (le64toh(f->header->header_size) < sizeof(Header)) {
2998 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3002 /* Let's check if the hash tables grew over a certain fill
3003 * level (75%, borrowing this value from Java's hash table
3004 * implementation), and if so suggest a rotation. To calculate
3005 * the fill level we need the n_data field, which only exists
3006 * in newer versions. */
3008 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3009 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3010 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3012 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3013 le64toh(f->header->n_data),
3014 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3015 (unsigned long long) f->last_stat.st_size,
3016 f->last_stat.st_size / le64toh(f->header->n_data));
3020 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3021 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3022 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3024 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3025 le64toh(f->header->n_fields),
3026 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3030 /* Are the data objects properly indexed by field objects? */
3031 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3032 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3033 le64toh(f->header->n_data) > 0 &&
3034 le64toh(f->header->n_fields) == 0)
3037 if (max_file_usec > 0) {
3040 h = le64toh(f->header->head_entry_realtime);
3041 t = now(CLOCK_REALTIME);
3043 if (h > 0 && t > h + max_file_usec)