1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "journal-authenticate.h"
/* Default sizes in bytes of the data and field hash tables, expressed as a
 * number of HashItem slots */
37 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
38 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
/* Data payloads must be at least this many bytes before compression is
 * attempted when appending data objects */
40 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
42 /* This is the minimum journal file size */
43 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
45 /* These are the lower and upper bounds if we deduce the max_use value
46 * from the file system size */
47 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
48 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
50 /* This is the upper bound if we deduce max_size from max_use */
51 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
53 /* This is the upper bound if we deduce the keep_free value from the file system size */
55 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
57 /* This is the keep_free value when we can't determine the system size */
59 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
61 /* n_data was the first entry we added after the initial file format design */
62 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
64 /* How many entries to keep in the entry array chain cache at max */
65 #define CHAIN_CACHE_MAX 20
67 /* How much to increase the journal file size at once each time we allocate something new. */
68 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
/* Switches the header state to STATE_ONLINE, depending on the state the
 * header is currently in. Requires a valid fd and a mapped header. */
70 static int journal_file_set_online(JournalFile *f) {
76 if (!(f->fd >= 0 && f->header))
79 switch(f->header->state) {
84 f->header->state = STATE_ONLINE;
/* Writes STATE_OFFLINE into the header. A valid fd and a mapped header are
 * required, and the current state is compared against STATE_ONLINE before
 * the new state is stored. */
93 int journal_file_set_offline(JournalFile *f) {
99 if (!(f->fd >= 0 && f->header))
102 if (f->header->state != STATE_ONLINE)
107 f->header->state = STATE_OFFLINE;
/* Tears down a JournalFile: appends a final tag for sealed, writable files,
 * closes the fd in the mmap cache, marks the file offline and then releases
 * the header mapping, the mmap cache reference, the chain cache, the
 * compression buffer, the FSS state and the HMAC handle. */
114 void journal_file_close(JournalFile *f) {
118 /* Write the final tag */
119 if (f->seal && f->writable)
120 journal_file_append_tag(f);
123 /* Sync everything to disk, before we mark the file offline */
124 if (f->mmap && f->fd >= 0)
125 mmap_cache_close_fd(f->mmap, f->fd);
127 journal_file_set_offline(f);
130 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
136 mmap_cache_unref(f->mmap);
138 ordered_hashmap_free_free(f->chain_cache);
140 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
141 free(f->compress_buffer);
146 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
147 else if (f->fsprg_state)
148 free(f->fsprg_state);
153 gcry_md_close(f->hmac);
/* Builds a fresh Header (signature, aligned header size, compression and
 * seal flags, randomized file ID) and writes it to offset 0 of the file
 * with pwrite(). The seqnum ID is either inherited from the template's
 * header (together with its tail_entry_seqnum) or set to the new file ID. */
159 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
166 memcpy(h.signature, HEADER_SIGNATURE, 8);
167 h.header_size = htole64(ALIGN64(sizeof(h)));
169 h.incompatible_flags |= htole32(
170 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
171 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
173 h.compatible_flags = htole32(
174 f->seal * HEADER_COMPATIBLE_SEALED);
176 r = sd_id128_randomize(&h.file_id);
181 h.seqnum_id = template->header->seqnum_id;
182 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
184 h.seqnum_id = h.file_id;
186 k = pwrite(f->fd, &h, sizeof(h), 0);
/* Stamps the header with the current machine ID and boot ID and marks the
 * file online. If the boot ID already stored in the header equals the
 * current one, tail_entry_monotonic_valid is set. */
196 static int journal_file_refresh_header(JournalFile *f) {
202 r = sd_id128_get_machine(&f->header->machine_id);
206 r = sd_id128_get_boot(&boot_id);
210 if (sd_id128_equal(boot_id, f->header->boot_id))
211 f->tail_entry_monotonic_valid = true;
213 f->header->boot_id = boot_id;
215 journal_file_set_online(f);
217 /* Sync the online state to disk */
/* Sanity-checks a mapped header: signature, incompatible flags (always) and
 * compatible flags (only when writable), state range, minimum header size,
 * presence of n_entry_arrays in sealed files, header+arena size against the
 * file size, tail object offset, 64-bit alignment of the stored offsets,
 * the machine ID and the file state. Unsupported flags yield
 * -EPROTONOSUPPORT. Finally the compression and seal settings are copied
 * from the header into the JournalFile. */
223 static int journal_file_verify_header(JournalFile *f) {
228 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
231 /* In both read and write mode we refuse to open files with
232 * incompatible flags we don't know */
233 flags = le32toh(f->header->incompatible_flags);
234 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
235 if (flags & ~HEADER_INCOMPATIBLE_ANY)
236 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
237 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
238 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
240 log_debug("Journal file %s uses incompatible flags %"PRIx32
241 " disabled at compilation time.", f->path, flags);
242 return -EPROTONOSUPPORT;
245 /* When open for writing we refuse to open files with
246 * compatible flags, too */
247 flags = le32toh(f->header->compatible_flags);
248 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
249 if (flags & ~HEADER_COMPATIBLE_ANY)
250 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
251 f->path, flags & ~HEADER_COMPATIBLE_ANY);
252 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
254 log_debug("Journal file %s uses compatible flags %"PRIx32
255 " disabled at compilation time.", f->path, flags);
256 return -EPROTONOSUPPORT;
259 if (f->header->state >= _STATE_MAX)
262 /* The first addition was n_data, so check that we are at least this large */
263 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
266 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
269 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
272 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
275 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
276 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
277 !VALID64(le64toh(f->header->tail_object_offset)) ||
278 !VALID64(le64toh(f->header->entry_array_offset)))
283 sd_id128_t machine_id;
286 r = sd_id128_get_machine(&machine_id);
290 if (!sd_id128_equal(machine_id, f->header->machine_id))
293 state = f->header->state;
295 if (state == STATE_ONLINE) {
296 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
298 } else if (state == STATE_ARCHIVED)
300 else if (state != STATE_OFFLINE) {
301 log_debug("Journal file %s has unknown state %u.", f->path, state);
306 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
307 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
309 f->seal = JOURNAL_HEADER_SEALED(f->header);
/* Grows the file so that the range [offset, offset+size) is backed by disk
 * space. The page-aligned target size is checked against metrics.max_size
 * and, via fstatvfs(), against metrics.keep_free; it is then rounded up to
 * a multiple of FILE_SIZE_INCREASE (capped at max_size) and allocated with
 * posix_fallocate(). Afterwards last_stat is refreshed and the header's
 * arena_size is updated. */
314 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
315 uint64_t old_size, new_size;
320 /* We assume that this file is not sparse, and we know that
321 * for sure, since we always call posix_fallocate()
325 le64toh(f->header->header_size) +
326 le64toh(f->header->arena_size);
328 new_size = PAGE_ALIGN(offset + size);
329 if (new_size < le64toh(f->header->header_size))
330 new_size = le64toh(f->header->header_size);
332 if (new_size <= old_size)
335 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
338 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
341 if (fstatvfs(f->fd, &svfs) >= 0) {
344 available = svfs.f_bfree * svfs.f_bsize;
346 if (available >= f->metrics.keep_free)
347 available -= f->metrics.keep_free;
351 if (new_size - old_size > available)
356 /* Increase by larger blocks at once */
357 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
358 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
359 new_size = f->metrics.max_size;
361 /* Note that the glibc fallocate() fallback is very
362 inefficient, hence we try to minimize the allocation area
364 r = posix_fallocate(f->fd, old_size, new_size - old_size);
368 if (fstat(f->fd, &f->last_stat) < 0)
371 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
/* Maps an object type to an mmap cache context ID: types strictly between
 * OBJECT_UNUSED and _OBJECT_TYPE_MAX map to themselves, anything else to 0. */
376 static unsigned type_to_context(ObjectType type) {
377 /* One context for each type, plus one catch-all for the rest */
378 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
379 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
/* Obtains a mapping of the file range [offset, offset+size) from the mmap
 * cache and returns it in *ret. If the range lies beyond the cached file
 * size, the size is refreshed with fstat(); -EADDRNOTAVAIL is returned if
 * the fstat() fails or the range is still out of bounds. */
382 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
389 /* Avoid SIGBUS on invalid accesses */
390 if (offset + size > (uint64_t) f->last_stat.st_size) {
391 /* Hmm, out of range? Let's refresh the fstat() data
392 * first, before we trust that check. */
394 if (fstat(f->fd, &f->last_stat) < 0 ||
395 offset + size > (uint64_t) f->last_stat.st_size)
396 return -EADDRNOTAVAIL;
399 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
/* Returns the smallest valid size for an object of o's type: the size of
 * the type-specific struct from the table, or sizeof(ObjectHeader) for
 * types that are out of the table's range or have no entry in it. */
402 static uint64_t minimum_header_size(Object *o) {
404 static const uint64_t table[] = {
405 [OBJECT_DATA] = sizeof(DataObject),
406 [OBJECT_FIELD] = sizeof(FieldObject),
407 [OBJECT_ENTRY] = sizeof(EntryObject),
408 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
409 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
410 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
411 [OBJECT_TAG] = sizeof(TagObject),
414 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
415 return sizeof(ObjectHeader);
417 return table[o->object.type];
/* Maps the object located at 'offset'. The offset must be 64-bit aligned.
 * First only the ObjectHeader is mapped so that the stored size and type
 * can be validated: the size must cover the header and the type-specific
 * minimum, the type must not be unused, and it must equal 'type' when a
 * specific type (> OBJECT_UNUSED) was requested. Objects larger than the
 * bare header are then mapped again with their full size. */
420 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
429 /* Objects may only be located at multiple of 64 bit */
430 if (!VALID64(offset))
433 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
438 s = le64toh(o->object.size);
440 if (s < sizeof(ObjectHeader))
443 if (o->object.type <= OBJECT_UNUSED)
446 if (s < minimum_header_size(o))
449 if (type > OBJECT_UNUSED && o->object.type != type)
452 if (s > sizeof(ObjectHeader)) {
453 r = journal_file_move_to(f, type, false, offset, s, &t);
/* Picks the sequence number for the next entry, starting from the header's
 * tail_entry_seqnum + 1, and records it as the new tail_entry_seqnum. If
 * head_entry_seqnum is still 0 it is initialized to the same value. */
464 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
469 r = le64toh(f->header->tail_entry_seqnum) + 1;
472 /* If an external seqnum counter was passed, we update
473 * both the local and the external one, and set it to
474 * the maximum of both */
482 f->header->tail_entry_seqnum = htole64(r);
484 if (f->header->head_entry_seqnum == 0)
485 f->header->head_entry_seqnum = htole64(r);
/* Appends a new object of the given type and size to the end of the file.
 * The file is switched online, the position is derived from the header's
 * tail_object_offset (header_size, or the aligned end of the current tail
 * object), space is allocated, and the new object's type and size are
 * written. The header's tail_object_offset and n_objects are then
 * updated. 'type' must be a real object type and 'size' must cover at
 * least an ObjectHeader. */
490 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
497 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
498 assert(size >= sizeof(ObjectHeader));
502 r = journal_file_set_online(f);
506 p = le64toh(f->header->tail_object_offset);
508 p = le64toh(f->header->header_size);
510 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
514 p += ALIGN64(le64toh(tail->object.size));
517 r = journal_file_allocate(f, p, size);
521 r = journal_file_move_to(f, type, false, p, size, &t);
528 o->object.type = type;
529 o->object.size = htole64(size);
531 f->header->tail_object_offset = htole64(p);
532 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
/* Creates the data hash table object. Its size is derived from
 * metrics.max_size, with DEFAULT_DATA_HASH_TABLE_SIZE as the lower bound.
 * The items are zeroed, and the offset of the items array and the table
 * size are stored in the header. */
540 static int journal_file_setup_data_hash_table(JournalFile *f) {
547 /* We estimate that we need 1 hash table entry per 768 of
548 journal file and we want to make sure we never get beyond
549 75% fill level. Calculate the hash table size for the
550 maximum file size based on these metrics. */
552 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
553 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
554 s = DEFAULT_DATA_HASH_TABLE_SIZE;
556 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
558 r = journal_file_append_object(f,
559 OBJECT_DATA_HASH_TABLE,
560 offsetof(Object, hash_table.items) + s,
565 memzero(o->hash_table.items, s);
567 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
568 f->header->data_hash_table_size = htole64(s);
/* Creates the field hash table object with the fixed size
 * DEFAULT_FIELD_HASH_TABLE_SIZE, zeroes its items and stores the offset of
 * the items array and the table size in the header. */
573 static int journal_file_setup_field_hash_table(JournalFile *f) {
580 /* We use a fixed size hash table for the fields as this
581 * number should grow very slowly only */
583 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
584 r = journal_file_append_object(f,
585 OBJECT_FIELD_HASH_TABLE,
586 offsetof(Object, hash_table.items) + s,
591 memzero(o->hash_table.items, s);
593 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
594 f->header->field_hash_table_size = htole64(s);
/* Maps the data hash table, using the offset and size recorded in the
 * header, and remembers the mapping in f->data_hash_table. */
599 static int journal_file_map_data_hash_table(JournalFile *f) {
606 p = le64toh(f->header->data_hash_table_offset);
607 s = le64toh(f->header->data_hash_table_size);
609 r = journal_file_move_to(f,
610 OBJECT_DATA_HASH_TABLE,
617 f->data_hash_table = t;
/* Maps the field hash table, using the offset and size recorded in the
 * header, and remembers the mapping in f->field_hash_table. */
621 static int journal_file_map_field_hash_table(JournalFile *f) {
628 p = le64toh(f->header->field_hash_table_offset);
629 s = le64toh(f->header->field_hash_table_size);
631 r = journal_file_move_to(f,
632 OBJECT_FIELD_HASH_TABLE,
639 f->field_hash_table = t;
/* Links a field object into the bucket of the field hash table selected by
 * 'hash'. The object's next_hash_offset and head_data_offset are reset,
 * then the object becomes either the bucket head or the successor of the
 * current bucket tail, and in both cases the new bucket tail. n_fields is
 * incremented if the header carries that counter. */
643 static int journal_file_link_field(
656 if (o->object.type != OBJECT_FIELD)
659 /* This might alter the window we are looking at */
661 o->field.next_hash_offset = o->field.head_data_offset = 0;
663 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
664 p = le64toh(f->field_hash_table[h].tail_hash_offset);
666 f->field_hash_table[h].head_hash_offset = htole64(offset);
668 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
672 o->field.next_hash_offset = htole64(offset);
675 f->field_hash_table[h].tail_hash_offset = htole64(offset);
677 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
678 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
/* Links a data object into the bucket of the data hash table selected by
 * 'hash'. The object's chain pointers and entry bookkeeping are reset, then
 * the object becomes either the bucket head or the successor of the
 * current bucket tail, and in both cases the new bucket tail. n_data is
 * incremented if the header carries that counter. */
683 static int journal_file_link_data(
696 if (o->object.type != OBJECT_DATA)
699 /* This might alter the window we are looking at */
701 o->data.next_hash_offset = o->data.next_field_offset = 0;
702 o->data.entry_offset = o->data.entry_array_offset = 0;
703 o->data.n_entries = 0;
705 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
706 p = le64toh(f->data_hash_table[h].tail_hash_offset);
708 /* Only entry in the hash table is easy */
709 f->data_hash_table[h].head_hash_offset = htole64(offset);
711 /* Move back to the previous data object, to patch in
714 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
718 o->data.next_hash_offset = htole64(offset);
721 f->data_hash_table[h].tail_hash_offset = htole64(offset);
723 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
724 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
/* Looks up a field object by name, given its precomputed hash. Walks the
 * hash bucket chain from head_hash_offset via next_hash_offset and matches
 * on stored hash, object size and payload bytes. 'field' must be non-NULL
 * and 'size' non-zero. */
729 int journal_file_find_field_object_with_hash(
731 const void *field, uint64_t size, uint64_t hash,
732 Object **ret, uint64_t *offset) {
734 uint64_t p, osize, h;
738 assert(field && size > 0);
740 osize = offsetof(Object, field.payload) + size;
742 if (f->header->field_hash_table_size == 0)
745 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
746 p = le64toh(f->field_hash_table[h].head_hash_offset);
751 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
755 if (le64toh(o->field.hash) == hash &&
756 le64toh(o->object.size) == osize &&
757 memcmp(o->field.payload, field, size) == 0) {
767 p = le64toh(o->field.next_hash_offset);
/* Convenience wrapper: hashes the field name with hash64() and delegates
 * to journal_file_find_field_object_with_hash(). */
773 int journal_file_find_field_object(
775 const void *field, uint64_t size,
776 Object **ret, uint64_t *offset) {
781 assert(field && size > 0);
783 hash = hash64(field, size);
785 return journal_file_find_field_object_with_hash(f,
/* Looks up a data object by payload, given its precomputed hash. Walks the
 * hash bucket chain from head_hash_offset via next_hash_offset. Objects
 * with a compression flag set are decompressed into f->compress_buffer
 * before comparison; uncompressed objects are matched on object size and
 * payload bytes. The compressed branch also contains a -EPROTONOSUPPORT
 * return. 'data' may only be NULL when 'size' is 0. */
790 int journal_file_find_data_object_with_hash(
792 const void *data, uint64_t size, uint64_t hash,
793 Object **ret, uint64_t *offset) {
795 uint64_t p, osize, h;
799 assert(data || size == 0);
801 osize = offsetof(Object, data.payload) + size;
803 if (f->header->data_hash_table_size == 0)
806 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
807 p = le64toh(f->data_hash_table[h].head_hash_offset);
812 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
816 if (le64toh(o->data.hash) != hash)
819 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
820 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
824 l = le64toh(o->object.size);
825 if (l <= offsetof(Object, data.payload))
828 l -= offsetof(Object, data.payload);
830 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
831 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
836 memcmp(f->compress_buffer, data, size) == 0) {
847 return -EPROTONOSUPPORT;
849 } else if (le64toh(o->object.size) == osize &&
850 memcmp(o->data.payload, data, size) == 0) {
862 p = le64toh(o->data.next_hash_offset);
/* Convenience wrapper: hashes the payload with hash64() and delegates to
 * journal_file_find_data_object_with_hash(). */
868 int journal_file_find_data_object(
870 const void *data, uint64_t size,
871 Object **ret, uint64_t *offset) {
876 assert(data || size == 0);
878 hash = hash64(data, size);
880 return journal_file_find_data_object_with_hash(f,
/* Makes sure a field object for the given field name exists. The name is
 * hashed and looked up first. A new OBJECT_FIELD object is appended with
 * the hash and the payload copied in, linked into the field hash table,
 * re-mapped (linking may move the mmap window) and fed into the HMAC. */
885 static int journal_file_append_field(
887 const void *field, uint64_t size,
888 Object **ret, uint64_t *offset) {
896 assert(field && size > 0);
898 hash = hash64(field, size);
900 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
914 osize = offsetof(Object, field.payload) + size;
915 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
919 o->field.hash = htole64(hash);
920 memcpy(o->field.payload, field, size);
922 r = journal_file_link_field(f, o, p, hash);
926 /* The linking might have altered the window, so let's
927 * refresh our pointer */
928 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
933 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
/* Makes sure a data object for the given payload exists. The payload is
 * hashed and looked up first. A new OBJECT_DATA object is appended and its
 * hash stored. If compression applies (compress_xz set and size at least
 * COMPRESSION_SIZE_THRESHOLD) the payload is compressed in place and the
 * object size and flags adjusted; otherwise the payload is copied as is.
 * The object is then linked into the data hash table and re-mapped. If the
 * payload contains a '=' after its first byte, the part before it is
 * registered as a field object and this data object is put at the head of
 * that field's data list. Finally the object is fed into the HMAC. */
947 static int journal_file_append_data(
949 const void *data, uint64_t size,
950 Object **ret, uint64_t *offset) {
955 int r, compression = 0;
959 assert(data || size == 0);
961 hash = hash64(data, size);
963 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
977 osize = offsetof(Object, data.payload) + size;
978 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
982 o->data.hash = htole64(hash);
984 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
985 if (f->compress_xz &&
986 size >= COMPRESSION_SIZE_THRESHOLD) {
989 compression = compress_blob(data, size, o->data.payload, &rsize);
992 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
993 o->object.flags |= compression;
995 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
996 size, rsize, object_compressed_to_string(compression));
1001 if (!compression && size > 0)
1002 memcpy(o->data.payload, data, size);
1004 r = journal_file_link_data(f, o, p, hash);
1008 /* The linking might have altered the window, so let's
1009 * refresh our pointer */
1010 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1017 eq = memchr(data, '=', size);
1018 if (eq && eq > data) {
1022 /* Create field object ... */
1023 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1027 /* ... and link it in. */
1028 o->data.next_field_offset = fo->field.head_data_offset;
/* NOTE(review): p is a host-order offset being stored into an on-disk
 * field; every other such store in this file uses htole64(), so
 * le64toh() here looks like a typo for htole64() (identical on
 * little-endian hosts) -- confirm and fix. */
1029 fo->field.head_data_offset = le64toh(p);
1033 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
/* Number of EntryItem slots in an entry object, computed from the stored
 * object size. The object type is checked against OBJECT_ENTRY first. */
1047 uint64_t journal_file_entry_n_items(Object *o) {
1050 if (o->object.type != OBJECT_ENTRY)
1053 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
/* Number of 64-bit offset slots in an entry array object, computed from the
 * stored object size. The object type is checked against
 * OBJECT_ENTRY_ARRAY first. */
1056 uint64_t journal_file_entry_array_n_items(Object *o) {
1059 if (o->object.type != OBJECT_ENTRY_ARRAY)
1062 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
/* Number of HashItem buckets in a hash table object, computed from the
 * stored object size. The object type is checked against the data and
 * field hash table types first. */
1065 uint64_t journal_file_hash_table_n_items(Object *o) {
1068 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1069 o->object.type != OBJECT_FIELD_HASH_TABLE)
1072 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
/* Stores entry offset p at index *idx of the entry array chain starting at
 * *first. The chain is walked array by array; if the slot exists, p is
 * written there and *idx is incremented. Otherwise a new entry array
 * object is appended, fed into the HMAC, filled with p and hooked up
 * either as the new *first or as the successor of the previous array
 * (ap). n_entry_arrays is bumped when the header carries it, and *idx is
 * incremented. *first and *idx are little-endian on-disk values. */
1075 static int link_entry_into_array(JournalFile *f,
1080 uint64_t n = 0, ap = 0, q, i, a, hidx;
1088 a = le64toh(*first);
1089 i = hidx = le64toh(*idx);
1092 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1096 n = journal_file_entry_array_n_items(o);
1098 o->entry_array.items[i] = htole64(p);
1099 *idx = htole64(hidx + 1);
1105 a = le64toh(o->entry_array.next_entry_array_offset);
1116 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1117 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1123 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1128 o->entry_array.items[i] = htole64(p);
1131 *first = htole64(q);
1133 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1137 o->entry_array.next_entry_array_offset = htole64(q);
1140 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1141 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1143 *idx = htole64(hidx + 1);
/* Variant of link_entry_into_array() for lists that keep one entry inline
 * in *extra in addition to the array chain: p is stored either in *extra
 * or in the chain at index *idx - 1, and *idx is incremented. */
1148 static int link_entry_into_array_plus_one(JournalFile *f,
1163 *extra = htole64(p);
1167 i = htole64(le64toh(*idx) - 1);
1168 r = link_entry_into_array(f, first, &i, p);
1173 *idx = htole64(le64toh(*idx) + 1);
/* Links an entry into the per-data entry list of the data object that the
 * entry's item i refers to. */
1177 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1184 p = le64toh(o->entry.items[i].object_offset);
1188 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1192 return link_entry_into_array_plus_one(f,
1193 &o->data.entry_offset,
1194 &o->data.entry_array_offset,
/* Hooks a freshly written entry object into the file: after a memory
 * barrier it is added to the global entry array chain, the header's
 * head/tail realtime and tail monotonic timestamps are updated, and the
 * entry is then linked into every data object it references. */
1199 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1207 if (o->object.type != OBJECT_ENTRY)
1210 __sync_synchronize();
1212 /* Link up the entry itself */
1213 r = link_entry_into_array(f,
1214 &f->header->entry_array_offset,
1215 &f->header->n_entries,
1220 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1222 if (f->header->head_entry_realtime == 0)
1223 f->header->head_entry_realtime = o->entry.realtime;
1225 f->header->tail_entry_realtime = o->entry.realtime;
1226 f->header->tail_entry_monotonic = o->entry.monotonic;
1228 f->tail_entry_monotonic_valid = true;
1230 /* Link up the items */
1231 n = journal_file_entry_n_items(o);
1232 for (i = 0; i < n; i++) {
1233 r = journal_file_link_entry_item(f, o, offset, i);
/* Writes an entry object for already-appended data items: allocates the
 * OBJECT_ENTRY object, fills in seqnum, items, timestamps, xor_hash and
 * the header's boot ID, feeds it into the HMAC and links it into the
 * file. 'items' may only be NULL when n_items is 0. */
1241 static int journal_file_append_entry_internal(
1243 const dual_timestamp *ts,
1245 const EntryItem items[], unsigned n_items,
1247 Object **ret, uint64_t *offset) {
1254 assert(items || n_items == 0);
1257 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1259 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1263 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1264 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1265 o->entry.realtime = htole64(ts->realtime);
1266 o->entry.monotonic = htole64(ts->monotonic);
1267 o->entry.xor_hash = htole64(xor_hash);
1268 o->entry.boot_id = f->header->boot_id;
1271 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1276 r = journal_file_link_entry(f, o, np);
/* Notifies inotify watchers that the journal changed: issues a memory
 * barrier and then ftruncate()s the file to its cached size. A failing
 * ftruncate() is only logged. */
1289 void journal_file_post_change(JournalFile *f) {
1292 /* inotify() does not receive IN_MODIFY events from file
1293 * accesses done via mmap(). After each access we hence
1294 * trigger IN_MODIFY by truncating the journal file to its
1295 * current size which triggers IN_MODIFY. */
1297 __sync_synchronize();
1299 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1300 log_error_errno(errno, "Failed to truncate file to its own size: %m");
/* qsort()-style comparator ordering EntryItems by their (little-endian
 * stored) object_offset. */
1303 static int entry_item_cmp(const void *_a, const void *_b) {
1304 const EntryItem *a = _a, *b = _b;
1306 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1308 if (le64toh(a->object_offset) > le64toh(b->object_offset))
/* Appends a complete log entry made of n_iovec fields. A local timestamp
 * (_ts) is obtained with dual_timestamp_get(), and the monotonic timestamp
 * is compared against the header's tail_entry_monotonic while
 * tail_entry_monotonic_valid is set. A tag may be appended first. Every
 * iovec is stored as a data object, with its hash XORed into xor_hash and
 * its offset recorded as an entry item. The items are sorted by on-disk
 * offset, the entry object is written, and journal_file_post_change() is
 * called. 'iovec' may only be NULL when n_iovec is 0. */
1313 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1317 uint64_t xor_hash = 0;
1318 struct dual_timestamp _ts;
1321 assert(iovec || n_iovec == 0);
1324 dual_timestamp_get(&_ts);
1328 if (f->tail_entry_monotonic_valid &&
1329 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1333 r = journal_file_maybe_append_tag(f, ts->realtime);
1338 /* alloca() can't take 0, hence let's allocate at least one */
1339 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1341 for (i = 0; i < n_iovec; i++) {
1345 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1349 xor_hash ^= le64toh(o->data.hash);
1350 items[i].object_offset = htole64(p);
1351 items[i].hash = o->data.hash;
1354 /* Order by the position on disk, in order to improve seek
1355 * times for rotating media. */
1356 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1358 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1360 journal_file_post_change(f);
/* One cached position inside an entry array chain, keyed by the offset of
 * the chain's first array. Lets lookups and bisections resume at a later
 * array instead of walking the chain from its start. */
1365 typedef struct ChainCacheItem {
1366 uint64_t first; /* the array at the beginning of the chain */
1367 uint64_t array; /* the cached array */
1368 uint64_t begin; /* the first item in the cached array */
1369 uint64_t total; /* the total number of items in all arrays before this one in the chain */
1370 uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
/* Records a chain position in the chain cache. When the cache already holds
 * CHAIN_CACHE_MAX items, the first item of the ordered hashmap is stolen;
 * a ChainCacheItem is allocated with new() and inserted keyed by its
 * 'first' field. An existing item must belong to the same chain (asserted).
 * last_index is stored in the item. */
1373 static void chain_cache_put(
1380 uint64_t last_index) {
1383 /* If the chain item to cache for this chain is the
1384 * first one it's not worth caching anything */
1388 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1389 ci = ordered_hashmap_steal_first(h);
1392 ci = new(ChainCacheItem, 1);
1399 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1404 assert(ci->first == first);
1409 ci->last_index = last_index;
/* Fetches the i-th entry of the entry array chain starting at 'first'. The
 * chain cache is consulted first (usable when i lies beyond the cached
 * 'total'), then the chain is walked via next_entry_array_offset. The
 * entry offset is read from items[i], the position is cached, and the
 * entry object is mapped. */
1412 static int generic_array_get(
1416 Object **ret, uint64_t *offset) {
1419 uint64_t p = 0, a, t = 0;
1427 /* Try the chain cache first */
1428 ci = ordered_hashmap_get(f->chain_cache, &first);
1429 if (ci && i > ci->total) {
1438 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1442 k = journal_file_entry_array_n_items(o);
1444 p = le64toh(o->entry_array.items[i]);
1450 a = le64toh(o->entry_array.next_entry_array_offset);
1456 /* Let's cache this item for the next invocation */
1457 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1459 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Like generic_array_get(), but for lists that keep one entry inline
 * ('extra') in addition to the array chain: either the entry at 'extra' is
 * mapped, or the lookup is forwarded to the chain with index i-1. */
1472 static int generic_array_get_plus_one(
1477 Object **ret, uint64_t *offset) {
1486 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1499 return generic_array_get(f, first, i-1, ret, offset);
/* Bisects the entry array chain starting at 'first' for 'needle'. The
 * test_object callback classifies a candidate entry offset as TEST_FOUND,
 * TEST_LEFT or TEST_RIGHT; a TEST_FOUND result is mapped to TEST_RIGHT for
 * DIRECTION_DOWN and to TEST_LEFT otherwise. The chain cache is used to
 * skip ahead to a previously visited array, and a cached last_index is
 * used to probe the immediate neighbors before a full bisection. On
 * success the matching entry object is mapped and its overall index
 * (t + i, minus one if subtract_one is set) is stored in *idx. */
1508 static int generic_array_bisect(
1513 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1514 direction_t direction,
1519 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1520 bool subtract_one = false;
1521 Object *o, *array = NULL;
1526 assert(test_object);
1528 /* Start with the first array in the chain */
1531 ci = ordered_hashmap_get(f->chain_cache, &first);
1532 if (ci && n > ci->total) {
1533 /* Ah, we have iterated this bisection array chain
1534 * previously! Let's see if we can skip ahead in the
1535 * chain, as far as the last time. But we can't jump
1536 * backwards in the chain, so let's check that
1539 r = test_object(f, ci->begin, needle);
1543 if (r == TEST_LEFT) {
1544 /* OK, what we are looking for is right of the
1545 * begin of this EntryArray, so let's jump
1546 * straight to previously cached array in the
1552 last_index = ci->last_index;
1557 uint64_t left, right, k, lp;
1559 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1563 k = journal_file_entry_array_n_items(array);
1569 lp = p = le64toh(array->entry_array.items[i]);
1573 r = test_object(f, p, needle);
1577 if (r == TEST_FOUND)
1578 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1580 if (r == TEST_RIGHT) {
1584 if (last_index != (uint64_t) -1) {
1585 assert(last_index <= right);
1587 /* If we cached the last index we
1588 * looked at, let's try to not to jump
1589 * too wildly around and see if we can
1590 * limit the range to look at early to
1591 * the immediate neighbors of the last
1592 * index we looked at. */
1594 if (last_index > 0) {
1595 uint64_t x = last_index - 1;
1597 p = le64toh(array->entry_array.items[x]);
1601 r = test_object(f, p, needle);
1605 if (r == TEST_FOUND)
1606 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1608 if (r == TEST_RIGHT)
1614 if (last_index < right) {
1615 uint64_t y = last_index + 1;
1617 p = le64toh(array->entry_array.items[y]);
1621 r = test_object(f, p, needle);
1625 if (r == TEST_FOUND)
1626 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1628 if (r == TEST_RIGHT)
1636 if (left == right) {
1637 if (direction == DIRECTION_UP)
1638 subtract_one = true;
1644 assert(left < right);
1645 i = (left + right) / 2;
1647 p = le64toh(array->entry_array.items[i]);
1651 r = test_object(f, p, needle);
1655 if (r == TEST_FOUND)
1656 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1658 if (r == TEST_RIGHT)
1666 if (direction == DIRECTION_UP) {
1668 subtract_one = true;
1679 last_index = (uint64_t) -1;
1680 a = le64toh(array->entry_array.next_entry_array_offset);
1686 if (subtract_one && t == 0 && i == 0)
1689 /* Let's cache this item for the next invocation */
1690 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1692 if (subtract_one && i == 0)
1694 else if (subtract_one)
1695 p = le64toh(array->entry_array.items[i-1]);
1697 p = le64toh(array->entry_array.items[i]);
1699 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1710 *idx = t + i + (subtract_one ? -1 : 0);
/* Like generic_array_bisect(), but for lists that keep one entry inline
 * ('extra') in addition to the array chain. 'extra' is tested first, with
 * TEST_FOUND mapped to a direction-dependent result as in
 * generic_array_bisect(). The chain is then bisected with n-1 items; for
 * DIRECTION_UP the 'extra' entry is remembered as a fallback (step_back)
 * in case the chain yields nothing. */
1716 static int generic_array_bisect_plus_one(
1722 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1723 direction_t direction,
1729 bool step_back = false;
1733 assert(test_object);
1738 /* This bisects the array in object 'first', but first checks
1740 r = test_object(f, extra, needle);
1744 if (r == TEST_FOUND)
1745 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1747 /* if we are looking with DIRECTION_UP then we need to first
1748 see if in the actual array there is a matching entry, and
1749 return the last one of that. But if there isn't any we need
1750 to return this one. Hence remember this, and return it
1753 step_back = direction == DIRECTION_UP;
1755 if (r == TEST_RIGHT) {
1756 if (direction == DIRECTION_DOWN)
1762 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1764 if (r == 0 && step_back)
1773 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
/* Bisection predicate comparing the candidate offset p itself against
 * needle. */
1789 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1795 else if (p < needle)
/* Bisection predicate comparing the seqnum of the entry object at offset p
 * against needle. */
1801 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1808 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1812 if (le64toh(o->entry.seqnum) == needle)
1814 else if (le64toh(o->entry.seqnum) < needle)
/* Seeks to an entry by sequence number by bisecting the global entry array
 * chain recorded in the header. */
1820 int journal_file_move_to_entry_by_seqnum(
1823 direction_t direction,
1827 return generic_array_bisect(f,
1828 le64toh(f->header->entry_array_offset),
1829 le64toh(f->header->n_entries),
/* Bisection predicate comparing the realtime timestamp of the entry object
 * at offset p against needle. */
1836 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1843 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1847 if (le64toh(o->entry.realtime) == needle)
1849 else if (le64toh(o->entry.realtime) < needle)
/* Seeks to an entry by realtime timestamp by bisecting the global entry
 * array chain recorded in the header with test_object_realtime(). */
1855 int journal_file_move_to_entry_by_realtime(
1858 direction_t direction,
1862 return generic_array_bisect(f,
1863 le64toh(f->header->entry_array_offset),
1864 le64toh(f->header->n_entries),
1866 test_object_realtime,
/* Bisection predicate comparing the monotonic timestamp of the entry object
 * at offset p against needle. */
1871 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1878 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1882 if (le64toh(o->entry.monotonic) == needle)
1884 else if (le64toh(o->entry.monotonic) < needle)
/* Looks up the data object for "_BOOT_ID=<id>": formats the boot ID right
 * after the 9-byte "_BOOT_ID=" prefix and searches for the resulting
 * string (without its trailing NUL). */
1890 static inline int find_data_object_by_boot_id(
1895 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1897 sd_id128_to_string(boot_id, t + 9);
1898 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
/* Seeks to an entry by monotonic timestamp within one boot. First the
 * "_BOOT_ID=" data object for boot_id is looked up (its offset is not
 * needed, hence NULL), then the entries referencing that data object
 * (entry_offset, entry_array_offset, n_entries) are bisected with
 * test_object_monotonic. The bisection result is returned unchanged. */
1901 int journal_file_move_to_entry_by_monotonic(
1905 direction_t direction,
1914 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1920 return generic_array_bisect_plus_one(f,
1921 le64toh(o->data.entry_offset),
1922 le64toh(o->data.entry_array_offset),
1923 le64toh(o->data.n_entries),
1925 test_object_monotonic,
/* Resets the saved read location of f: the location type becomes
 * LOCATION_HEAD and all cached values describing the current entry
 * (offset, seqnum, realtime, monotonic, boot ID, xor hash) are
 * cleared. */
1930 void journal_file_reset_location(JournalFile *f) {
1931 f->location_type = LOCATION_HEAD;
1932 f->current_offset = 0;
1933 f->current_seqnum = 0;
1934 f->current_realtime = 0;
1935 f->current_monotonic = 0;
1936 zero(f->current_boot_id);
1937 f->current_xor_hash = 0;
/* Remembers entry object o, located at offset, as the current location
 * of f. The direction is stored in last_direction, the location type
 * becomes LOCATION_SEEK, and the entry's seqnum, realtime, monotonic
 * and xor_hash fields are cached in host byte order. The boot ID is
 * copied as is. */
1940 void journal_file_save_location(JournalFile *f, direction_t direction, Object *o, uint64_t offset) {
1941 f->last_direction = direction;
1942 f->location_type = LOCATION_SEEK;
1943 f->current_offset = offset;
1944 f->current_seqnum = le64toh(o->entry.seqnum);
1945 f->current_realtime = le64toh(o->entry.realtime);
1946 f->current_monotonic = le64toh(o->entry.monotonic);
1947 f->current_boot_id = o->entry.boot_id;
1948 f->current_xor_hash = le64toh(o->entry.xor_hash);
/* Orders the saved locations of two journal files. Both files must be
 * at LOCATION_SEEK (asserted). Only the cached current_* values and
 * the headers' seqnum_id are consulted. The checks run in this order:
 *
 *   1. same boot ID, monotonic, realtime and xor hash: same entry
 *   2. same seqnum_id in both headers: compare the seqnums
 *   3. same boot ID: compare the monotonic timestamps
 *   4. compare the realtime timestamps
 *   5. compare the xor hashes
 *
 * NOTE(review): the result is presumably negative/zero/positive in the
 * usual comparator fashion -- confirm against the return statements. */
1951 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
1954 assert(af->location_type == LOCATION_SEEK);
1955 assert(bf->location_type == LOCATION_SEEK);
1957 /* If contents and timestamps match, these entries are
1958 * identical, even if the seqnum does not match */
1959 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
1960 af->current_monotonic == bf->current_monotonic &&
1961 af->current_realtime == bf->current_realtime &&
1962 af->current_xor_hash == bf->current_xor_hash)
1965 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
1967 /* If this is from the same seqnum source, compare
1969 if (af->current_seqnum < bf->current_seqnum)
1971 if (af->current_seqnum > bf->current_seqnum)
1974 /* Wow! This is weird, different data but the same
1975 * seqnums? Something is borked, but let's make the
1976 * best of it and compare by time. */
1979 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
1981 /* If the boot id matches, compare monotonic time */
1982 if (af->current_monotonic < bf->current_monotonic)
1984 if (af->current_monotonic > bf->current_monotonic)
1988 /* Otherwise, compare UTC time */
1989 if (af->current_realtime < bf->current_realtime)
1991 if (af->current_realtime > bf->current_realtime)
1994 /* Finally, compare by contents */
1995 if (af->current_xor_hash < bf->current_xor_hash)
1997 if (af->current_xor_hash > bf->current_xor_hash)
/* Steps through the file-wide entry array (header entry_array_offset,
 * n_entries) in the given direction. The array index is either the
 * first element (DIRECTION_DOWN) or the last one (DIRECTION_UP), or it
 * is found by bisecting the entry array and then adjusted according to
 * the direction. The entry at that index is fetched with
 * generic_array_get(). If the offset found does not lie beyond p in the
 * direction of travel, the entry array is reported as corrupted via
 * log_debug().
 * NOTE(review): the first/last index is presumably used when no
 * current position is given -- confirm. */
2003 int journal_file_next_entry(
2006 direction_t direction,
2007 Object **ret, uint64_t *offset) {
2014 n = le64toh(f->header->n_entries);
2019 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2021 r = generic_array_bisect(f,
2022 le64toh(f->header->entry_array_offset),
2023 le64toh(f->header->n_entries),
2032 if (direction == DIRECTION_DOWN) {
2045 /* And jump to it */
2046 r = generic_array_get(f,
2047 le64toh(f->header->entry_array_offset),
2054 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2055 log_debug("%s: entry array corrupted at entry %"PRIu64,
/* Like journal_file_next_entry(), but walks only the entries that
 * reference the data object at data_offset. An entry object o may only
 * be passed together with a non-zero offset p (asserted). The data
 * object is mapped to read its entry count. The index is then either
 * the first/last element depending on direction, or is found by
 * bisecting the data object's entry list (entry_offset,
 * entry_array_offset, n_entries). o is checked to be of type
 * OBJECT_ENTRY. The final lookup is done by
 * generic_array_get_plus_one(), whose result is returned unchanged. */
2066 int journal_file_next_entry_for_data(
2068 Object *o, uint64_t p,
2069 uint64_t data_offset,
2070 direction_t direction,
2071 Object **ret, uint64_t *offset) {
2078 assert(p > 0 || !o);
2080 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2084 n = le64toh(d->data.n_entries);
2089 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2091 if (o->object.type != OBJECT_ENTRY)
2094 r = generic_array_bisect_plus_one(f,
2095 le64toh(d->data.entry_offset),
2096 le64toh(d->data.entry_array_offset),
2097 le64toh(d->data.n_entries),
2107 if (direction == DIRECTION_DOWN) {
2121 return generic_array_get_plus_one(f,
2122 le64toh(d->data.entry_offset),
2123 le64toh(d->data.entry_array_offset),
/* Seeks by entry offset among the entries that reference the data
 * object at data_offset. The data object is mapped first, then its
 * entry list (entry_offset, entry_array_offset, n_entries) is bisected.
 * The bisection result is returned unchanged.
 * NOTE(review): the comparison callback is presumably
 * test_object_offset -- confirm. */
2128 int journal_file_move_to_entry_by_offset_for_data(
2130 uint64_t data_offset,
2132 direction_t direction,
2133 Object **ret, uint64_t *offset) {
2140 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2144 return generic_array_bisect_plus_one(f,
2145 le64toh(d->data.entry_offset),
2146 le64toh(d->data.entry_array_offset),
2147 le64toh(d->data.n_entries),
/* Seeks by monotonic timestamp among the entries that reference the
 * data object at data_offset. Two entry lists are involved: the one of
 * the "_BOOT_ID=" data object for boot_id (its offset is kept in b) and
 * the one of the data object at data_offset. The boot ID list is
 * bisected by time first. After that both lists are bisected in turn
 * until an entry is found that is contained in both of them.
 * Each data object is mapped again with journal_file_move_to_object()
 * right before its list is bisected.
 * NOTE(review): the re-mapping is presumably needed because an earlier
 * mapping may no longer be valid after other objects were accessed --
 * confirm. */
2154 int journal_file_move_to_entry_by_monotonic_for_data(
2156 uint64_t data_offset,
2159 direction_t direction,
2160 Object **ret, uint64_t *offset) {
2168 /* First, seek by time */
2169 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2175 r = generic_array_bisect_plus_one(f,
2176 le64toh(o->data.entry_offset),
2177 le64toh(o->data.entry_array_offset),
2178 le64toh(o->data.n_entries),
2180 test_object_monotonic,
2186 /* And now, continue seeking until we find an entry that
2187 * exists in both bisection arrays */
2193 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2197 r = generic_array_bisect_plus_one(f,
2198 le64toh(d->data.entry_offset),
2199 le64toh(d->data.entry_array_offset),
2200 le64toh(d->data.n_entries),
2208 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2212 r = generic_array_bisect_plus_one(f,
2213 le64toh(o->data.entry_offset),
2214 le64toh(o->data.entry_array_offset),
2215 le64toh(o->data.n_entries),
/* Seeks by sequence number among the entries that reference the data
 * object at data_offset. The data object is mapped first, then its
 * entry list (entry_offset, entry_array_offset, n_entries) is bisected.
 * The bisection result is returned unchanged.
 * NOTE(review): the comparison callback is presumably
 * test_object_seqnum -- confirm. */
2237 int journal_file_move_to_entry_by_seqnum_for_data(
2239 uint64_t data_offset,
2241 direction_t direction,
2242 Object **ret, uint64_t *offset) {
2249 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2253 return generic_array_bisect_plus_one(f,
2254 le64toh(d->data.entry_offset),
2255 le64toh(d->data.entry_array_offset),
2256 le64toh(d->data.n_entries),
/* Seeks by wallclock timestamp among the entries that reference the
 * data object at data_offset. The data object is mapped first, then its
 * entry list (entry_offset, entry_array_offset, n_entries) is bisected
 * with test_object_realtime. The bisection result is returned
 * unchanged. */
2263 int journal_file_move_to_entry_by_realtime_for_data(
2265 uint64_t data_offset,
2267 direction_t direction,
2268 Object **ret, uint64_t *offset) {
2275 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2279 return generic_array_bisect_plus_one(f,
2280 le64toh(d->data.entry_offset),
2281 le64toh(d->data.entry_array_offset),
2282 le64toh(d->data.n_entries),
2284 test_object_realtime,
/* Debugging aid: prints the file header via journal_file_print_header()
 * and then walks over all objects, writing one line per object to
 * stdout. The walk starts right behind the header (header_size). For
 * every object its type is printed (entries and tags with some of their
 * fields), followed by the compression flags if any are set. The walk
 * ends at the object located at tail_object_offset. Otherwise it
 * continues at the next 64bit aligned position behind the current
 * object. "File corrupt" is logged with log_error().
 * NOTE(review): OBJECT_UNUSED passed to journal_file_move_to_object()
 * presumably means "accept any object type", and the error message is
 * presumably only reached when mapping an object fails -- confirm. */
2289 void journal_file_dump(JournalFile *f) {
2296 journal_file_print_header(f);
2298 p = le64toh(f->header->header_size);
2300 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2304 switch (o->object.type) {
2307 printf("Type: OBJECT_UNUSED\n");
2311 printf("Type: OBJECT_DATA\n");
2315 printf("Type: OBJECT_FIELD\n");
2319 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2320 le64toh(o->entry.seqnum),
2321 le64toh(o->entry.monotonic),
2322 le64toh(o->entry.realtime));
2325 case OBJECT_FIELD_HASH_TABLE:
2326 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2329 case OBJECT_DATA_HASH_TABLE:
2330 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2333 case OBJECT_ENTRY_ARRAY:
2334 printf("Type: OBJECT_ENTRY_ARRAY\n");
2338 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2339 le64toh(o->tag.seqnum),
2340 le64toh(o->tag.epoch));
2344 printf("Type: unknown (%u)\n", o->object.type);
2348 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2349 printf("Flags: %s\n",
2350 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2352 if (p == le64toh(f->header->tail_object_offset))
2355 p = p + ALIGN64(le64toh(o->object.size));
2360 log_error("File corrupt");
/* Thin wrapper around format_timestamp() that formats t into the
 * caller supplied buffer buf of size l.
 * NOTE(review): judging by the name this presumably substitutes a
 * placeholder string when format_timestamp() fails, so that the result
 * can be handed to printf() unconditionally -- confirm. */
2363 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2366 x = format_timestamp(buf, l, t);
/* Prints a human readable summary of the header of f to stdout.
 *
 * The first printf() emits the fields every header has. Its arguments
 * must stay in exactly the order of the conversions in the format
 * string. The four 128bit IDs are formatted into the 33 byte buffers a
 * to d, the timestamps into x, y and z. The state is shown as OFFLINE,
 * ONLINE, ARCHIVED or UNKNOWN. Flag bits outside of
 * HEADER_COMPATIBLE_ANY and HEADER_INCOMPATIBLE_ANY are shown as
 * " ???". The hash table sizes are stored in bytes and divided by
 * sizeof(HashItem) so that item counts are shown.
 *
 * Fields that were added to the header later (n_data, n_fields, n_tags,
 * n_entry_arrays) are only printed if JOURNAL_HEADER_CONTAINS() says
 * the header has them. The hash table fill levels are the number of
 * data or field objects relative to the number of hash table items, in
 * percent.
 *
 * Finally, if fstat() on the file descriptor succeeds, the disk usage
 * is shown, calculated as st_blocks multiplied by 512. */
2372 void journal_file_print_header(JournalFile *f) {
2373 char a[33], b[33], c[33], d[33];
2374 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2376 char bytes[FORMAT_BYTES_MAX];
2380 printf("File Path: %s\n"
2384 "Sequential Number ID: %s\n"
2386 "Compatible Flags:%s%s\n"
2387 "Incompatible Flags:%s%s%s\n"
2388 "Header size: %"PRIu64"\n"
2389 "Arena size: %"PRIu64"\n"
2390 "Data Hash Table Size: %"PRIu64"\n"
2391 "Field Hash Table Size: %"PRIu64"\n"
2392 "Rotate Suggested: %s\n"
2393 "Head Sequential Number: %"PRIu64"\n"
2394 "Tail Sequential Number: %"PRIu64"\n"
2395 "Head Realtime Timestamp: %s\n"
2396 "Tail Realtime Timestamp: %s\n"
2397 "Tail Monotonic Timestamp: %s\n"
2398 "Objects: %"PRIu64"\n"
2399 "Entry Objects: %"PRIu64"\n",
2401 sd_id128_to_string(f->header->file_id, a),
2402 sd_id128_to_string(f->header->machine_id, b),
2403 sd_id128_to_string(f->header->boot_id, c),
2404 sd_id128_to_string(f->header->seqnum_id, d),
2405 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2406 f->header->state == STATE_ONLINE ? "ONLINE" :
2407 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2408 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2409 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2410 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2411 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2412 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2413 le64toh(f->header->header_size),
2414 le64toh(f->header->arena_size),
2415 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2416 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2417 yes_no(journal_file_rotate_suggested(f, 0)),
2418 le64toh(f->header->head_entry_seqnum),
2419 le64toh(f->header->tail_entry_seqnum),
2420 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2421 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2422 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2423 le64toh(f->header->n_objects),
2424 le64toh(f->header->n_entries));
2426 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2427 printf("Data Objects: %"PRIu64"\n"
2428 "Data Hash Table Fill: %.1f%%\n",
2429 le64toh(f->header->n_data),
2430 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2432 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2433 printf("Field Objects: %"PRIu64"\n"
2434 "Field Hash Table Fill: %.1f%%\n",
2435 le64toh(f->header->n_fields),
2436 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2438 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2439 printf("Tag Objects: %"PRIu64"\n",
2440 le64toh(f->header->n_tags));
2441 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2442 printf("Entry Array Objects: %"PRIu64"\n",
2443 le64toh(f->header->n_entry_arrays));
2445 if (fstat(f->fd, &st) >= 0)
2446 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
/* Opens the journal file fname, creating and initializing it if it is
 * empty and opened writable.
 *
 * The steps, in order:
 *
 *   - The access mode in flags is checked against O_RDONLY and O_RDWR,
 *     and fname against the suffixes ".journal" and ".journal~".
 *   - A zeroed JournalFile is allocated. The compress parameter is
 *     stored as compress_lz4 if HAVE_LZ4 is defined, otherwise as
 *     compress_xz if HAVE_XZ is defined.
 *   - The mmap cache is either a new reference to mmap_cache or a newly
 *     allocated cache.
 *   - The path is duplicated, the chain cache hashmap is allocated, and
 *     the file is opened with O_CLOEXEC added to the flags and stat'ed.
 *   - An empty, writable file is set up from scratch: the creation time
 *     is attached, the FSPRG state is loaded, the header is initialized
 *     (taking template into account) and the file is stat'ed again.
 *   - The file size is checked against HEADER_SIZE_MIN and the header
 *     is mapped with MAP_SHARED.
 *   - Files that were not just created get their header verified, and
 *     if they are writable the FSPRG state is loaded for them.
 *   - The metrics are either completed by journal_default_metrics() and
 *     copied from metrics, or taken over from template.
 *   - The header is refreshed and the HMAC is set up. Newly created
 *     files get their field and data hash tables and the first tag.
 *     Then both hash tables are mapped.
 *
 * journal_file_close() is called on f near the end of the function.
 * NOTE(review): that call is presumably the common failure path, with
 * the file handed out via ret on success -- confirm. */
2449 int journal_file_open(
2455 JournalMetrics *metrics,
2456 MMapCache *mmap_cache,
2457 JournalFile *template,
2458 JournalFile **ret) {
2462 bool newly_created = false;
2467 if ((flags & O_ACCMODE) != O_RDONLY &&
2468 (flags & O_ACCMODE) != O_RDWR)
2471 if (!endswith(fname, ".journal") &&
2472 !endswith(fname, ".journal~"))
2475 f = new0(JournalFile, 1);
2483 f->prot = prot_from_flags(flags);
2484 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2485 #if defined(HAVE_LZ4)
2486 f->compress_lz4 = compress;
2487 #elif defined(HAVE_XZ)
2488 f->compress_xz = compress;
2495 f->mmap = mmap_cache_ref(mmap_cache);
2497 f->mmap = mmap_cache_new();
2504 f->path = strdup(fname);
2510 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2511 if (!f->chain_cache) {
2516 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2522 if (fstat(f->fd, &f->last_stat) < 0) {
2527 if (f->last_stat.st_size == 0 && f->writable) {
2528 /* Let's attach the creation time to the journal file,
2529 * so that the vacuuming code knows the age of this
2530 * file even if the file might end up corrupted one
2531 * day... Ideally we'd just use the creation time many
2532 * file systems maintain for each file, but there is
2533 * currently no usable API to query this, hence let's
2534 * emulate this via extended attributes. If extended
2535 * attributes are not supported we'll just skip this,
2536 * and rely solely on mtime/atime/ctime of the file. */
2538 fd_setcrtime(f->fd, now(CLOCK_REALTIME));
2541 /* Try to load the FSPRG state, and if we can't, then
2542 * just don't do sealing */
2544 r = journal_file_fss_load(f);
2550 r = journal_file_init_header(f, template);
2554 if (fstat(f->fd, &f->last_stat) < 0) {
2559 newly_created = true;
2562 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2567 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2568 if (f->header == MAP_FAILED) {
2574 if (!newly_created) {
2575 r = journal_file_verify_header(f);
2581 if (!newly_created && f->writable) {
2582 r = journal_file_fss_load(f);
2590 journal_default_metrics(metrics, f->fd);
2591 f->metrics = *metrics;
2592 } else if (template)
2593 f->metrics = template->metrics;
2595 r = journal_file_refresh_header(f);
2601 r = journal_file_hmac_setup(f);
2606 if (newly_created) {
2607 r = journal_file_setup_field_hash_table(f);
2611 r = journal_file_setup_data_hash_table(f);
2616 r = journal_file_append_first_tag(f);
2622 r = journal_file_map_field_hash_table(f);
2626 r = journal_file_map_data_hash_table(f);
2634 journal_file_close(f);
/* Rotates the journal file: the current file is renamed to an archive
 * name, marked as archived, and a new file is opened under the old
 * path.
 *
 * The old file is checked for being writable and for a path ending in
 * ".journal". The archive name is the old path without this suffix,
 * followed by "@", the seqnum ID, the head entry's sequence number and
 * its realtime timestamp (both as 16 hex digits), and ".journal".
 *
 * The new file is opened with the flags, mode and mmap cache of the old
 * one, which also serves as template. The old file is closed
 * afterwards.
 * NOTE(review): old_file is presumably *f, and *f presumably receives
 * the new file on return -- confirm. */
2639 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2640 _cleanup_free_ char *p = NULL;
2642 JournalFile *old_file, *new_file = NULL;
2650 if (!old_file->writable)
2653 if (!endswith(old_file->path, ".journal"))
2656 l = strlen(old_file->path);
/* The precision below cuts the 8 characters of ".journal" off the path. */
2657 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2658 (int) l - 8, old_file->path,
2659 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2660 le64toh((*f)->header->head_entry_seqnum),
2661 le64toh((*f)->header->head_entry_realtime));
2665 r = rename(old_file->path, p);
2669 old_file->header->state = STATE_ARCHIVED;
2671 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2672 journal_file_close(old_file);
/* Like journal_file_open(), but if the file turns out to be unusable it
 * is moved out of the way and a fresh file is created in its place.
 *
 * The first journal_file_open() result is compared against the set of
 * errors listed below (corrupted, truncated, other machine,
 * incompatible feature, unclean shutdown, already archived). The
 * replacement additionally depends on the access mode not being
 * O_RDONLY, on O_CREAT being set and on fname ending in ".journal".
 *
 * The bad file is renamed to a name ending in ".journal~" that embeds
 * the current wallclock time and a second 64bit value (both as 16 hex
 * digits). A warning is logged and journal_file_open() is called once
 * more with the same arguments. Its result is returned unchanged.
 * NOTE(review): results outside of the error set, and calls that fail
 * the additional checks, presumably return the first result as is --
 * confirm. */
2678 int journal_file_open_reliably(
2684 JournalMetrics *metrics,
2685 MMapCache *mmap_cache,
2686 JournalFile *template,
2687 JournalFile **ret) {
2691 _cleanup_free_ char *p = NULL;
2693 r = journal_file_open(fname, flags, mode, compress, seal,
2694 metrics, mmap_cache, template, ret);
2695 if (r != -EBADMSG && /* corrupted */
2696 r != -ENODATA && /* truncated */
2697 r != -EHOSTDOWN && /* other machine */
2698 r != -EPROTONOSUPPORT && /* incompatible feature */
2699 r != -EBUSY && /* unclean shutdown */
2700 r != -ESHUTDOWN /* already archived */)
2703 if ((flags & O_ACCMODE) == O_RDONLY)
2706 if (!(flags & O_CREAT))
2709 if (!endswith(fname, ".journal"))
2712 /* The file is corrupted. Rotate it away and try it again (but only once) */
2715 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2717 (unsigned long long) now(CLOCK_REALTIME),
2721 r = rename(fname, p);
2725 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2727 return journal_file_open(fname, flags, mode, compress, seal,
2728 metrics, mmap_cache, template, ret);
/* Copies the entry object o, located at offset p in from, into to.
 *
 * The timestamps are taken over from the entry. For each of the
 * entry's items the referenced data object is mapped and its hash is
 * compared with the hash stored in the item. The payload length is the
 * object size minus the offset of the payload within the object. It is
 * also checked that this length survives a conversion to a narrower
 * type, which matters on 32bit machines. Compressed payloads are
 * decompressed into the compress buffer of from if XZ or LZ4 support is
 * compiled in. Otherwise -EPROTONOSUPPORT is returned. The payload is
 * then appended to the destination file, and the resulting offset and
 * hash are recorded in the new item array while the xor hash is
 * accumulated.
 *
 * Mapping a data object overwrites o. The entry is therefore mapped
 * again at offset p at the end of each iteration. Finally the new entry
 * is appended with journal_file_append_entry_internal(), whose result
 * is returned unchanged. */
2731 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2733 uint64_t q, xor_hash = 0;
2746 ts.monotonic = le64toh(o->entry.monotonic);
2747 ts.realtime = le64toh(o->entry.realtime);
2749 n = journal_file_entry_n_items(o);
/* NOTE(review): n is derived from the entry object in the file, hence
 * the size of this stack allocation is determined by file content.
 * Verify that n is bounded before this point, or consider using a heap
 * allocation instead. */
2750 /* alloca() can't take 0, hence let's allocate at least one */
2751 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2753 for (i = 0; i < n; i++) {
2760 q = le64toh(o->entry.items[i].object_offset);
2761 le_hash = o->entry.items[i].hash;
2763 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2767 if (le_hash != o->data.hash)
2770 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2773 /* We hit the limit on 32bit machines */
2774 if ((uint64_t) t != l)
2777 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2778 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2781 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2782 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2786 data = from->compress_buffer;
2789 return -EPROTONOSUPPORT;
2792 data = o->data.payload;
2794 r = journal_file_append_data(to, data, l, &u, &h);
2798 xor_hash ^= le64toh(u->data.hash);
2799 items[i].object_offset = htole64(h);
2800 items[i].hash = u->data.hash;
2802 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2807 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
/* Fills in and sanitizes the size limits in m. Fields set to
 * (uint64_t) -1 are treated as unset and get a default. The size of the
 * file system fd lives on is used where it can be determined via
 * fstatvfs(). All resulting values are logged with log_debug().
 *
 *   max_use:   if unset, 10% of the file system size, kept between
 *              DEFAULT_MAX_USE_LOWER and DEFAULT_MAX_USE_UPPER, or
 *              DEFAULT_MAX_USE_LOWER. Otherwise it is page aligned and
 *              raised to at least twice JOURNAL_FILE_SIZE_MIN.
 *   max_size:  if unset, an eighth of max_use, at most
 *              DEFAULT_MAX_SIZE_UPPER. Otherwise it is page aligned.
 *              It is raised to at least JOURNAL_FILE_SIZE_MIN, and
 *              max_use is raised to at least twice max_size.
 *   min_size:  if unset, JOURNAL_FILE_SIZE_MIN. Otherwise it is page
 *              aligned and raised to at least JOURNAL_FILE_SIZE_MIN.
 *              max_size is raised to at least min_size.
 *   keep_free: if unset, 15% of the file system size, at most
 *              DEFAULT_KEEP_FREE_UPPER, or DEFAULT_KEEP_FREE.
 *
 * NOTE(review): DEFAULT_MAX_USE_LOWER and DEFAULT_KEEP_FREE are
 * presumably the fallbacks for an unknown file system size (fs_size
 * stays 0 if fstatvfs() fails) -- confirm. */
2810 void journal_default_metrics(JournalMetrics *m, int fd) {
2811 uint64_t fs_size = 0;
2813 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
/* NOTE(review): the product below is calculated in the type of the two
 * statvfs fields before it is widened to uint64_t. Verify that this
 * cannot overflow where these types are narrower than 64bit. */
2818 if (fstatvfs(fd, &ss) >= 0)
2819 fs_size = ss.f_frsize * ss.f_blocks;
2821 if (m->max_use == (uint64_t) -1) {
2824 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2826 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2827 m->max_use = DEFAULT_MAX_USE_UPPER;
2829 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2830 m->max_use = DEFAULT_MAX_USE_LOWER;
2832 m->max_use = DEFAULT_MAX_USE_LOWER;
2834 m->max_use = PAGE_ALIGN(m->max_use);
2836 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2837 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2840 if (m->max_size == (uint64_t) -1) {
2841 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2843 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2844 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2846 m->max_size = PAGE_ALIGN(m->max_size);
2848 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2849 m->max_size = JOURNAL_FILE_SIZE_MIN;
2851 if (m->max_size*2 > m->max_use)
2852 m->max_use = m->max_size*2;
2854 if (m->min_size == (uint64_t) -1)
2855 m->min_size = JOURNAL_FILE_SIZE_MIN;
2857 m->min_size = PAGE_ALIGN(m->min_size);
2859 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2860 m->min_size = JOURNAL_FILE_SIZE_MIN;
2862 if (m->min_size > m->max_size)
2863 m->max_size = m->min_size;
2866 if (m->keep_free == (uint64_t) -1) {
2869 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2871 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2872 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2875 m->keep_free = DEFAULT_KEEP_FREE;
2878 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2879 format_bytes(a, sizeof(a), m->max_use),
2880 format_bytes(b, sizeof(b), m->max_size),
2881 format_bytes(c, sizeof(c), m->min_size),
2882 format_bytes(d, sizeof(d), m->keep_free));
/* Reports the wallclock time range covered by f, as recorded in the
 * header: *from receives the realtime timestamp of the head entry, *to
 * the one of the tail entry (both converted from little endian). A
 * stored timestamp of 0 is checked for separately before each
 * assignment.
 * NOTE(review): a timestamp of 0 presumably means that the file has no
 * entries yet and makes the function return early -- confirm. */
2885 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2890 if (f->header->head_entry_realtime == 0)
2893 *from = le64toh(f->header->head_entry_realtime);
2897 if (f->header->tail_entry_realtime == 0)
2900 *to = le64toh(f->header->tail_entry_realtime);
/* Reports the monotonic time range f covers for the boot boot_id.
 *
 * The "_BOOT_ID=" data object for the boot is looked up and its offset
 * kept in p. Its entry count is checked for being zero. *from receives
 * the monotonic timestamp of the entry at the data object's
 * entry_offset. Mapping that entry overwrites o, hence the data object
 * is mapped again at p. *to then receives the monotonic timestamp of
 * the entry at index n_entries-1 of the data object's entry list.
 * NOTE(review): from and to are presumably optional -- confirm. */
2906 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2914 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2918 if (le64toh(o->data.n_entries) <= 0)
2922 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2926 *from = le64toh(o->entry.monotonic);
2930 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2934 r = generic_array_get_plus_one(f,
2935 le64toh(o->data.entry_offset),
2936 le64toh(o->data.entry_array_offset),
2937 le64toh(o->data.n_entries)-1,
2942 *to = le64toh(o->entry.monotonic);
2948 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2951 /* If we gained new header fields we gained new features,
2952 * hence suggest a rotation */
2953 if (le64toh(f->header->header_size) < sizeof(Header)) {
2954 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2958 /* Let's check if the hash tables grew over a certain fill
2959 * level (75%, borrowing this value from Java's hash table
2960 * implementation), and if so suggest a rotation. To calculate
2961 * the fill level we need the n_data field, which only exists
2962 * in newer versions. */
2964 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2965 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2966 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2968 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2969 le64toh(f->header->n_data),
2970 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2971 (unsigned long long) f->last_stat.st_size,
2972 f->last_stat.st_size / le64toh(f->header->n_data));
2976 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2977 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2978 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2980 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2981 le64toh(f->header->n_fields),
2982 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2986 /* Are the data objects properly indexed by field objects? */
2987 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2988 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2989 le64toh(f->header->n_data) > 0 &&
2990 le64toh(f->header->n_fields) == 0)
2993 if (max_file_usec > 0) {
2996 h = le64toh(f->header->head_entry_realtime);
2997 t = now(CLOCK_REALTIME);
2999 if (h > 0 && t > h + max_file_usec)