1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "journal-authenticate.h"
37 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
38 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
40 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
42 /* This is the minimum journal file size */
43 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
45 /* These are the lower and upper bounds if we deduce the max_use value
46 * from the file system size */
47 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
48 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
50 /* This is the upper bound if we deduce max_size from max_use */
51 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
53 /* This is the upper bound if we deduce the keep_free value from the file system size */
55 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
57 /* This is the keep_free value when we can't determine the file system size */
59 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
61 /* n_data was the first entry we added after the initial file format design */
62 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
64 /* How many entries to keep in the entry array chain cache at max */
65 #define CHAIN_CACHE_MAX 20
67 /* How much to increase the journal file size at once each time we allocate something new. */
68 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
70 /* Reread fstat() of the file for detecting deletions at least this often */
71 #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
73 /* The mmap context to use for the header: we pick the one right above the last defined object type */
74 #define CONTEXT_HEADER _OBJECT_TYPE_MAX
76 static int journal_file_set_online(JournalFile *f) {
82 if (!(f->fd >= 0 && f->header))
85 if (mmap_cache_got_sigbus(f->mmap, f->fd))
88 switch(f->header->state) {
93 f->header->state = STATE_ONLINE;
102 int journal_file_set_offline(JournalFile *f) {
108 if (!(f->fd >= 0 && f->header))
111 if (f->header->state != STATE_ONLINE)
116 if (mmap_cache_got_sigbus(f->mmap, f->fd))
119 f->header->state = STATE_OFFLINE;
121 if (mmap_cache_got_sigbus(f->mmap, f->fd))
129 void journal_file_close(JournalFile *f) {
133 /* Write the final tag */
134 if (f->seal && f->writable)
135 journal_file_append_tag(f);
138 journal_file_set_offline(f);
140 if (f->mmap && f->fd >= 0)
141 mmap_cache_close_fd(f->mmap, f->fd);
147 mmap_cache_unref(f->mmap);
149 ordered_hashmap_free_free(f->chain_cache);
151 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
152 free(f->compress_buffer);
157 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
158 else if (f->fsprg_state)
159 free(f->fsprg_state);
164 gcry_md_close(f->hmac);
170 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
177 memcpy(h.signature, HEADER_SIGNATURE, 8);
178 h.header_size = htole64(ALIGN64(sizeof(h)));
180 h.incompatible_flags |= htole32(
181 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
182 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
184 h.compatible_flags = htole32(
185 f->seal * HEADER_COMPATIBLE_SEALED);
187 r = sd_id128_randomize(&h.file_id);
192 h.seqnum_id = template->header->seqnum_id;
193 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
195 h.seqnum_id = h.file_id;
197 k = pwrite(f->fd, &h, sizeof(h), 0);
207 static int journal_file_refresh_header(JournalFile *f) {
213 r = sd_id128_get_machine(&f->header->machine_id);
217 r = sd_id128_get_boot(&boot_id);
221 if (sd_id128_equal(boot_id, f->header->boot_id))
222 f->tail_entry_monotonic_valid = true;
224 f->header->boot_id = boot_id;
226 r = journal_file_set_online(f);
228 /* Sync the online state to disk */
234 static int journal_file_verify_header(JournalFile *f) {
239 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
242 /* In both read and write mode we refuse to open files with
243 * incompatible flags we don't know */
244 flags = le32toh(f->header->incompatible_flags);
245 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
246 if (flags & ~HEADER_INCOMPATIBLE_ANY)
247 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
248 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
249 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
251 log_debug("Journal file %s uses incompatible flags %"PRIx32
252 " disabled at compilation time.", f->path, flags);
253 return -EPROTONOSUPPORT;
256 /* When open for writing we additionally refuse to open files
257 * that carry compatible flags we don't support */
258 flags = le32toh(f->header->compatible_flags);
259 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
260 if (flags & ~HEADER_COMPATIBLE_ANY)
261 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
262 f->path, flags & ~HEADER_COMPATIBLE_ANY);
263 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
265 log_debug("Journal file %s uses compatible flags %"PRIx32
266 " disabled at compilation time.", f->path, flags);
267 return -EPROTONOSUPPORT;
270 if (f->header->state >= _STATE_MAX)
273 /* The first addition was n_data, so check that we are at least this large */
274 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
277 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
280 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
283 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
286 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
287 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
288 !VALID64(le64toh(f->header->tail_object_offset)) ||
289 !VALID64(le64toh(f->header->entry_array_offset)))
294 sd_id128_t machine_id;
297 r = sd_id128_get_machine(&machine_id);
301 if (!sd_id128_equal(machine_id, f->header->machine_id))
304 state = f->header->state;
306 if (state == STATE_ONLINE) {
307 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
309 } else if (state == STATE_ARCHIVED)
311 else if (state != STATE_OFFLINE) {
312 log_debug("Journal file %s has unknown state %u.", f->path, state);
317 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
318 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
320 f->seal = JOURNAL_HEADER_SEALED(f->header);
325 static int journal_file_fstat(JournalFile *f) {
329 if (fstat(f->fd, &f->last_stat) < 0)
332 f->last_stat_usec = now(CLOCK_MONOTONIC);
334 /* Refuse appending to files that are already deleted */
335 if (f->last_stat.st_nlink <= 0)
341 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
342 uint64_t old_size, new_size;
347 /* We assume that this file is not sparse, and we know that
348 * for sure, since we always call posix_fallocate() ourselves */
351 if (mmap_cache_got_sigbus(f->mmap, f->fd))
355 le64toh(f->header->header_size) +
356 le64toh(f->header->arena_size);
358 new_size = PAGE_ALIGN(offset + size);
359 if (new_size < le64toh(f->header->header_size))
360 new_size = le64toh(f->header->header_size);
362 if (new_size <= old_size) {
364 /* We already pre-allocated enough space, but before
365 * we write to it, let's check with fstat() if the
366 * file got deleted, in order to make sure we don't throw
367 * away the data immediately. Don't check fstat() for
368 * all writes though, but at most once per LAST_STAT_REFRESH_USEC. */
370 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
373 return journal_file_fstat(f);
376 /* Allocate more space. */
378 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
381 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
384 if (fstatvfs(f->fd, &svfs) >= 0) {
387 available = svfs.f_bfree * svfs.f_bsize;
389 if (available >= f->metrics.keep_free)
390 available -= f->metrics.keep_free;
394 if (new_size - old_size > available)
399 /* Increase by larger blocks at once */
400 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
401 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
402 new_size = f->metrics.max_size;
404 /* Note that the glibc fallocate() fallback is very
405 inefficient, hence we minimize the allocation area by only allocating the range past old_size */
407 r = posix_fallocate(f->fd, old_size, new_size - old_size);
411 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
413 return journal_file_fstat(f);
/* Maps an object type to the mmap cache context number used for it:
 * a type strictly between OBJECT_UNUSED and _OBJECT_TYPE_MAX is used
 * as its own context number, every other value maps to context 0. */
416 static unsigned type_to_context(ObjectType type) {
417 /* One context for each type, plus one catch-all for the rest */
/* presumably compile-time checks that all per-type contexts and
 * CONTEXT_HEADER fit below MMAP_CACHE_MAX_CONTEXTS -- confirm
 * against the definition of assert_cc() */
418 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
419 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
420 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
/* Requests a mapping of 'size' bytes at 'offset' of the file from the
 * mmap cache, using the context derived from 'type'; 'keep_always' and
 * 'ret' are forwarded to mmap_cache_get() unchanged. Returns
 * -EADDRNOTAVAIL if the range still ends beyond the file size after
 * the stat data was refreshed, otherwise whatever mmap_cache_get()
 * returns. */
423 static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
432 /* Avoid SIGBUS on invalid accesses */
/* NOTE(review): offset + size is computed in uint64_t and wraps around
 * for very large values, in which case this range check passes --
 * confirm that callers bound both values before calling. */
433 if (offset + size > (uint64_t) f->last_stat.st_size) {
434 /* Hmm, out of range? Let's refresh the fstat() data
435 * first, before we trust that check. */
437 r = journal_file_fstat(f);
441 if (offset + size > (uint64_t) f->last_stat.st_size)
442 return -EADDRNOTAVAIL;
445 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
/* Returns the smallest size an object of o's type may have: the size
 * of the type-specific struct if the type has an entry in the table
 * below, and sizeof(ObjectHeader) for every other type. */
448 static uint64_t minimum_header_size(Object *o) {
/* Indexed by object type; types without an initializer are left at 0 */
450 static const uint64_t table[] = {
451 [OBJECT_DATA] = sizeof(DataObject),
452 [OBJECT_FIELD] = sizeof(FieldObject),
453 [OBJECT_ENTRY] = sizeof(EntryObject),
454 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
455 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
456 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
457 [OBJECT_TAG] = sizeof(TagObject),
/* Types beyond the table or with a zero entry fall back to the generic header size */
460 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
461 return sizeof(ObjectHeader);
463 return table[o->object.type];
466 int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
475 /* Objects may only be located at multiples of 64 bits */
476 if (!VALID64(offset))
479 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
484 s = le64toh(o->object.size);
486 if (s < sizeof(ObjectHeader))
489 if (o->object.type <= OBJECT_UNUSED)
492 if (s < minimum_header_size(o))
495 if (type > OBJECT_UNUSED && o->object.type != type)
498 if (s > sizeof(ObjectHeader)) {
499 r = journal_file_move_to(f, type, false, offset, s, &t);
510 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
515 r = le64toh(f->header->tail_entry_seqnum) + 1;
518 /* If an external seqnum counter was passed, we update
519 * both the local and the external one, and set it to
520 * the maximum of both */
528 f->header->tail_entry_seqnum = htole64(r);
530 if (f->header->head_entry_seqnum == 0)
531 f->header->head_entry_seqnum = htole64(r);
536 int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
543 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
544 assert(size >= sizeof(ObjectHeader));
548 r = journal_file_set_online(f);
552 p = le64toh(f->header->tail_object_offset);
554 p = le64toh(f->header->header_size);
556 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
560 p += ALIGN64(le64toh(tail->object.size));
563 r = journal_file_allocate(f, p, size);
567 r = journal_file_move_to(f, type, false, p, size, &t);
574 o->object.type = type;
575 o->object.size = htole64(size);
577 f->header->tail_object_offset = htole64(p);
578 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
586 static int journal_file_setup_data_hash_table(JournalFile *f) {
593 /* We estimate that we need 1 hash table entry per 768 bytes of
594 journal file and we want to make sure we never get beyond
595 75% fill level. Calculate the hash table size for the
596 maximum file size based on these metrics. */
598 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
599 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
600 s = DEFAULT_DATA_HASH_TABLE_SIZE;
602 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
604 r = journal_file_append_object(f,
605 OBJECT_DATA_HASH_TABLE,
606 offsetof(Object, hash_table.items) + s,
611 memzero(o->hash_table.items, s);
613 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
614 f->header->data_hash_table_size = htole64(s);
619 static int journal_file_setup_field_hash_table(JournalFile *f) {
626 /* We use a fixed size hash table for the fields as this
627 * number should grow very slowly only */
629 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
630 r = journal_file_append_object(f,
631 OBJECT_FIELD_HASH_TABLE,
632 offsetof(Object, hash_table.items) + s,
637 memzero(o->hash_table.items, s);
639 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
640 f->header->field_hash_table_size = htole64(s);
645 static int journal_file_map_data_hash_table(JournalFile *f) {
652 p = le64toh(f->header->data_hash_table_offset);
653 s = le64toh(f->header->data_hash_table_size);
655 r = journal_file_move_to(f,
656 OBJECT_DATA_HASH_TABLE,
663 f->data_hash_table = t;
667 static int journal_file_map_field_hash_table(JournalFile *f) {
674 p = le64toh(f->header->field_hash_table_offset);
675 s = le64toh(f->header->field_hash_table_size);
677 r = journal_file_move_to(f,
678 OBJECT_FIELD_HASH_TABLE,
685 f->field_hash_table = t;
689 static int journal_file_link_field(
702 if (o->object.type != OBJECT_FIELD)
705 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
709 /* This might alter the window we are looking at */
710 o->field.next_hash_offset = o->field.head_data_offset = 0;
713 p = le64toh(f->field_hash_table[h].tail_hash_offset);
715 f->field_hash_table[h].head_hash_offset = htole64(offset);
717 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
721 o->field.next_hash_offset = htole64(offset);
724 f->field_hash_table[h].tail_hash_offset = htole64(offset);
726 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
727 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
732 static int journal_file_link_data(
745 if (o->object.type != OBJECT_DATA)
748 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
752 /* This might alter the window we are looking at */
753 o->data.next_hash_offset = o->data.next_field_offset = 0;
754 o->data.entry_offset = o->data.entry_array_offset = 0;
755 o->data.n_entries = 0;
758 p = le64toh(f->data_hash_table[h].tail_hash_offset);
760 /* Only entry in the hash table is easy */
761 f->data_hash_table[h].head_hash_offset = htole64(offset);
763 /* Move back to the previous data object, to patch in its next_hash_offset pointer */
766 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
770 o->data.next_hash_offset = htole64(offset);
773 f->data_hash_table[h].tail_hash_offset = htole64(offset);
775 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
776 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
781 int journal_file_find_field_object_with_hash(
783 const void *field, uint64_t size, uint64_t hash,
784 Object **ret, uint64_t *offset) {
786 uint64_t p, osize, h, m;
790 assert(field && size > 0);
792 osize = offsetof(Object, field.payload) + size;
794 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
800 p = le64toh(f->field_hash_table[h].head_hash_offset);
805 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
809 if (le64toh(o->field.hash) == hash &&
810 le64toh(o->object.size) == osize &&
811 memcmp(o->field.payload, field, size) == 0) {
821 p = le64toh(o->field.next_hash_offset);
827 int journal_file_find_field_object(
829 const void *field, uint64_t size,
830 Object **ret, uint64_t *offset) {
835 assert(field && size > 0);
837 hash = hash64(field, size);
839 return journal_file_find_field_object_with_hash(f,
844 int journal_file_find_data_object_with_hash(
846 const void *data, uint64_t size, uint64_t hash,
847 Object **ret, uint64_t *offset) {
849 uint64_t p, osize, h, m;
853 assert(data || size == 0);
855 osize = offsetof(Object, data.payload) + size;
857 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
862 p = le64toh(f->data_hash_table[h].head_hash_offset);
867 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
871 if (le64toh(o->data.hash) != hash)
874 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
875 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
879 l = le64toh(o->object.size);
880 if (l <= offsetof(Object, data.payload))
883 l -= offsetof(Object, data.payload);
885 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
886 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
891 memcmp(f->compress_buffer, data, size) == 0) {
902 return -EPROTONOSUPPORT;
904 } else if (le64toh(o->object.size) == osize &&
905 memcmp(o->data.payload, data, size) == 0) {
917 p = le64toh(o->data.next_hash_offset);
923 int journal_file_find_data_object(
925 const void *data, uint64_t size,
926 Object **ret, uint64_t *offset) {
931 assert(data || size == 0);
933 hash = hash64(data, size);
935 return journal_file_find_data_object_with_hash(f,
940 static int journal_file_append_field(
942 const void *field, uint64_t size,
943 Object **ret, uint64_t *offset) {
951 assert(field && size > 0);
953 hash = hash64(field, size);
955 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
969 osize = offsetof(Object, field.payload) + size;
970 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
974 o->field.hash = htole64(hash);
975 memcpy(o->field.payload, field, size);
977 r = journal_file_link_field(f, o, p, hash);
981 /* The linking might have altered the window, so let's
982 * refresh our pointer */
983 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
988 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
/* Appends a data object for the payload 'data' of 'size' bytes. The
 * visible steps are: hash the payload, look for an existing object
 * with that hash and payload, append a new OBJECT_DATA object, store
 * the hash, fill in the payload (compressed or copied verbatim), link
 * the object into the data hash table, link it to the field object
 * named by the part before the first '=', and finally feed it to
 * journal_file_hmac_put_object(). */
1002 static int journal_file_append_data(
1004 const void *data, uint64_t size,
1005 Object **ret, uint64_t *offset) {
1010 int r, compression = 0;
1014 assert(data || size == 0);
1016 hash = hash64(data, size);
1018 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
/* The object is allocated for the uncompressed payload size */
1032 osize = offsetof(Object, data.payload) + size;
1033 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
1037 o->data.hash = htole64(hash);
1039 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
/* NOTE(review): only f->compress_xz is tested here although this
 * section is also compiled when just HAVE_LZ4 is defined -- confirm
 * that f->compress_lz4 does not need to be honoured as well. */
1040 if (f->compress_xz &&
1041 size >= COMPRESSION_SIZE_THRESHOLD) {
1044 compression = compress_blob(data, size, o->data.payload, &rsize);
/* Shrink the recorded object size to the compressed payload and flag the compression in the object */
1047 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
1048 o->object.flags |= compression;
1050 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
1051 size, rsize, object_compressed_to_string(compression));
/* Payload was not compressed: copy it verbatim */
1056 if (!compression && size > 0)
1057 memcpy(o->data.payload, data, size);
1059 r = journal_file_link_data(f, o, p, hash);
1063 /* The linking might have altered the window, so let's
1064 * refresh our pointer */
1065 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* NOTE(review): the assert above permits data == NULL when size == 0;
 * memchr() would then be called with a NULL pointer -- confirm this
 * case cannot reach this line. */
1072 eq = memchr(data, '=', size);
/* Only payloads with a non-empty name before '=' get a field object */
1073 if (eq && eq > data) {
1077 /* Create field object ... */
1078 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1082 /* ... and link it in. */
/* The new data object becomes the head of the field's data list */
1083 o->data.next_field_offset = fo->field.head_data_offset;
/* NOTE(review): p is a host-order offset stored into an on-disk field,
 * so htole64() looks like the intended conversion (as used for
 * o->data.hash above); le64toh() performs the same byte swap, so the
 * stored value should be identical -- confirm and align the macro. */
1084 fo->field.head_data_offset = le64toh(p);
1088 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1102 uint64_t journal_file_entry_n_items(Object *o) {
1105 if (o->object.type != OBJECT_ENTRY)
1108 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1111 uint64_t journal_file_entry_array_n_items(Object *o) {
1114 if (o->object.type != OBJECT_ENTRY_ARRAY)
1117 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1120 uint64_t journal_file_hash_table_n_items(Object *o) {
1123 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1124 o->object.type != OBJECT_FIELD_HASH_TABLE)
1127 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1130 static int link_entry_into_array(JournalFile *f,
1135 uint64_t n = 0, ap = 0, q, i, a, hidx;
1143 a = le64toh(*first);
1144 i = hidx = le64toh(*idx);
1147 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1151 n = journal_file_entry_array_n_items(o);
1153 o->entry_array.items[i] = htole64(p);
1154 *idx = htole64(hidx + 1);
1160 a = le64toh(o->entry_array.next_entry_array_offset);
1171 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1172 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1178 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1183 o->entry_array.items[i] = htole64(p);
1186 *first = htole64(q);
1188 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1192 o->entry_array.next_entry_array_offset = htole64(q);
1195 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1196 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1198 *idx = htole64(hidx + 1);
1203 static int link_entry_into_array_plus_one(JournalFile *f,
1218 *extra = htole64(p);
1222 i = htole64(le64toh(*idx) - 1);
1223 r = link_entry_into_array(f, first, &i, p);
1228 *idx = htole64(le64toh(*idx) + 1);
1232 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1239 p = le64toh(o->entry.items[i].object_offset);
1243 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1247 return link_entry_into_array_plus_one(f,
1248 &o->data.entry_offset,
1249 &o->data.entry_array_offset,
1254 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1262 if (o->object.type != OBJECT_ENTRY)
1265 __sync_synchronize();
1267 /* Link up the entry itself */
1268 r = link_entry_into_array(f,
1269 &f->header->entry_array_offset,
1270 &f->header->n_entries,
1275 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1277 if (f->header->head_entry_realtime == 0)
1278 f->header->head_entry_realtime = o->entry.realtime;
1280 f->header->tail_entry_realtime = o->entry.realtime;
1281 f->header->tail_entry_monotonic = o->entry.monotonic;
1283 f->tail_entry_monotonic_valid = true;
1285 /* Link up the items */
1286 n = journal_file_entry_n_items(o);
1287 for (i = 0; i < n; i++) {
1288 r = journal_file_link_entry_item(f, o, offset, i);
1296 static int journal_file_append_entry_internal(
1298 const dual_timestamp *ts,
1300 const EntryItem items[], unsigned n_items,
1302 Object **ret, uint64_t *offset) {
1309 assert(items || n_items == 0);
1312 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1314 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1318 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1319 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1320 o->entry.realtime = htole64(ts->realtime);
1321 o->entry.monotonic = htole64(ts->monotonic);
1322 o->entry.xor_hash = htole64(xor_hash);
1323 o->entry.boot_id = f->header->boot_id;
1326 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1331 r = journal_file_link_entry(f, o, np);
1344 void journal_file_post_change(JournalFile *f) {
1347 /* inotify() does not receive IN_MODIFY events from file
1348 * accesses done via mmap(). After each access we hence
1349 * trigger IN_MODIFY by truncating the journal file to its
1350 * current size which triggers IN_MODIFY. */
1352 __sync_synchronize();
1354 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1355 log_error_errno(errno, "Failed to truncate file to its own size: %m");
1358 static int entry_item_cmp(const void *_a, const void *_b) {
1359 const EntryItem *a = _a, *b = _b;
1361 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1363 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1368 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1372 uint64_t xor_hash = 0;
1373 struct dual_timestamp _ts;
1376 assert(iovec || n_iovec == 0);
1379 dual_timestamp_get(&_ts);
1383 if (f->tail_entry_monotonic_valid &&
1384 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1388 r = journal_file_maybe_append_tag(f, ts->realtime);
1393 /* alloca() can't take 0, hence let's allocate at least one */
1394 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1396 for (i = 0; i < n_iovec; i++) {
1400 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1404 xor_hash ^= le64toh(o->data.hash);
1405 items[i].object_offset = htole64(p);
1406 items[i].hash = o->data.hash;
1409 /* Order by the position on disk, in order to improve seek
1410 * times for rotating media. */
1411 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1413 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1415 /* If the memory mapping triggered a SIGBUS then we return an
1416 * IO error and ignore the error code passed down to us, since
1417 * it is very likely just an effect of a nullified replacement
 * mapping page */
1420 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1423 journal_file_post_change(f);
1428 typedef struct ChainCacheItem {
1429 uint64_t first; /* the array at the beginning of the chain */
1430 uint64_t array; /* the cached array */
1431 uint64_t begin; /* the first item in the cached array */
1432 uint64_t total; /* the total number of items in all arrays before this one in the chain */
1433 uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
1436 static void chain_cache_put(
1443 uint64_t last_index) {
1446 /* If the chain item to cache for this chain is the
1447 * first one it's not worth caching anything */
1451 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
1452 ci = ordered_hashmap_steal_first(h);
1455 ci = new(ChainCacheItem, 1);
1462 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
1467 assert(ci->first == first);
1472 ci->last_index = last_index;
1475 static int generic_array_get(
1479 Object **ret, uint64_t *offset) {
1482 uint64_t p = 0, a, t = 0;
1490 /* Try the chain cache first */
1491 ci = ordered_hashmap_get(f->chain_cache, &first);
1492 if (ci && i > ci->total) {
1501 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1505 k = journal_file_entry_array_n_items(o);
1507 p = le64toh(o->entry_array.items[i]);
1513 a = le64toh(o->entry_array.next_entry_array_offset);
1519 /* Let's cache this item for the next invocation */
1520 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1522 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1535 static int generic_array_get_plus_one(
1540 Object **ret, uint64_t *offset) {
1549 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1562 return generic_array_get(f, first, i-1, ret, offset);
1571 static int generic_array_bisect(
1576 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1577 direction_t direction,
1582 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1583 bool subtract_one = false;
1584 Object *o, *array = NULL;
1589 assert(test_object);
1591 /* Start with the first array in the chain */
1594 ci = ordered_hashmap_get(f->chain_cache, &first);
1595 if (ci && n > ci->total) {
1596 /* Ah, we have iterated this bisection array chain
1597 * previously! Let's see if we can skip ahead in the
1598 * chain, as far as the last time. But we can't jump
1599 * backwards in the chain, so let's check that first. */
1602 r = test_object(f, ci->begin, needle);
1606 if (r == TEST_LEFT) {
1607 /* OK, what we are looking for is right of the
1608 * begin of this EntryArray, so let's jump
1609 * straight to the previously cached array in the chain */
1615 last_index = ci->last_index;
1620 uint64_t left, right, k, lp;
1622 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1626 k = journal_file_entry_array_n_items(array);
1632 lp = p = le64toh(array->entry_array.items[i]);
1636 r = test_object(f, p, needle);
1640 if (r == TEST_FOUND)
1641 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1643 if (r == TEST_RIGHT) {
1647 if (last_index != (uint64_t) -1) {
1648 assert(last_index <= right);
1650 /* If we cached the last index we
1651 * looked at, let's try not to jump
1652 * too wildly around and see if we can
1653 * limit the range to look at early to
1654 * the immediate neighbors of the last
1655 * index we looked at. */
1657 if (last_index > 0) {
1658 uint64_t x = last_index - 1;
1660 p = le64toh(array->entry_array.items[x]);
1664 r = test_object(f, p, needle);
1668 if (r == TEST_FOUND)
1669 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1671 if (r == TEST_RIGHT)
1677 if (last_index < right) {
1678 uint64_t y = last_index + 1;
1680 p = le64toh(array->entry_array.items[y]);
1684 r = test_object(f, p, needle);
1688 if (r == TEST_FOUND)
1689 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1691 if (r == TEST_RIGHT)
1699 if (left == right) {
1700 if (direction == DIRECTION_UP)
1701 subtract_one = true;
1707 assert(left < right);
1708 i = (left + right) / 2;
1710 p = le64toh(array->entry_array.items[i]);
1714 r = test_object(f, p, needle);
1718 if (r == TEST_FOUND)
1719 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1721 if (r == TEST_RIGHT)
1729 if (direction == DIRECTION_UP) {
1731 subtract_one = true;
1742 last_index = (uint64_t) -1;
1743 a = le64toh(array->entry_array.next_entry_array_offset);
1749 if (subtract_one && t == 0 && i == 0)
1752 /* Let's cache this item for the next invocation */
1753 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1755 if (subtract_one && i == 0)
1757 else if (subtract_one)
1758 p = le64toh(array->entry_array.items[i-1]);
1760 p = le64toh(array->entry_array.items[i]);
1762 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1773 *idx = t + i + (subtract_one ? -1 : 0);
1778 static int generic_array_bisect_plus_one(
1784 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1785 direction_t direction,
1791 bool step_back = false;
1795 assert(test_object);
1800 /* This bisects the array in object 'first', but first checks the entry at 'extra' */
1802 r = test_object(f, extra, needle);
1806 if (r == TEST_FOUND)
1807 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1809 /* If we are looking with DIRECTION_UP then we need to first
1810 see if in the actual array there is a matching entry, and
1811 return the last one of that. But if there isn't any we need
1812 to return this one. Hence remember this, and return it later if needed. */
1815 step_back = direction == DIRECTION_UP;
1817 if (r == TEST_RIGHT) {
1818 if (direction == DIRECTION_DOWN)
1824 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1826 if (r == 0 && step_back)
1835 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1851 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1857 else if (p < needle)
/* Bisection callback keyed on the entry sequence number: maps the entry
 * object at offset p and compares its seqnum, converted from the
 * little-endian on-disk representation, with needle.
 * NOTE(review): the value returned by each branch and the handling of a
 * failing journal_file_move_to_object() are on lines elided here. */
1863 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1870 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1874 if (le64toh(o->entry.seqnum) == needle)
1876 else if (le64toh(o->entry.seqnum) < needle)
/* Bisects the file-wide entry array described by the header
 * (entry_array_offset, n_entries) and returns whatever
 * generic_array_bisect() returns.
 * NOTE(review): most parameters and the remaining arguments of the call are
 * elided in this excerpt; judging by the function name the comparison
 * callback is presumably test_object_seqnum -- confirm. */
1882 int journal_file_move_to_entry_by_seqnum(
1885 direction_t direction,
1889 return generic_array_bisect(f,
1890 le64toh(f->header->entry_array_offset),
1891 le64toh(f->header->n_entries),
/* Bisection callback keyed on the realtime timestamp: maps the entry object
 * at offset p and compares its realtime field, converted from the
 * little-endian on-disk representation, with needle.
 * NOTE(review): the value returned by each branch and the handling of a
 * failing journal_file_move_to_object() are on lines elided here. */
1898 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1905 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1909 if (le64toh(o->entry.realtime) == needle)
1911 else if (le64toh(o->entry.realtime) < needle)
/* Bisects the file-wide entry array described by the header
 * (entry_array_offset, n_entries) using test_object_realtime as the
 * comparison callback, and returns whatever generic_array_bisect() returns.
 * NOTE(review): most parameters and the remaining arguments of the call are
 * elided in this excerpt. */
1917 int journal_file_move_to_entry_by_realtime(
1920 direction_t direction,
1924 return generic_array_bisect(f,
1925 le64toh(f->header->entry_array_offset),
1926 le64toh(f->header->n_entries),
1928 test_object_realtime,
/* Bisection callback keyed on the monotonic timestamp: maps the entry object
 * at offset p and compares its monotonic field, converted from the
 * little-endian on-disk representation, with needle.
 * NOTE(review): the value returned by each branch and the handling of a
 * failing journal_file_move_to_object() are on lines elided here. */
1933 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1940 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1944 if (le64toh(o->entry.monotonic) == needle)
1946 else if (le64toh(o->entry.monotonic) < needle)
/* Looks up the data object whose payload is "_BOOT_ID=" followed by the
 * string form of boot_id, by building that payload on the stack and handing
 * it to journal_file_find_data_object() together with o and b.
 * NOTE(review): the parameter list is elided in this excerpt. */
1952 static inline int find_data_object_by_boot_id(
/* Room for the prefix without its NUL, plus 32 + 1 bytes for the formatted
 * ID (presumably 32 hex digits and the terminating NUL -- confirm against
 * sd_id128_to_string()). */
1957 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
/* 9 is the length of the "_BOOT_ID=" prefix, so the ID is written right
 * behind it. */
1959 sd_id128_to_string(boot_id, t + 9);
/* sizeof(t) - 1 passes the buffer length without its last byte. */
1960 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
/* Seeks within the entries of one boot by monotonic timestamp: resolves the
 * data object for boot_id with find_data_object_by_boot_id() (the data
 * object's offset is not requested, hence the NULL), then bisects that
 * object's entry list (entry_offset, entry_array_offset, n_entries) with
 * test_object_monotonic via generic_array_bisect_plus_one().
 * NOTE(review): most parameters, the error handling after the lookup and the
 * remaining arguments of the bisection call are elided in this excerpt. */
1963 int journal_file_move_to_entry_by_monotonic(
1967 direction_t direction,
1976 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1982 return generic_array_bisect_plus_one(f,
1983 le64toh(o->data.entry_offset),
1984 le64toh(o->data.entry_array_offset),
1985 le64toh(o->data.n_entries),
1987 test_object_monotonic,
/* Puts the file's saved read position back to LOCATION_HEAD and clears the
 * cached description of the current entry (offset, seqnum, realtime,
 * monotonic, boot ID and xor hash). The visible lines do not touch
 * f->last_direction. */
1992 void journal_file_reset_location(JournalFile *f) {
1993 f->location_type = LOCATION_HEAD;
1994 f->current_offset = 0;
1995 f->current_seqnum = 0;
1996 f->current_realtime = 0;
1997 f->current_monotonic = 0;
1998 zero(f->current_boot_id);
1999 f->current_xor_hash = 0;
/* Records the entry object o, located at 'offset', as the file's current
 * position: stores the direction used to reach it, switches location_type
 * to LOCATION_SEEK and caches the entry's identifying fields so they can be
 * consulted later without mapping the object again. */
2002 void journal_file_save_location(JournalFile *f, direction_t direction, Object *o, uint64_t offset) {
2003 f->last_direction = direction;
2004 f->location_type = LOCATION_SEEK;
2005 f->current_offset = offset;
/* The 64-bit entry fields are converted from little-endian on-disk order
 * to host order; the boot ID is copied as-is. */
2006 f->current_seqnum = le64toh(o->entry.seqnum);
2007 f->current_realtime = le64toh(o->entry.realtime);
2008 f->current_monotonic = le64toh(o->entry.monotonic);
2009 f->current_boot_id = o->entry.boot_id;
2010 f->current_xor_hash = le64toh(o->entry.xor_hash);
/* Orders the saved current positions of two journal files. Both files must
 * be at LOCATION_SEEK (asserted).
 *
 * Order of the comparisons, as far as visible:
 *  1. boot ID, monotonic time, realtime and xor hash all equal: the entries
 *     are treated as identical;
 *  2. both files share the same seqnum_id: compare sequence numbers;
 *  3. both positions share the same boot ID: compare monotonic time;
 *  4. compare realtime;
 *  5. compare the xor hash of the contents.
 *
 * NOTE(review): the values returned by each branch are on elided lines, and
 * the comment inside the seqnum branch has lost its closing line in this
 * excerpt. */
2013 int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2016 assert(af->location_type == LOCATION_SEEK);
2017 assert(bf->location_type == LOCATION_SEEK);
2019 /* If contents and timestamps match, these entries are
2020 * identical, even if the seqnum does not match */
2021 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2022 af->current_monotonic == bf->current_monotonic &&
2023 af->current_realtime == bf->current_realtime &&
2024 af->current_xor_hash == bf->current_xor_hash)
2027 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2029 /* If this is from the same seqnum source, compare
2031 if (af->current_seqnum < bf->current_seqnum)
2033 if (af->current_seqnum > bf->current_seqnum)
2036 /* Wow! This is weird, different data but the same
2037 * seqnums? Something is borked, but let's make the
2038 * best of it and compare by time. */
2041 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2043 /* If the boot id matches, compare monotonic time */
2044 if (af->current_monotonic < bf->current_monotonic)
2046 if (af->current_monotonic > bf->current_monotonic)
2050 /* Otherwise, compare UTC time */
2051 if (af->current_realtime < bf->current_realtime)
2053 if (af->current_realtime > bf->current_realtime)
2056 /* Finally, compare by contents */
2057 if (af->current_xor_hash < bf->current_xor_hash)
2059 if (af->current_xor_hash > bf->current_xor_hash)
/* Steps through the file-wide entry array in the given direction.
 *
 * What the visible lines do: read the entry count from the header, pick
 * index 0 (DIRECTION_DOWN) or n - 1 (otherwise) as a starting index, bisect
 * the entry array with generic_array_bisect(), adjust per direction, then
 * fetch the chosen entry with generic_array_get(). A direction-dependent
 * comparison of ofs against p guards the result and logs "entry array
 * corrupted" at debug level when it trips.
 *
 * NOTE(review): the conditions selecting between the starting index and the
 * bisection, the index adjustment and all return statements are on lines
 * elided in this excerpt. */
2065 int journal_file_next_entry(
2068 direction_t direction,
2069 Object **ret, uint64_t *offset) {
2076 n = le64toh(f->header->n_entries);
2081 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2083 r = generic_array_bisect(f,
2084 le64toh(f->header->entry_array_offset),
2085 le64toh(f->header->n_entries),
2094 if (direction == DIRECTION_DOWN) {
2107 /* And jump to it */
2108 r = generic_array_get(f,
2109 le64toh(f->header->entry_array_offset),
2116 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2117 log_debug("%s: entry array corrupted at entry %"PRIu64,
/* Steps through the entries that reference the data object at data_offset,
 * in the given direction.
 *
 * What the visible lines do: assert that an entry object o is only passed
 * together with a non-zero offset p, map the data object, read its entry
 * count, pick index 0 (DIRECTION_DOWN) or n - 1 (otherwise) as a starting
 * index, check that o is an OBJECT_ENTRY, bisect the data object's entry
 * list with generic_array_bisect_plus_one(), adjust per direction, and
 * return the result of generic_array_get_plus_one().
 *
 * NOTE(review): the conditions selecting between these paths, the index
 * adjustment and the error returns are on lines elided in this excerpt. */
2128 int journal_file_next_entry_for_data(
2130 Object *o, uint64_t p,
2131 uint64_t data_offset,
2132 direction_t direction,
2133 Object **ret, uint64_t *offset) {
2140 assert(p > 0 || !o);
2142 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2146 n = le64toh(d->data.n_entries);
2151 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2153 if (o->object.type != OBJECT_ENTRY)
2156 r = generic_array_bisect_plus_one(f,
2157 le64toh(d->data.entry_offset),
2158 le64toh(d->data.entry_array_offset),
2159 le64toh(d->data.n_entries),
2169 if (direction == DIRECTION_DOWN) {
2183 return generic_array_get_plus_one(f,
2184 le64toh(d->data.entry_offset),
2185 le64toh(d->data.entry_array_offset),
/* Maps the data object at data_offset and bisects its entry list
 * (entry_offset, entry_array_offset, n_entries) with
 * generic_array_bisect_plus_one(), returning that call's result.
 * NOTE(review): the needle parameter, the comparison callback passed to the
 * bisection (presumably test_object_offset, judging by the function name --
 * confirm) and the error check after the mapping are elided here. */
2190 int journal_file_move_to_entry_by_offset_for_data(
2192 uint64_t data_offset,
2194 direction_t direction,
2195 Object **ret, uint64_t *offset) {
2202 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2206 return generic_array_bisect_plus_one(f,
2207 le64toh(d->data.entry_offset),
2208 le64toh(d->data.entry_array_offset),
2209 le64toh(d->data.n_entries),
/* Seeks by monotonic timestamp among the entries that reference the data
 * object at data_offset.
 *
 * What the visible lines do: resolve the data object for boot_id (its
 * offset is kept in b), bisect that object's entry list with
 * test_object_monotonic, then alternate between bisecting the entry list of
 * the data object at data_offset and the entry list of the boot-ID data
 * object at b. Both data objects are mapped again by offset right before
 * each of these bisections.
 *
 * NOTE(review): the loop construct, its termination condition, the
 * arguments that carry the candidate offset between the bisections and all
 * error returns are on lines elided in this excerpt. */
2216 int journal_file_move_to_entry_by_monotonic_for_data(
2218 uint64_t data_offset,
2221 direction_t direction,
2222 Object **ret, uint64_t *offset) {
2230 /* First, seek by time */
2231 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2237 r = generic_array_bisect_plus_one(f,
2238 le64toh(o->data.entry_offset),
2239 le64toh(o->data.entry_array_offset),
2240 le64toh(o->data.n_entries),
2242 test_object_monotonic,
2248 /* And now, continue seeking until we find an entry that
2249 * exists in both bisection arrays */
2255 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2259 r = generic_array_bisect_plus_one(f,
2260 le64toh(d->data.entry_offset),
2261 le64toh(d->data.entry_array_offset),
2262 le64toh(d->data.n_entries),
2270 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2274 r = generic_array_bisect_plus_one(f,
2275 le64toh(o->data.entry_offset),
2276 le64toh(o->data.entry_array_offset),
2277 le64toh(o->data.n_entries),
/* Maps the data object at data_offset and bisects its entry list
 * (entry_offset, entry_array_offset, n_entries) with
 * generic_array_bisect_plus_one(), returning that call's result.
 * NOTE(review): the needle parameter, the comparison callback passed to the
 * bisection (presumably test_object_seqnum, judging by the function name --
 * confirm) and the error check after the mapping are elided here. */
2299 int journal_file_move_to_entry_by_seqnum_for_data(
2301 uint64_t data_offset,
2303 direction_t direction,
2304 Object **ret, uint64_t *offset) {
2311 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2315 return generic_array_bisect_plus_one(f,
2316 le64toh(d->data.entry_offset),
2317 le64toh(d->data.entry_array_offset),
2318 le64toh(d->data.n_entries),
/* Maps the data object at data_offset and bisects its entry list
 * (entry_offset, entry_array_offset, n_entries) with test_object_realtime
 * via generic_array_bisect_plus_one(), returning that call's result.
 * NOTE(review): the needle parameter and the error check after the mapping
 * are elided in this excerpt. */
2325 int journal_file_move_to_entry_by_realtime_for_data(
2327 uint64_t data_offset,
2329 direction_t direction,
2330 Object **ret, uint64_t *offset) {
2337 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2341 return generic_array_bisect_plus_one(f,
2342 le64toh(d->data.entry_offset),
2343 le64toh(d->data.entry_array_offset),
2344 le64toh(d->data.n_entries),
2346 test_object_realtime,
/* Debugging aid: prints the file header and then one "Type: ..." line per
 * object to stdout.
 *
 * The walk starts at the offset given by the header's header_size field.
 * Each object is mapped with OBJECT_UNUSED as the expected type
 * (presumably meaning "accept any type" -- confirm against
 * journal_file_move_to_object()). Entry and tag objects get extra fields
 * printed; unknown types are printed numerically. If any compression flag
 * is set, a "Flags:" line follows. The offset then advances by the object
 * size rounded up with ALIGN64(). The tail_object_offset comparison marks
 * the last object, and "File corrupt" is logged on the failure path.
 *
 * NOTE(review): the loop construct, the case labels for the first few
 * object types and the jumps to the failure path are on elided lines. */
2351 void journal_file_dump(JournalFile *f) {
2358 journal_file_print_header(f);
2360 p = le64toh(f->header->header_size);
2362 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
2366 switch (o->object.type) {
2369 printf("Type: OBJECT_UNUSED\n");
2373 printf("Type: OBJECT_DATA\n");
2377 printf("Type: OBJECT_FIELD\n");
2381 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2382 le64toh(o->entry.seqnum),
2383 le64toh(o->entry.monotonic),
2384 le64toh(o->entry.realtime));
2387 case OBJECT_FIELD_HASH_TABLE:
2388 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2391 case OBJECT_DATA_HASH_TABLE:
2392 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2395 case OBJECT_ENTRY_ARRAY:
2396 printf("Type: OBJECT_ENTRY_ARRAY\n");
2400 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2401 le64toh(o->tag.seqnum),
2402 le64toh(o->tag.epoch));
2406 printf("Type: unknown (%u)\n", o->object.type);
2410 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2411 printf("Flags: %s\n",
2412 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
2414 if (p == le64toh(f->header->tail_object_offset))
2417 p = p + ALIGN64(le64toh(o->object.size));
2422 log_error("File corrupt");
/* Wrapper around format_timestamp() that formats t into buf (of size l).
 * NOTE(review): everything after the format_timestamp() call is elided in
 * this excerpt. Judging by the name and the use of the result as a "%s"
 * argument in journal_file_print_header(), it presumably substitutes a
 * fallback string when format_timestamp() yields NULL -- confirm. */
2425 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2428 x = format_timestamp(buf, l, t);
/* Prints a human-readable summary of the journal file header to stdout.
 *
 * The four 128-bit IDs are rendered into 33-byte buffers, the state and
 * flag fields are turned into keywords (with " ???" appended when bits
 * outside HEADER_COMPATIBLE_ANY / HEADER_INCOMPATIBLE_ANY are set), and the
 * hash table sizes are shown as item counts by dividing the byte size by
 * sizeof(HashItem). Counters that were added to the header later are only
 * printed when JOURNAL_HEADER_CONTAINS() says the header has them. Finally
 * the on-disk usage is printed if fstat() on the file descriptor succeeds.
 *
 * NOTE(review): a few format-string lines and the first printf argument are
 * elided in this excerpt. */
2434 void journal_file_print_header(JournalFile *f) {
2435 char a[33], b[33], c[33], d[33];
2436 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2438 char bytes[FORMAT_BYTES_MAX];
2442 printf("File Path: %s\n"
2446 "Sequential Number ID: %s\n"
2448 "Compatible Flags:%s%s\n"
2449 "Incompatible Flags:%s%s%s\n"
2450 "Header size: %"PRIu64"\n"
2451 "Arena size: %"PRIu64"\n"
2452 "Data Hash Table Size: %"PRIu64"\n"
2453 "Field Hash Table Size: %"PRIu64"\n"
2454 "Rotate Suggested: %s\n"
2455 "Head Sequential Number: %"PRIu64"\n"
2456 "Tail Sequential Number: %"PRIu64"\n"
2457 "Head Realtime Timestamp: %s\n"
2458 "Tail Realtime Timestamp: %s\n"
2459 "Tail Monotonic Timestamp: %s\n"
2460 "Objects: %"PRIu64"\n"
2461 "Entry Objects: %"PRIu64"\n",
2463 sd_id128_to_string(f->header->file_id, a),
2464 sd_id128_to_string(f->header->machine_id, b),
2465 sd_id128_to_string(f->header->boot_id, c),
2466 sd_id128_to_string(f->header->seqnum_id, d),
2467 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2468 f->header->state == STATE_ONLINE ? "ONLINE" :
2469 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2470 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2471 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2472 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2473 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2474 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2475 le64toh(f->header->header_size),
2476 le64toh(f->header->arena_size),
2477 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2478 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2479 yes_no(journal_file_rotate_suggested(f, 0)),
2480 le64toh(f->header->head_entry_seqnum),
2481 le64toh(f->header->tail_entry_seqnum),
2482 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2483 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2484 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2485 le64toh(f->header->n_objects),
2486 le64toh(f->header->n_entries));
/* The counters below are only read when JOURNAL_HEADER_CONTAINS() reports
 * that this file's header includes them. */
2488 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2489 printf("Data Objects: %"PRIu64"\n"
2490 "Data Hash Table Fill: %.1f%%\n",
2491 le64toh(f->header->n_data),
2492 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2494 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2495 printf("Field Objects: %"PRIu64"\n"
2496 "Field Hash Table Fill: %.1f%%\n",
2497 le64toh(f->header->n_fields),
2498 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2500 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2501 printf("Tag Objects: %"PRIu64"\n",
2502 le64toh(f->header->n_tags));
2503 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2504 printf("Entry Array Objects: %"PRIu64"\n",
2505 le64toh(f->header->n_entry_arrays));
/* Disk usage is computed as st_blocks * 512 and silently skipped when
 * fstat() fails. */
2507 if (fstat(f->fd, &st) >= 0)
2508 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
/* Opens (and, for an empty writable file, initializes) a journal file and
 * hands the resulting JournalFile back through ret.
 *
 * Steps visible in this excerpt, in order:
 *  - reject access modes other than O_RDONLY / O_RDWR and file names that
 *    do not end in ".journal" or ".journal~";
 *  - allocate the JournalFile, derive prot/writable from flags and record
 *    the compression choice (LZ4 if built with HAVE_LZ4, else XZ if built
 *    with HAVE_XZ);
 *  - take a reference on the caller's mmap cache or create a new one;
 *  - duplicate the path, create the chain cache, open the file with
 *    O_CLOEXEC added, and fstat it;
 *  - for a zero-sized writable file: stamp the creation time, load FSPRG
 *    sealing state, write the initial header and mark the file as newly
 *    created;
 *  - check the size against HEADER_SIZE_MIN and map the header;
 *  - for pre-existing files: verify the header, and load the FSPRG state if
 *    writable;
 *  - set up metrics from the 'metrics' argument or from the template;
 *  - refresh the header, set up HMAC, create the hash tables and first tag
 *    for new files, map both hash tables;
 *  - check for SIGBUS on the mapping, and close the file on the failure
 *    path.
 *
 * NOTE(review): several parameters (file name, flags, mode, compress, seal),
 * every error check / goto, and the conditions guarding the sealing and
 * writable-only steps are on lines elided in this excerpt. */
2511 int journal_file_open(
2517 JournalMetrics *metrics,
2518 MMapCache *mmap_cache,
2519 JournalFile *template,
2520 JournalFile **ret) {
2522 bool newly_created = false;
2530 if ((flags & O_ACCMODE) != O_RDONLY &&
2531 (flags & O_ACCMODE) != O_RDWR)
2534 if (!endswith(fname, ".journal") &&
2535 !endswith(fname, ".journal~"))
2538 f = new0(JournalFile, 1);
2546 f->prot = prot_from_flags(flags);
2547 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2548 #if defined(HAVE_LZ4)
2549 f->compress_lz4 = compress;
2550 #elif defined(HAVE_XZ)
2551 f->compress_xz = compress;
2558 f->mmap = mmap_cache_ref(mmap_cache);
2560 f->mmap = mmap_cache_new();
2567 f->path = strdup(fname);
2573 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
2574 if (!f->chain_cache) {
2579 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2585 r = journal_file_fstat(f);
2589 if (f->last_stat.st_size == 0 && f->writable) {
2590 /* Let's attach the creation time to the journal file,
2591 * so that the vacuuming code knows the age of this
2592 * file even if the file might end up corrupted one
2593 * day... Ideally we'd just use the creation time many
2594 * file systems maintain for each file, but there is
2595 * currently no usable API to query this, hence let's
2596 * emulate this via extended attributes. If extended
2597 * attributes are not supported we'll just skip this,
2598 * and rely solely on mtime/atime/ctime of the file. */
2600 fd_setcrtime(f->fd, now(CLOCK_REALTIME));
2603 /* Try to load the FSPRG state, and if we can't, then
2604 * just don't do sealing */
2606 r = journal_file_fss_load(f);
2612 r = journal_file_init_header(f, template);
2616 r = journal_file_fstat(f);
2620 newly_created = true;
2623 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2628 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2636 if (!newly_created) {
2637 r = journal_file_verify_header(f);
2643 if (!newly_created && f->writable) {
2644 r = journal_file_fss_load(f);
2652 journal_default_metrics(metrics, f->fd);
2653 f->metrics = *metrics;
2654 } else if (template)
2655 f->metrics = template->metrics;
2657 r = journal_file_refresh_header(f);
2663 r = journal_file_hmac_setup(f);
2668 if (newly_created) {
2669 r = journal_file_setup_field_hash_table(f);
2673 r = journal_file_setup_data_hash_table(f);
2678 r = journal_file_append_first_tag(f);
2684 r = journal_file_map_field_hash_table(f);
2688 r = journal_file_map_data_hash_table(f);
2692 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2701 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2704 journal_file_close(f);
/* Rotates a writable journal file: the current file is renamed to an
 * archive name and a fresh file is opened under the original path.
 *
 * What the visible lines do:
 *  - refuse files that are not writable or whose path does not end in
 *    ".journal";
 *  - build the archive name from the old path with its last 8 characters
 *    (the length of ".journal") cut off, followed by "@", the seqnum ID,
 *    the head entry seqnum and the head entry realtime in hex, and
 *    ".journal";
 *  - rename the file, tolerating ENOENT;
 *  - mark the old header STATE_ARCHIVED;
 *  - open the new file with the old file's flags, mode and mmap cache,
 *    using the old file as template, then close the old file.
 *
 * NOTE(review): the assignment of old_file, the error returns, and the
 * final hand-over of new_file through *f are on lines elided in this
 * excerpt. The comment before the rename has lost its closing line. */
2709 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2710 _cleanup_free_ char *p = NULL;
2712 JournalFile *old_file, *new_file = NULL;
2720 if (!old_file->writable)
2723 if (!endswith(old_file->path, ".journal"))
2726 l = strlen(old_file->path);
2727 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2728 (int) l - 8, old_file->path,
2729 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2730 le64toh((*f)->header->head_entry_seqnum),
2731 le64toh((*f)->header->head_entry_realtime));
2735 /* Try to rename the file to the archived version. If the file
2736 * already was deleted, we'll get ENOENT, let's ignore that
2738 r = rename(old_file->path, p);
2739 if (r < 0 && errno != ENOENT)
2742 old_file->header->state = STATE_ARCHIVED;
2744 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2745 journal_file_close(old_file);
/* Like journal_file_open(), but if the first attempt fails with one of the
 * error codes listed below, the broken file is renamed out of the way (to a
 * name ending in ".journal~" that embeds the current realtime clock) and the
 * open is retried exactly once.
 *
 * The rename-and-retry path is only taken for requests that are not
 * read-only, that carry O_CREAT, and whose file name ends in ".journal";
 * the checks for these conditions are visible, their outcomes are not.
 *
 * NOTE(review): several parameters, the returns taken by the early checks,
 * and part of the asprintf() argument list are on lines elided in this
 * excerpt. */
2751 int journal_file_open_reliably(
2757 JournalMetrics *metrics,
2758 MMapCache *mmap_cache,
2759 JournalFile *template,
2760 JournalFile **ret) {
2764 _cleanup_free_ char *p = NULL;
2766 r = journal_file_open(fname, flags, mode, compress, seal,
2767 metrics, mmap_cache, template, ret);
2768 if (r != -EBADMSG && /* corrupted */
2769 r != -ENODATA && /* truncated */
2770 r != -EHOSTDOWN && /* other machine */
2771 r != -EPROTONOSUPPORT && /* incompatible feature */
2772 r != -EBUSY && /* unclean shutdown */
2773 r != -ESHUTDOWN && /* already archived */
2774 r != -EIO /* IO error, including SIGBUS on mmap */)
2777 if ((flags & O_ACCMODE) == O_RDONLY)
2780 if (!(flags & O_CREAT))
2783 if (!endswith(fname, ".journal"))
2786 /* The file is corrupted. Rotate it away and try it again (but only once) */
2789 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2791 (unsigned long long) now(CLOCK_REALTIME),
2795 r = rename(fname, p);
2799 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2801 return journal_file_open(fname, flags, mode, compress, seal,
2802 metrics, mmap_cache, template, ret);
/* Copies the entry object o, located at offset p in 'from', into 'to'.
 *
 * The entry's monotonic and realtime timestamps are carried over. For each
 * item of the entry, the referenced data object is mapped from 'from', its
 * hash is checked against the hash stored in the entry item, the payload is
 * decompressed if the object carries a compression flag, and the payload is
 * appended to 'to' with journal_file_append_data(). The hashes of the
 * appended data objects are XORed together and the new offsets/hashes are
 * collected in a stack-allocated item array, which is finally passed to
 * journal_file_append_entry_internal() together with seqnum, ret and offset.
 * Afterwards the mapping of 'to' is checked for SIGBUS.
 *
 * NOTE(review): error returns, the declaration/assignment of t, and the
 * preprocessor else/endif lines around the decompression are on lines
 * elided in this excerpt. */
2805 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2807 uint64_t q, xor_hash = 0;
2820 ts.monotonic = le64toh(o->entry.monotonic);
2821 ts.realtime = le64toh(o->entry.realtime);
2823 n = journal_file_entry_n_items(o);
2824 /* alloca() can't take 0, hence let's allocate at least one */
2825 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2827 for (i = 0; i < n; i++) {
2834 q = le64toh(o->entry.items[i].object_offset);
2835 le_hash = o->entry.items[i].hash;
/* From here on o points at the data object, no longer at the entry. */
2837 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
/* Both hashes are compared in their stored (little-endian) form. */
2841 if (le_hash != o->data.hash)
/* Payload length: object size minus the fixed data object header. */
2844 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2847 /* We hit the limit on 32bit machines */
2848 if ((uint64_t) t != l)
2851 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2852 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2855 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2856 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2860 data = from->compress_buffer;
2863 return -EPROTONOSUPPORT;
2866 data = o->data.payload;
2868 r = journal_file_append_data(to, data, l, &u, &h);
2872 xor_hash ^= le64toh(u->data.hash);
2873 items[i].object_offset = htole64(h);
2874 items[i].hash = u->data.hash;
/* Map the source entry again: o was reused for the data object above and
 * the next iteration reads o->entry.items[]. */
2876 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2881 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2883 if (mmap_cache_got_sigbus(to->mmap, to->fd))
/* Fills in and sanitizes the size limits in m, in place, using the size of
 * the file system that fd lives on. A field set to (uint64_t) -1 is treated
 * as "not configured" and gets a derived default; configured values are
 * page-aligned and raised to the minimums below.
 *
 *  max_use   unset: 10% of the file system size, clamped to
 *            [DEFAULT_MAX_USE_LOWER, DEFAULT_MAX_USE_UPPER], with
 *            DEFAULT_MAX_USE_LOWER as fallback; set: page-aligned and at
 *            least 2 * JOURNAL_FILE_SIZE_MIN.
 *  max_size  unset: max_use / 8, capped at DEFAULT_MAX_SIZE_UPPER; set:
 *            page-aligned. At least JOURNAL_FILE_SIZE_MIN, and max_use is
 *            raised so that it holds at least two files of max_size.
 *  min_size  unset: JOURNAL_FILE_SIZE_MIN; set: page-aligned and at least
 *            JOURNAL_FILE_SIZE_MIN. max_size is raised to min_size if
 *            smaller.
 *  keep_free unset: 15% of the file system size, capped at
 *            DEFAULT_KEEP_FREE_UPPER, with DEFAULT_KEEP_FREE as fallback.
 *
 * The resulting values are logged at debug level.
 *
 * NOTE(review): the if/else lines that select between the derived value and
 * the fallback (presumably "fs_size > 0" -- confirm) and some braces are
 * elided in this excerpt, so the exact nesting described above should be
 * checked against the complete file. */
2889 void journal_default_metrics(JournalMetrics *m, int fd) {
2890 uint64_t fs_size = 0;
2892 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
/* fs_size stays 0 when fstatvfs() fails. */
2897 if (fstatvfs(fd, &ss) >= 0)
2898 fs_size = ss.f_frsize * ss.f_blocks;
2900 if (m->max_use == (uint64_t) -1) {
2903 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2905 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2906 m->max_use = DEFAULT_MAX_USE_UPPER;
2908 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2909 m->max_use = DEFAULT_MAX_USE_LOWER;
2911 m->max_use = DEFAULT_MAX_USE_LOWER;
2913 m->max_use = PAGE_ALIGN(m->max_use);
2915 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2916 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2919 if (m->max_size == (uint64_t) -1) {
2920 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2922 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2923 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2925 m->max_size = PAGE_ALIGN(m->max_size);
2927 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2928 m->max_size = JOURNAL_FILE_SIZE_MIN;
/* Keep room for at least two files of max_size within max_use. */
2930 if (m->max_size*2 > m->max_use)
2931 m->max_use = m->max_size*2;
2933 if (m->min_size == (uint64_t) -1)
2934 m->min_size = JOURNAL_FILE_SIZE_MIN;
2936 m->min_size = PAGE_ALIGN(m->min_size);
2938 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2939 m->min_size = JOURNAL_FILE_SIZE_MIN;
2941 if (m->min_size > m->max_size)
2942 m->max_size = m->min_size;
2945 if (m->keep_free == (uint64_t) -1) {
2948 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2950 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2951 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2954 m->keep_free = DEFAULT_KEEP_FREE;
2957 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2958 format_bytes(a, sizeof(a), m->max_use),
2959 format_bytes(b, sizeof(b), m->max_size),
2960 format_bytes(c, sizeof(c), m->min_size),
2961 format_bytes(d, sizeof(d), m->keep_free));
/* Reports the realtime timestamps of the first and last entry, as recorded
 * in the file header, through *from and *to.
 * A header timestamp of 0 is special-cased for each of the two values; the
 * raw field can be compared with 0 without byte-swapping because zero looks
 * the same in either byte order.
 * NOTE(review): the NULL checks on from/to (if any) and all return
 * statements, including what is returned for a zero timestamp, are on
 * lines elided in this excerpt. */
2964 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2969 if (f->header->head_entry_realtime == 0)
2972 *from = le64toh(f->header->head_entry_realtime);
2976 if (f->header->tail_entry_realtime == 0)
2979 *to = le64toh(f->header->tail_entry_realtime);
/* Reports the monotonic timestamps of the first and last entry of the boot
 * identified by boot_id through *from and *to.
 *
 * The boot's data object is resolved with find_data_object_by_boot_id()
 * (its offset is kept in p). *from is read from the entry at the data
 * object's entry_offset; *to is read from the entry at index n_entries - 1
 * of the data object's entry list, fetched with
 * generic_array_get_plus_one().
 *
 * NOTE(review): error returns, the result for a data object without
 * entries, and any NULL checks on from/to are on lines elided in this
 * excerpt. */
2985 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2993 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2997 if (le64toh(o->data.n_entries) <= 0)
/* o is reused for the entry object from here on. */
3001 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3005 *from = le64toh(o->entry.monotonic);
/* Map the data object again, since o currently points at an entry. */
3009 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3013 r = generic_array_get_plus_one(f,
3014 le64toh(o->data.entry_offset),
3015 le64toh(o->data.entry_array_offset),
3016 le64toh(o->data.n_entries)-1,
3021 *to = le64toh(o->entry.monotonic);
3027 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
3030 /* If we gained new header fields we gained new features,
3031 * hence suggest a rotation */
3032 if (le64toh(f->header->header_size) < sizeof(Header)) {
3033 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
3037 /* Let's check if the hash tables grew over a certain fill
3038 * level (75%, borrowing this value from Java's hash table
3039 * implementation), and if so suggest a rotation. To calculate
3040 * the fill level we need the n_data field, which only exists
3041 * in newer versions. */
3043 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
3044 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3045 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
3047 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
3048 le64toh(f->header->n_data),
3049 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3050 (unsigned long long) f->last_stat.st_size,
3051 f->last_stat.st_size / le64toh(f->header->n_data));
3055 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
3056 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
3057 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
3059 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
3060 le64toh(f->header->n_fields),
3061 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
3065 /* Are the data objects properly indexed by field objects? */
3066 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3067 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3068 le64toh(f->header->n_data) > 0 &&
3069 le64toh(f->header->n_fields) == 0)
3072 if (max_file_usec > 0) {
3075 h = le64toh(f->header->head_entry_realtime);
3076 t = now(CLOCK_REALTIME);
3078 if (h > 0 && t > h + max_file_usec)