1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
29 #include <sys/xattr.h>
31 #include "journal-def.h"
32 #include "journal-file.h"
33 #include "journal-authenticate.h"
/* Default sizes of the data and field hash tables, expressed as a fixed
 * number of HashItem slots multiplied by the slot size. */
38 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
39 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
/* Data payloads of at least this size are candidates for compression */
41 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
43 /* This is the minimum journal file size */
44 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
46 /* These are the lower and upper bounds if we deduce the max_use value
47 * from the file system size */
48 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
49 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
51 /* This is the upper bound if we deduce max_size from max_use */
52 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
54 /* This is the upper bound if we deduce the keep_free value from the file system size */
56 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
58 /* This is the keep_free value when we can't determine the file system size */
60 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
62 /* n_data was the first entry we added after the initial file format design */
63 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
65 /* How many entries to keep in the entry array chain cache at max */
66 #define CHAIN_CACHE_MAX 20
68 /* How much to increase the journal file size at once each time we allocate something new. */
69 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
/* Switches the header state of the journal file to STATE_ONLINE.
 * Both a valid fd and a mapped header are checked first; the current
 * state is dispatched on before the new state is written. */
71 static int journal_file_set_online(JournalFile *f) {
77 if (!(f->fd >= 0 && f->header))
80 switch(f->header->state) {
85 f->header->state = STATE_ONLINE;
/* Switches the header state of the journal file to STATE_OFFLINE.
 * Both a valid fd and a mapped header are checked first, and the
 * current state is compared against STATE_ONLINE before the change. */
94 int journal_file_set_offline(JournalFile *f) {
100 if (!(f->fd >= 0 && f->header))
103 if (f->header->state != STATE_ONLINE)
108 f->header->state = STATE_OFFLINE;
/* Tears down a JournalFile: appends a final seal tag when sealing is
 * enabled on a writable file, calls mmap_cache_close_fd() for the fd,
 * marks the file offline and then releases the header mapping, the
 * mmap cache reference, the chain cache, the compression buffer, the
 * sealing state and the HMAC context. */
115 void journal_file_close(JournalFile *f) {
119 /* Write the final tag */
120 if (f->seal && f->writable)
121 journal_file_append_tag(f);
124 /* Sync everything to disk, before we mark the file offline */
125 if (f->mmap && f->fd >= 0)
126 mmap_cache_close_fd(f->mmap, f->fd);
128 journal_file_set_offline(f);
131 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
137 mmap_cache_unref(f->mmap);
139 hashmap_free_free(f->chain_cache);
/* The compression buffer is only released when XZ or LZ4 support is compiled in */
141 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
142 free(f->compress_buffer);
/* The sealing state is either unmapped (fss_file) or freed (fsprg_state) */
147 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
148 else if (f->fsprg_state)
149 free(f->fsprg_state);
154 gcry_md_close(f->hmac);
/* Builds a fresh file header in a local Header struct and writes it to
 * the start of the file with pwrite(). The header carries the
 * signature, its own 64-bit-aligned size, the compression and sealing
 * flags taken from f, and a random file ID. The sequence number ID and
 * the tail sequence number are copied from the template header, or the
 * sequence number ID is set to the new file ID. */
160 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
167 memcpy(h.signature, HEADER_SIGNATURE, 8);
168 h.header_size = htole64(ALIGN64(sizeof(h)));
/* NOTE(review): |= relies on h having been zero-initialized before this point - confirm */
170 h.incompatible_flags |= htole32(
171 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
172 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
174 h.compatible_flags = htole32(
175 f->seal * HEADER_COMPATIBLE_SEALED);
177 r = sd_id128_randomize(&h.file_id);
/* Continue the sequence numbering of the template file ... */
182 h.seqnum_id = template->header->seqnum_id;
183 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
/* ... or start a new sequence identified by this file's own ID */
185 h.seqnum_id = h.file_id;
187 k = pwrite(f->fd, &h, sizeof(h), 0);
/* Updates the header of a file we are about to write to: stores the
 * current machine ID and boot ID in it and marks the file online. If
 * the boot ID already stored in the header equals the current one, the
 * tail monotonic timestamp is flagged as valid. */
197 static int journal_file_refresh_header(JournalFile *f) {
203 r = sd_id128_get_machine(&f->header->machine_id);
207 r = sd_id128_get_boot(&boot_id);
211 if (sd_id128_equal(boot_id, f->header->boot_id))
212 f->tail_entry_monotonic_valid = true;
214 f->header->boot_id = boot_id;
216 journal_file_set_online(f);
218 /* Sync the online state to disk */
/* Validates the mapped header of an existing journal file. Checks the
 * signature, rejects unsupported incompatible flags (and, for writable
 * files, unsupported compatible flags) with -EPROTONOSUPPORT, and
 * sanity-checks state, header size, arena bounds and object offsets.
 * The machine ID and the state are examined as well. Finally the
 * compression and sealing settings of f are taken over from the
 * header flags. */
224 static int journal_file_verify_header(JournalFile *f) {
229 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
232 /* In both read and write mode we refuse to open files with
233 * incompatible flags we don't know */
234 flags = le32toh(f->header->incompatible_flags);
235 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
236 if (flags & ~HEADER_INCOMPATIBLE_ANY)
237 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
238 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
239 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
241 log_debug("Journal file %s uses incompatible flags %"PRIx32
242 " disabled at compilation time.", f->path, flags);
243 return -EPROTONOSUPPORT;
246 /* When open for writing we refuse to open files with
247 * compatible flags, too */
248 flags = le32toh(f->header->compatible_flags);
249 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
250 if (flags & ~HEADER_COMPATIBLE_ANY)
251 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
252 f->path, flags & ~HEADER_COMPATIBLE_ANY);
253 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
255 log_debug("Journal file %s uses compatible flags %"PRIx32
256 " disabled at compilation time.", f->path, flags);
257 return -EPROTONOSUPPORT;
/* The state value must be one of the known states */
260 if (f->header->state >= _STATE_MAX)
263 /* The first addition was n_data, so check that we are at least this large */
264 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
/* A sealed file must have a header that includes n_entry_arrays */
267 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
/* Header plus arena must fit into the file size reported by the last fstat() */
270 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
/* The tail object must lie within header plus arena */
273 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
/* All object offsets stored in the header must pass VALID64() */
276 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
277 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
278 !VALID64(le64toh(f->header->tail_object_offset)) ||
279 !VALID64(le64toh(f->header->entry_array_offset)))
/* Compare the machine ID of the file with the local one.
 * NOTE(review): presumably only done for writable files - confirm */
284 sd_id128_t machine_id;
287 r = sd_id128_get_machine(&machine_id);
291 if (!sd_id128_equal(machine_id, f->header->machine_id))
294 state = f->header->state;
296 if (state == STATE_ONLINE) {
297 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
299 } else if (state == STATE_ARCHIVED)
301 else if (state != STATE_OFFLINE) {
302 log_debug("Journal file %s has unknown state %u.", f->path, state);
/* Take over compression and sealing settings from the header flags */
307 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
308 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
310 f->seal = JOURNAL_HEADER_SEALED(f->header);
/* Makes sure the file is large enough to hold `size` bytes at `offset`.
 * The required size is the page-aligned end of the requested range, but
 * at least the header size, and is compared against the current size
 * derived from header_size plus arena_size. The request is checked
 * against metrics.max_size and, once beyond metrics.min_size, against
 * the free disk space minus metrics.keep_free. The new size is rounded
 * up to a multiple of FILE_SIZE_INCREASE (capped at max_size), the
 * extra range is allocated with posix_fallocate(), last_stat is
 * refreshed and arena_size in the header is updated.
 *
 * NOTE(review): the free space is computed from statvfs f_bfree, which
 * also counts blocks reserved for privileged users; f_bavail may be the
 * value that matches what can really be allocated - confirm.
 * NOTE(review): posix_fallocate() returns a positive error number
 * rather than setting errno - confirm the result is handled that way. */
315 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
316 uint64_t old_size, new_size;
321 /* We assume that this file is not sparse, and we know that
322 * for sure, since we always call posix_fallocate()
326 le64toh(f->header->header_size) +
327 le64toh(f->header->arena_size);
329 new_size = PAGE_ALIGN(offset + size);
330 if (new_size < le64toh(f->header->header_size))
331 new_size = le64toh(f->header->header_size);
333 if (new_size <= old_size)
336 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
339 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
342 if (fstatvfs(f->fd, &svfs) >= 0) {
345 available = svfs.f_bfree * svfs.f_bsize;
347 if (available >= f->metrics.keep_free)
348 available -= f->metrics.keep_free;
352 if (new_size - old_size > available)
357 /* Increase by larger blocks at once */
358 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
359 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
360 new_size = f->metrics.max_size;
362 /* Note that the glibc fallocate() fallback is very
363 inefficient, hence we try to minimize the allocation area
365 r = posix_fallocate(f->fd, old_size, new_size - old_size);
369 if (fstat(f->fd, &f->last_stat) < 0)
372 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
/* Maps the range [offset, offset+size) of the file through the mmap
 * cache and returns the pointer in *ret. The range is first checked
 * against the cached file size; if it appears to be out of range the
 * fstat() data is refreshed once before giving up with -EADDRNOTAVAIL.
 *
 * NOTE(review): offset + size is an unsigned 64-bit addition with no
 * visible overflow check, so very large values could wrap around and
 * pass the bounds test - confirm callers never pass such values. */
377 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
384 /* Avoid SIGBUS on invalid accesses */
385 if (offset + size > (uint64_t) f->last_stat.st_size) {
386 /* Hmm, out of range? Let's refresh the fstat() data
387 * first, before we trust that check. */
389 if (fstat(f->fd, &f->last_stat) < 0 ||
390 offset + size > (uint64_t) f->last_stat.st_size)
391 return -EADDRNOTAVAIL;
394 return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
/* Returns the smallest valid size for an object of the type stored in
 * o, looked up in a per-type table. Types beyond the table, or with no
 * table entry, fall back to sizeof(ObjectHeader). */
397 static uint64_t minimum_header_size(Object *o) {
399 static const uint64_t table[] = {
400 [OBJECT_DATA] = sizeof(DataObject),
401 [OBJECT_FIELD] = sizeof(FieldObject),
402 [OBJECT_ENTRY] = sizeof(EntryObject),
403 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
404 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
405 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
406 [OBJECT_TAG] = sizeof(TagObject),
/* The table is unsigned, so "<= 0" is the same as "== 0", i.e. an unset slot */
409 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
410 return sizeof(ObjectHeader);
412 return table[o->object.type];
/* Maps the object at `offset` and returns it in *ret. Only the object
 * header is mapped at first; its size and type are validated (size at
 * least the header and at least the per-type minimum, type above
 * OBJECT_UNUSED, and matching `type` when type > 0). If the object is
 * larger than its header, the full object is mapped afterwards.
 *
 * NOTE(review): the first mapping passes type_to_context(type) as the
 * context while the second passes o->object.type unconverted - confirm
 * that this difference is intended. */
415 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
424 /* Objects may only be located at multiple of 64 bit */
425 if (!VALID64(offset))
429 r = journal_file_move_to(f, type_to_context(type), false, offset, sizeof(ObjectHeader), &t);
434 s = le64toh(o->object.size);
436 if (s < sizeof(ObjectHeader))
439 if (o->object.type <= OBJECT_UNUSED)
442 if (s < minimum_header_size(o))
445 if (type > 0 && o->object.type != type)
448 if (s > sizeof(ObjectHeader)) {
449 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
/* Picks the sequence number for the next entry: one more than the tail
 * sequence number in the header, reconciled with an external counter
 * as described below. The result is stored as the new tail sequence
 * number and, if no head sequence number has been set yet, as the head
 * sequence number too. */
460 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
465 r = le64toh(f->header->tail_entry_seqnum) + 1;
468 /* If an external seqnum counter was passed, we update
469 * both the local and the external one, and set it to
470 * the maximum of both */
478 f->header->tail_entry_seqnum = htole64(r);
480 if (f->header->head_entry_seqnum == 0)
481 f->header->head_entry_seqnum = htole64(r);
/* Appends a new object of the given type and size at the end of the
 * file. The file is set online, the position after the current tail
 * object (or right after the header) is computed, space is allocated
 * and the range is mapped. Type and size are written to the new
 * object, and tail_object_offset and n_objects in the header are
 * updated. */
486 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
493 assert(type > 0 && type < _OBJECT_TYPE_MAX);
494 assert(size >= sizeof(ObjectHeader));
498 r = journal_file_set_online(f);
502 p = le64toh(f->header->tail_object_offset);
/* Either start right after the header ... */
504 p = le64toh(f->header->header_size);
/* ... or right after the current tail object, 64-bit aligned */
506 r = journal_file_move_to_object(f, -1, p, &tail);
510 p += ALIGN64(le64toh(tail->object.size));
513 r = journal_file_allocate(f, p, size);
517 r = journal_file_move_to(f, type, false, p, size, &t);
524 o->object.type = type;
525 o->object.size = htole64(size);
527 f->header->tail_object_offset = htole64(p);
528 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
/* Creates the data hash table object of a new file. The table size is
 * derived from metrics.max_size but never smaller than
 * DEFAULT_DATA_HASH_TABLE_SIZE. The items are zeroed and the offset
 * and size of the item array are recorded in the header. */
536 static int journal_file_setup_data_hash_table(JournalFile *f) {
543 /* We estimate that we need 1 hash table entry per 768 of
544 journal file and we want to make sure we never get beyond
545 75% fill level. Calculate the hash table size for the
546 maximum file size based on these metrics. */
548 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
549 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
550 s = DEFAULT_DATA_HASH_TABLE_SIZE;
552 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
554 r = journal_file_append_object(f,
555 OBJECT_DATA_HASH_TABLE,
556 offsetof(Object, hash_table.items) + s,
561 memzero(o->hash_table.items, s);
/* The header points at the item array, not at the start of the object */
563 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
564 f->header->data_hash_table_size = htole64(s);
/* Creates the field hash table object of a new file with the fixed
 * size DEFAULT_FIELD_HASH_TABLE_SIZE, zeroes its items and records the
 * offset and size of the item array in the header. */
569 static int journal_file_setup_field_hash_table(JournalFile *f) {
576 /* We use a fixed size hash table for the fields as this
577 * number should grow very slowly only */
579 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
580 r = journal_file_append_object(f,
581 OBJECT_FIELD_HASH_TABLE,
582 offsetof(Object, hash_table.items) + s,
587 memzero(o->hash_table.items, s);
/* The header points at the item array, not at the start of the object */
589 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
590 f->header->field_hash_table_size = htole64(s);
/* Maps the data hash table, using the offset and size stored in the
 * header, and keeps the resulting pointer in f->data_hash_table. */
595 static int journal_file_map_data_hash_table(JournalFile *f) {
602 p = le64toh(f->header->data_hash_table_offset);
603 s = le64toh(f->header->data_hash_table_size);
605 r = journal_file_move_to(f,
606 OBJECT_DATA_HASH_TABLE,
613 f->data_hash_table = t;
/* Maps the field hash table, using the offset and size stored in the
 * header, and keeps the resulting pointer in f->field_hash_table. */
617 static int journal_file_map_field_hash_table(JournalFile *f) {
624 p = le64toh(f->header->field_hash_table_offset);
625 s = le64toh(f->header->field_hash_table_size);
627 r = journal_file_move_to(f,
628 OBJECT_FIELD_HASH_TABLE,
635 f->field_hash_table = t;
/* Links a freshly written field object into the field hash table. The
 * object's chain pointers are reset, the bucket is chosen by hash
 * modulo the number of items, and the object becomes either the head
 * of an empty bucket or the successor of the bucket's current tail.
 * The bucket tail and the n_fields counter (if the header has one) are
 * updated. */
639 static int journal_file_link_field(
652 if (o->object.type != OBJECT_FIELD)
655 /* This might alter the window we are looking at */
657 o->field.next_hash_offset = o->field.head_data_offset = 0;
659 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
660 p = le64toh(f->field_hash_table[h].tail_hash_offset);
/* Empty bucket: the new object becomes its head ... */
662 f->field_hash_table[h].head_hash_offset = htole64(offset);
/* ... otherwise chain it behind the previous tail object */
664 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
668 o->field.next_hash_offset = htole64(offset);
671 f->field_hash_table[h].tail_hash_offset = htole64(offset);
673 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
674 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
/* Links a freshly written data object into the data hash table. The
 * object's chain pointers and entry bookkeeping are reset, the bucket
 * is chosen by hash modulo the number of items, and the object becomes
 * either the head of an empty bucket or the successor of the bucket's
 * current tail. The bucket tail and the n_data counter (if the header
 * has one) are updated. */
679 static int journal_file_link_data(
692 if (o->object.type != OBJECT_DATA)
695 /* This might alter the window we are looking at */
697 o->data.next_hash_offset = o->data.next_field_offset = 0;
698 o->data.entry_offset = o->data.entry_array_offset = 0;
699 o->data.n_entries = 0;
701 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
702 p = le64toh(f->data_hash_table[h].tail_hash_offset);
704 /* Only entry in the hash table is easy */
705 f->data_hash_table[h].head_hash_offset = htole64(offset);
707 /* Move back to the previous data object, to patch in
710 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
714 o->data.next_hash_offset = htole64(offset);
717 f->data_hash_table[h].tail_hash_offset = htole64(offset);
719 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
720 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
/* Looks up a field object by name when the hash is already known.
 * Walks the hash bucket chain and treats an object as a match when
 * hash, object size and payload bytes are all equal. A zero-sized
 * field hash table is checked for before the bucket is computed. */
725 int journal_file_find_field_object_with_hash(
727 const void *field, uint64_t size, uint64_t hash,
728 Object **ret, uint64_t *offset) {
730 uint64_t p, osize, h;
734 assert(field && size > 0);
/* Expected total object size for a field with this payload length */
736 osize = offsetof(Object, field.payload) + size;
738 if (f->header->field_hash_table_size == 0)
741 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
742 p = le64toh(f->field_hash_table[h].head_hash_offset);
747 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
751 if (le64toh(o->field.hash) == hash &&
752 le64toh(o->object.size) == osize &&
753 memcmp(o->field.payload, field, size) == 0) {
/* No match, continue with the next object in the bucket chain */
763 p = le64toh(o->field.next_hash_offset);
/* Looks up a field object by name: hashes the name with hash64() and
 * defers to journal_file_find_field_object_with_hash(). */
769 int journal_file_find_field_object(
771 const void *field, uint64_t size,
772 Object **ret, uint64_t *offset) {
777 assert(field && size > 0);
779 hash = hash64(field, size);
781 return journal_file_find_field_object_with_hash(f,
/* Looks up a data object by payload when the hash is already known.
 * Walks the hash bucket chain, looking only at objects whose stored
 * hash matches. Compressed objects are decompressed into
 * f->compress_buffer before the comparison (or rejected with
 * -EPROTONOSUPPORT when no compression support is compiled in);
 * uncompressed objects are compared by size and payload bytes. */
786 int journal_file_find_data_object_with_hash(
788 const void *data, uint64_t size, uint64_t hash,
789 Object **ret, uint64_t *offset) {
791 uint64_t p, osize, h;
795 assert(data || size == 0);
/* Expected total object size for an uncompressed payload of this length */
797 osize = offsetof(Object, data.payload) + size;
799 if (f->header->data_hash_table_size == 0)
802 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
803 p = le64toh(f->data_hash_table[h].head_hash_offset);
808 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
812 if (le64toh(o->data.hash) != hash)
815 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
816 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
/* l is the length of the compressed payload, which must not be empty */
820 l = le64toh(o->object.size);
821 if (l <= offsetof(Object, data.payload))
824 l -= offsetof(Object, data.payload);
826 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
827 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
832 memcmp(f->compress_buffer, data, size) == 0) {
843 return -EPROTONOSUPPORT;
845 } else if (le64toh(o->object.size) == osize &&
846 memcmp(o->data.payload, data, size) == 0) {
/* No match, continue with the next object in the bucket chain */
858 p = le64toh(o->data.next_hash_offset);
/* Looks up a data object by payload: hashes the payload with hash64()
 * and defers to journal_file_find_data_object_with_hash(). */
864 int journal_file_find_data_object(
866 const void *data, uint64_t size,
867 Object **ret, uint64_t *offset) {
872 assert(data || size == 0);
874 hash = hash64(data, size);
876 return journal_file_find_data_object_with_hash(f,
/* Looks up the field object for the given field name and creates it
 * when needed. A new object is appended, filled with hash and name,
 * linked into the field hash table, re-mapped and fed into the HMAC. */
881 static int journal_file_append_field(
883 const void *field, uint64_t size,
884 Object **ret, uint64_t *offset) {
892 assert(field && size > 0);
894 hash = hash64(field, size);
/* See whether an object for this field exists already */
896 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
910 osize = offsetof(Object, field.payload) + size;
911 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
915 o->field.hash = htole64(hash);
916 memcpy(o->field.payload, field, size);
918 r = journal_file_link_field(f, o, p, hash);
922 /* The linking might have altered the window, so let's
923 * refresh our pointer */
924 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
929 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
/* Looks up the data object for the given payload and creates it when
 * needed. A new object is appended and filled with the hash and the
 * payload, which is compressed when compression applies and copied
 * verbatim otherwise. The object is linked into the data hash table
 * and re-mapped. If the payload contains a '=' that is not its first
 * byte, the part before it is used as field name: the matching field
 * object is looked up or created and the data object is put at the
 * head of the field's data list. Finally the object is fed into the
 * HMAC. */
943 static int journal_file_append_data(
945 const void *data, uint64_t size,
946 Object **ret, uint64_t *offset) {
951 int r, compression = 0;
955 assert(data || size == 0);
957 hash = hash64(data, size);
/* See whether an object for this payload exists already */
959 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
973 osize = offsetof(Object, data.payload) + size;
974 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
978 o->data.hash = htole64(hash);
980 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
/* NOTE(review): the guard above covers XZ or LZ4 builds, but only
 * f->compress_xz is tested here - confirm f->compress_lz4 is meant to
 * be ignored. */
981 if (f->compress_xz &&
982 size >= COMPRESSION_SIZE_THRESHOLD) {
985 compression = compress_blob(data, size, o->data.payload, &rsize);
/* Shrink the recorded object size to the compressed payload length */
988 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
989 o->object.flags |= compression;
991 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
992 size, rsize, object_compressed_to_string(compression));
/* Not compressed: store the payload as is */
997 if (!compression && size > 0)
998 memcpy(o->data.payload, data, size);
1000 r = journal_file_link_data(f, o, p, hash);
1004 /* The linking might have altered the window, so let's
1005 * refresh our pointer */
1006 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1013 eq = memchr(data, '=', size);
1014 if (eq && eq > data) {
1018 /* Create field object ... */
1019 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1023 /* ... and link it in. */
1024 o->data.next_field_offset = fo->field.head_data_offset;
/* NOTE(review): this stores a host value into an on-disk field, so
 * htole64() would be the matching conversion; le64toh() performs the
 * same byte swap, so the stored value is the same - confirm and align
 * with the other stores. */
1025 fo->field.head_data_offset = le64toh(p);
1029 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
/* Returns the number of EntryItem slots in an entry object, computed
 * from the object size. The object type is checked first.
 * NOTE(review): no lower bound check on the object size is done here,
 * so the subtraction relies on the size having been validated before. */
1043 uint64_t journal_file_entry_n_items(Object *o) {
1046 if (o->object.type != OBJECT_ENTRY)
1049 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
/* Returns the number of 64-bit offset slots in an entry array object,
 * computed from the object size. The object type is checked first.
 * NOTE(review): no lower bound check on the object size is done here,
 * so the subtraction relies on the size having been validated before. */
1052 uint64_t journal_file_entry_array_n_items(Object *o) {
1055 if (o->object.type != OBJECT_ENTRY_ARRAY)
1058 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
/* Returns the number of HashItem slots in a data or field hash table
 * object, computed from the object size. The object type is checked
 * first.
 * NOTE(review): no lower bound check on the object size is done here,
 * so the subtraction relies on the size having been validated before. */
1061 uint64_t journal_file_hash_table_n_items(Object *o) {
1064 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1065 o->object.type != OBJECT_FIELD_HASH_TABLE)
1068 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
/* Stores the entry offset p at index *idx of the chain of entry array
 * objects that starts at *first. The chain is walked to find the array
 * covering the index; when found, the slot is written and *idx is
 * incremented. Otherwise a new entry array object is appended, fed
 * into the HMAC, filled with p and hooked up either as the first
 * array (*first) or behind the last array of the chain. n_entry_arrays
 * in the header is bumped if present, and *idx is incremented.
 * Both *first and *idx are kept in little-endian on-disk format. */
1071 static int link_entry_into_array(JournalFile *f,
1076 uint64_t n = 0, ap = 0, q, i, a, hidx;
1084 a = le64toh(*first);
1085 i = hidx = le64toh(*idx);
1088 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1092 n = journal_file_entry_array_n_items(o);
1094 o->entry_array.items[i] = htole64(p);
1095 *idx = htole64(hidx + 1);
1101 a = le64toh(o->entry_array.next_entry_array_offset);
/* No existing array covers the index: append a new one */
1112 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1113 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1119 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1124 o->entry_array.items[i] = htole64(p);
/* Hook the new array up as the first one, or behind the previous one */
1127 *first = htole64(q);
1129 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1133 o->entry_array.next_entry_array_offset = htole64(q);
1136 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1137 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1139 *idx = htole64(hidx + 1);
/* Variant of link_entry_into_array() for lists that keep one entry
 * offset inline in *extra in addition to the array chain. The offset
 * is either stored in *extra (presumably when *idx is still 0 -
 * confirm) or linked into the array chain at index *idx - 1. *idx is
 * then incremented. */
1144 static int link_entry_into_array_plus_one(JournalFile *f,
1159 *extra = htole64(p);
/* The array chain is indexed one lower than the overall list */
1163 i = htole64(le64toh(*idx) - 1);
1164 r = link_entry_into_array(f, first, &i, p);
1169 *idx = htole64(le64toh(*idx) + 1);
/* Links the entry at `offset` into the entry list of the data object
 * referenced by item i of entry object o. The data object is mapped
 * and its entry_offset / entry_array_offset fields are handed to
 * link_entry_into_array_plus_one(). */
1173 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1180 p = le64toh(o->entry.items[i].object_offset);
/* From here on o refers to the data object, not the entry */
1184 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1188 return link_entry_into_array_plus_one(f,
1189 &o->data.entry_offset,
1190 &o->data.entry_array_offset,
/* Makes a freshly written entry object reachable: links it into the
 * global entry array of the file, updates the head/tail realtime and
 * tail monotonic timestamps in the header, and links the entry into
 * the entry list of every data object it refers to. */
1195 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1203 if (o->object.type != OBJECT_ENTRY)
/* Full memory barrier before the entry is linked in */
1206 __sync_synchronize();
1208 /* Link up the entry itself */
1209 r = link_entry_into_array(f,
1210 &f->header->entry_array_offset,
1211 &f->header->n_entries,
1216 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1218 if (f->header->head_entry_realtime == 0)
1219 f->header->head_entry_realtime = o->entry.realtime;
1221 f->header->tail_entry_realtime = o->entry.realtime;
1222 f->header->tail_entry_monotonic = o->entry.monotonic;
1224 f->tail_entry_monotonic_valid = true;
1226 /* Link up the items */
1227 n = journal_file_entry_n_items(o);
1228 for (i = 0; i < n; i++) {
1229 r = journal_file_link_entry_item(f, o, offset, i);
/* Writes an entry object that refers to the given, already written
 * data items. The object is appended and filled with sequence number,
 * items, timestamps, XOR hash and the boot ID from the header; it is
 * then fed into the HMAC and linked into the file. */
1237 static int journal_file_append_entry_internal(
1239 const dual_timestamp *ts,
1241 const EntryItem items[], unsigned n_items,
1243 Object **ret, uint64_t *offset) {
1250 assert(items || n_items == 0);
1253 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1255 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1259 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1260 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1261 o->entry.realtime = htole64(ts->realtime);
1262 o->entry.monotonic = htole64(ts->monotonic);
1263 o->entry.xor_hash = htole64(xor_hash);
1264 o->entry.boot_id = f->header->boot_id;
1267 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1272 r = journal_file_link_entry(f, o, np);
/* Notifies inotify watchers that the file changed, by truncating it to
 * the size recorded in last_stat (i.e. without changing its size). A
 * failure is only logged. */
1285 void journal_file_post_change(JournalFile *f) {
1288 /* inotify() does not receive IN_MODIFY events from file
1289 * accesses done via mmap(). After each access we hence
1290 * trigger IN_MODIFY by truncating the journal file to its
1291 * current size which triggers IN_MODIFY. */
1293 __sync_synchronize();
1295 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1296 log_error("Failed to truncate file to its own size: %m");
/* Comparison callback for sorting EntryItem arrays by the object
 * offset they refer to (converted from little endian first). */
1299 static int entry_item_cmp(const void *_a, const void *_b) {
1300 const EntryItem *a = _a, *b = _b;
1302 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1304 if (le64toh(a->object_offset) > le64toh(b->object_offset))
/* Appends a complete entry built from an iovec array. For every iovec
 * the data object is looked up or created; its offset and hash go into
 * an on-stack item array and its hash is folded into the XOR hash of
 * the entry. The items are sorted by offset, the entry object is
 * written, and watchers are notified via journal_file_post_change().
 * A local timestamp is taken with dual_timestamp_get() (presumably
 * when no ts was passed - confirm). The monotonic timestamp is compared
 * against the tail of the file when that is known to be valid, and
 * journal_file_maybe_append_tag() is called with the realtime
 * timestamp before any data is written. */
1309 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1313 uint64_t xor_hash = 0;
1314 struct dual_timestamp _ts;
1317 assert(iovec || n_iovec == 0);
1320 dual_timestamp_get(&_ts);
1324 if (f->tail_entry_monotonic_valid &&
1325 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1329 r = journal_file_maybe_append_tag(f, ts->realtime);
1334 /* alloca() can't take 0, hence let's allocate at least one */
1335 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1337 for (i = 0; i < n_iovec; i++) {
1341 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1345 xor_hash ^= le64toh(o->data.hash);
1346 items[i].object_offset = htole64(p);
1347 items[i].hash = o->data.hash;
1350 /* Order by the position on disk, in order to improve seek
1351 * times for rotating media. */
1352 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1354 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1356 journal_file_post_change(f);
/* One cached position inside a chain of entry arrays. Items are stored
 * in f->chain_cache, keyed by the `first` member. */
1361 typedef struct ChainCacheItem {
1362 uint64_t first; /* the array at the beginning of the chain */
1363 uint64_t array; /* the cached array */
1364 uint64_t begin; /* the first item in the cached array */
1365 uint64_t total; /* the total number of items in all arrays before this one in the chain */
1366 uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
/* Stores or refreshes the cache item for the chain starting at `first`.
 * When no item was passed in, one is obtained by stealing the first
 * item of the hashmap once CHAIN_CACHE_MAX items are cached, or by
 * allocating a new one, and is inserted keyed by its `first` member.
 * An existing item must already belong to the same chain. */
1369 static void chain_cache_put(
1376 uint64_t last_index) {
1379 /* If the chain item to cache for this chain is the
1380 * first one it's not worth caching anything */
1384 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1385 ci = hashmap_steal_first(h);
1387 ci = new(ChainCacheItem, 1);
1394 if (hashmap_put(h, &ci->first, ci) < 0) {
1399 assert(ci->first == first);
1404 ci->last_index = last_index;
/* Returns the i-th entry of the entry array chain starting at `first`.
 * A cached chain position is consulted when the index lies beyond the
 * items preceding the cached array. The chain is then walked array by
 * array until the one holding the index is found; that position is
 * cached and the entry object it points to is mapped. */
1407 static int generic_array_get(
1411 Object **ret, uint64_t *offset) {
1414 uint64_t p = 0, a, t = 0;
1422 /* Try the chain cache first */
1423 ci = hashmap_get(f->chain_cache, &first);
1424 if (ci && i > ci->total) {
1433 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1437 k = journal_file_entry_array_n_items(o);
1439 p = le64toh(o->entry_array.items[i]);
1445 a = le64toh(o->entry_array.next_entry_array_offset);
1451 /* Let's cache this item for the next invocation */
1452 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1454 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Variant of generic_array_get() for lists that keep one entry offset
 * inline (`extra`) in front of the array chain: the inline entry is
 * mapped directly, any other index is looked up in the chain at i-1.
 * NOTE(review): the inline entry is presumably used for index 0 -
 * confirm. */
1467 static int generic_array_get_plus_one(
1472 Object **ret, uint64_t *offset) {
1481 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1494 return generic_array_get(f, first, i-1, ret, offset);
/* Bisects the chain of entry arrays starting at `first` for `needle`,
 * using test_object() to classify individual entries. A TEST_FOUND
 * result is turned into TEST_RIGHT or TEST_LEFT depending on the
 * search direction, so that the search keeps narrowing. A cached chain
 * position and the last index looked at are used to shortcut the
 * search where possible. Within each array the range between `left`
 * and `right` is halved until both meet; for DIRECTION_UP the result
 * may be the item before the one found (subtract_one). The position
 * reached is stored in the chain cache, the entry object is mapped and
 * the resulting index is reported as t + i, minus one when
 * subtract_one is set. */
1503 static int generic_array_bisect(
1508 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1509 direction_t direction,
1514 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1515 bool subtract_one = false;
1516 Object *o, *array = NULL;
1521 assert(test_object);
1523 /* Start with the first array in the chain */
1526 ci = hashmap_get(f->chain_cache, &first);
1527 if (ci && n > ci->total) {
1528 /* Ah, we have iterated this bisection array chain
1529 * previously! Let's see if we can skip ahead in the
1530 * chain, as far as the last time. But we can't jump
1531 * backwards in the chain, so let's check that
1534 r = test_object(f, ci->begin, needle);
1538 if (r == TEST_LEFT) {
1539 /* OK, what we are looking for is right of the
1540 * begin of this EntryArray, so let's jump
1541 * straight to previously cached array in the
1547 last_index = ci->last_index;
1552 uint64_t left, right, k, lp;
1554 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1558 k = journal_file_entry_array_n_items(array);
1564 lp = p = le64toh(array->entry_array.items[i]);
1568 r = test_object(f, p, needle);
1572 if (r == TEST_FOUND)
1573 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1575 if (r == TEST_RIGHT) {
1579 if (last_index != (uint64_t) -1) {
1580 assert(last_index <= right);
1582 /* If we cached the last index we
1583 * looked at, let's try to not to jump
1584 * too wildly around and see if we can
1585 * limit the range to look at early to
1586 * the immediate neighbors of the last
1587 * index we looked at. */
1589 if (last_index > 0) {
1590 uint64_t x = last_index - 1;
1592 p = le64toh(array->entry_array.items[x]);
1596 r = test_object(f, p, needle);
1600 if (r == TEST_FOUND)
1601 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1603 if (r == TEST_RIGHT)
1609 if (last_index < right) {
1610 uint64_t y = last_index + 1;
1612 p = le64toh(array->entry_array.items[y]);
1616 r = test_object(f, p, needle);
1620 if (r == TEST_FOUND)
1621 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1623 if (r == TEST_RIGHT)
1631 if (left == right) {
1632 if (direction == DIRECTION_UP)
1633 subtract_one = true;
1639 assert(left < right);
1640 i = (left + right) / 2;
1642 p = le64toh(array->entry_array.items[i]);
1646 r = test_object(f, p, needle);
1650 if (r == TEST_FOUND)
1651 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1653 if (r == TEST_RIGHT)
1661 if (direction == DIRECTION_UP) {
1663 subtract_one = true;
1674 last_index = (uint64_t) -1;
1675 a = le64toh(array->entry_array.next_entry_array_offset);
1681 if (subtract_one && t == 0 && i == 0)
1684 /* Let's cache this item for the next invocation */
1685 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1687 if (subtract_one && i == 0)
1689 else if (subtract_one)
1690 p = le64toh(array->entry_array.items[i-1]);
1692 p = le64toh(array->entry_array.items[i]);
1694 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1705 *idx = t + i + (subtract_one ? -1 : 0);
/* Variant of generic_array_bisect() for lists that keep one entry
 * offset inline (`extra`) in front of the array chain. The inline
 * entry is tested first, with TEST_FOUND turned into TEST_RIGHT or
 * TEST_LEFT depending on the direction. The remaining n-1 entries are
 * then bisected in the chain. For DIRECTION_UP the inline entry is
 * remembered (step_back) and returned when the chain search finds
 * nothing. */
1711 static int generic_array_bisect_plus_one(
1717 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1718 direction_t direction,
1724 bool step_back = false;
1728 assert(test_object);
1733 /* This bisects the array in object 'first', but first checks
1735 r = test_object(f, extra, needle);
1739 if (r == TEST_FOUND)
1740 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1742 /* if we are looking with DIRECTION_UP then we need to first
1743 see if in the actual array there is a matching entry, and
1744 return the last one of that. But if there isn't any we need
1745 to return this one. Hence remember this, and return it
1748 step_back = direction == DIRECTION_UP;
1750 if (r == TEST_RIGHT) {
1751 if (direction == DIRECTION_DOWN)
1757 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1759 if (r == 0 && step_back)
1768 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
/* Bisection callback that compares the entry offset p itself against
 * the needle, without touching the file.
 * NOTE(review): confirm which TEST_* value each comparison maps to. */
1784 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1790 else if (p < needle)
/* Seeks to an entry by file offset, by bisecting the global entry
 * array of the file (entry_array_offset / n_entries from the header). */
1796 int journal_file_move_to_entry_by_offset(
1799 direction_t direction,
1803 return generic_array_bisect(f,
1804 le64toh(f->header->entry_array_offset),
1805 le64toh(f->header->n_entries),
/* Bisection callback that maps the entry at offset p and compares its
 * sequence number against the needle.
 * NOTE(review): confirm which TEST_* value each comparison maps to. */
1813 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1820 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1824 if (le64toh(o->entry.seqnum) == needle)
1826 else if (le64toh(o->entry.seqnum) < needle)
/* Seeks to an entry by sequence number, by bisecting the global entry
 * array of the file (entry_array_offset / n_entries from the header). */
1832 int journal_file_move_to_entry_by_seqnum(
1835 direction_t direction,
1839 return generic_array_bisect(f,
1840 le64toh(f->header->entry_array_offset),
1841 le64toh(f->header->n_entries),
/* Bisection callback that maps the entry at offset p and compares its
 * realtime timestamp against the needle.
 * NOTE(review): confirm which TEST_* value each comparison maps to. */
1848 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1855 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1859 if (le64toh(o->entry.realtime) == needle)
1861 else if (le64toh(o->entry.realtime) < needle)
/* Seeks to an entry by realtime timestamp, by bisecting the global
 * entry array of the file with test_object_realtime(). */
1867 int journal_file_move_to_entry_by_realtime(
1870 direction_t direction,
1874 return generic_array_bisect(f,
1875 le64toh(f->header->entry_array_offset),
1876 le64toh(f->header->n_entries),
1878 test_object_realtime,
/* Bisection callback that maps the entry at offset p and compares its
 * monotonic timestamp against the needle.
 * NOTE(review): confirm which TEST_* value each comparison maps to. */
1883 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1890 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1894 if (le64toh(o->entry.monotonic) == needle)
1896 else if (le64toh(o->entry.monotonic) < needle)
/* Finds the data object for "_BOOT_ID=<id>", where <id> is the string
 * form of the given boot ID written right after the 9-character
 * prefix. The buffer has room for the prefix, 32 ID characters and a
 * trailing NUL; the NUL is excluded from the lookup length. */
1902 static inline int find_data_object_by_boot_id(
1907 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1909 sd_id128_to_string(boot_id, t + 9);
1910 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
/* Seeks to an entry by monotonic timestamp within one boot. The data
 * object for the boot ID is looked up first, and its entry list
 * (inline entry plus array chain) is then bisected with
 * test_object_monotonic(). */
1913 int journal_file_move_to_entry_by_monotonic(
1917 direction_t direction,
1926 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1932 return generic_array_bisect_plus_one(f,
1933 le64toh(o->data.entry_offset),
1934 le64toh(o->data.entry_array_offset),
1935 le64toh(o->data.n_entries),
1937 test_object_monotonic,
/* Moves from the current position to a neighbouring entry in the
 * file's global entry array, in the given direction. o and p describe
 * the current entry (object pointer and offset); ret and offset are
 * the outputs. The fetched offset is compared against p according to
 * the direction, and a debug message about a corrupted entry array is
 * logged when that check trips. */
1942 int journal_file_next_entry(
1944 Object *o, uint64_t p,
1945 direction_t direction,
1946 Object **ret, uint64_t *offset) {
/* An object pointer may only be passed together with a non-zero offset */
1952 assert(p > 0 || !o);
1954 n = le64toh(f->header->n_entries);
/* Start index: first element when going down, last element when going
 * up. The condition guarding this assignment is not visible --
 * presumably it applies when no current entry was given; confirm. */
1959 i = direction == DIRECTION_DOWN ? 0 : n - 1;
/* The current object has to be an entry object */
1961 if (o->object.type != OBJECT_ENTRY)
/* Bisect the global entry array; presumably this yields the index of
 * the current entry -- the remaining arguments are not visible */
1964 r = generic_array_bisect(f,
1965 le64toh(f->header->entry_array_offset),
1966 le64toh(f->header->n_entries),
1975 if (direction == DIRECTION_DOWN) {
1988 /* And jump to it */
1989 r = generic_array_get(f,
1990 le64toh(f->header->entry_array_offset),
1997 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
1998 log_debug("%s: entry array corrupted at entry %"PRIu64,
/* Moves from the current entry (o at offset p) by a signed number of
 * positions ("skip") within the file's global entry array and fetches
 * the entry at the resulting index via generic_array_get(). */
2009 int journal_file_skip_entry(
2011 Object *o, uint64_t p,
2013 Object **ret, uint64_t *offset) {
/* The current object has to be an entry object */
2022 if (o->object.type != OBJECT_ENTRY)
/* Bisect the global entry array; presumably this yields the index i of
 * the current entry -- the remaining arguments are not visible */
2025 r = generic_array_bisect(f,
2026 le64toh(f->header->entry_array_offset),
2027 le64toh(f->header->n_entries),
2036 /* Calculate new index */
/* Backward skip: the comparison against i guards the unsigned
 * subtraction below against wrapping (what happens when the skip
 * reaches past the start is not visible) */
2038 if ((uint64_t) -skip >= i)
2041 i = i - (uint64_t) -skip;
/* Forward skip */
2043 i += (uint64_t) skip;
2045 n = le64toh(f->header->n_entries);
2052 return generic_array_get(f,
2053 le64toh(f->header->entry_array_offset),
/* Like stepping through the global entry array, but restricted to the
 * entries linked from one data object: data_offset is the file offset
 * of that data object, o and p describe the current entry, and the
 * neighbouring entry in the given direction is fetched with
 * generic_array_get_plus_one(). */
2058 int journal_file_next_entry_for_data(
2060 Object *o, uint64_t p,
2061 uint64_t data_offset,
2062 direction_t direction,
2063 Object **ret, uint64_t *offset) {
/* An object pointer may only be passed together with a non-zero offset */
2070 assert(p > 0 || !o);
/* Map the data object whose entry list is being walked */
2072 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2076 n = le64toh(d->data.n_entries);
/* Start index: first element when going down, last element when going
 * up. The condition guarding this assignment is not visible --
 * presumably it applies when no current entry was given; confirm. */
2081 i = direction == DIRECTION_DOWN ? 0 : n - 1;
/* The current object has to be an entry object */
2083 if (o->object.type != OBJECT_ENTRY)
/* Bisect this data object's entries; presumably this yields the index
 * of the current entry -- the remaining arguments are not visible */
2086 r = generic_array_bisect_plus_one(f,
2087 le64toh(d->data.entry_offset),
2088 le64toh(d->data.entry_array_offset),
2089 le64toh(d->data.n_entries),
2099 if (direction == DIRECTION_DOWN) {
2113 return generic_array_get_plus_one(f,
2114 le64toh(d->data.entry_offset),
2115 le64toh(d->data.entry_array_offset),
/* Bisects the entries linked from the data object at data_offset and
 * returns the result of generic_array_bisect_plus_one(). The needle
 * and comparison callback are not visible here. */
2120 int journal_file_move_to_entry_by_offset_for_data(
2122 uint64_t data_offset,
2124 direction_t direction,
2125 Object **ret, uint64_t *offset) {
/* Map the data object whose entry list is searched */
2132 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2136 return generic_array_bisect_plus_one(f,
2137 le64toh(d->data.entry_offset),
2138 le64toh(d->data.entry_array_offset),
2139 le64toh(d->data.n_entries),
/* Looks up an entry by monotonic timestamp that is linked both from
 * the boot ID's data object and from the data object at data_offset.
 * Two entry lists are involved: the one of the "_BOOT_ID=..." data
 * object (o, at file offset b) and the one of the requested data
 * object (d). After an initial bisection by time on the first list,
 * the two lists are bisected in turn until, as the existing comment
 * states, an entry present in both has been found. */
2146 int journal_file_move_to_entry_by_monotonic_for_data(
2148 uint64_t data_offset,
2151 direction_t direction,
2152 Object **ret, uint64_t *offset) {
2160 /* First, seek by time */
/* b receives the file offset of the boot ID data object so that the
 * object can be mapped again further down */
2161 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2167 r = generic_array_bisect_plus_one(f,
2168 le64toh(o->data.entry_offset),
2169 le64toh(o->data.entry_array_offset),
2170 le64toh(o->data.n_entries),
2172 test_object_monotonic,
2178 /* And now, continue seeking until we find an entry that
2179 * exists in both bisection arrays */
/* Both data objects are mapped afresh before each bisection --
 * presumably because earlier object pointers may no longer be valid
 * after other objects were mapped; confirm. */
2185 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2189 r = generic_array_bisect_plus_one(f,
2190 le64toh(d->data.entry_offset),
2191 le64toh(d->data.entry_array_offset),
2192 le64toh(d->data.n_entries),
2200 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2204 r = generic_array_bisect_plus_one(f,
2205 le64toh(o->data.entry_offset),
2206 le64toh(o->data.entry_array_offset),
2207 le64toh(o->data.n_entries),
/* Bisects the entries linked from the data object at data_offset and
 * returns the result of generic_array_bisect_plus_one(). The
 * comparison callback is not visible here; going by the function name
 * it is presumably the seqnum comparison -- TODO confirm. */
2229 int journal_file_move_to_entry_by_seqnum_for_data(
2231 uint64_t data_offset,
2233 direction_t direction,
2234 Object **ret, uint64_t *offset) {
/* Map the data object whose entry list is searched */
2241 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2245 return generic_array_bisect_plus_one(f,
2246 le64toh(d->data.entry_offset),
2247 le64toh(d->data.entry_array_offset),
2248 le64toh(d->data.n_entries),
/* Looks up an entry by realtime timestamp among the entries linked
 * from the data object at data_offset: bisects that object's entry
 * list with test_object_realtime() as the comparison callback and
 * returns the result of generic_array_bisect_plus_one(). */
2255 int journal_file_move_to_entry_by_realtime_for_data(
2257 uint64_t data_offset,
2259 direction_t direction,
2260 Object **ret, uint64_t *offset) {
/* Map the data object whose entry list is searched */
2267 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2271 return generic_array_bisect_plus_one(f,
2272 le64toh(d->data.entry_offset),
2273 le64toh(d->data.entry_array_offset),
2274 le64toh(d->data.n_entries),
2276 test_object_realtime,
/* Debugging aid: prints the file header and then one line per object
 * to stdout, walking the objects sequentially from the end of the
 * header up to the header's tail_object_offset. Entry and tag objects
 * get a few of their fields printed as well; any compression flag is
 * shown on an extra line. "File corrupt" is logged as an error,
 * presumably when an object cannot be mapped -- confirm. */
2281 void journal_file_dump(JournalFile *f) {
2288 journal_file_print_header(f);
/* The first object starts right after the header */
2290 p = le64toh(f->header->header_size);
/* -1 is passed as the object type; presumably this accepts any type --
 * TODO confirm */
2292 r = journal_file_move_to_object(f, -1, p, &o);
2296 switch (o->object.type) {
2299 printf("Type: OBJECT_UNUSED\n");
2303 printf("Type: OBJECT_DATA\n");
2307 printf("Type: OBJECT_FIELD\n");
2311 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2312 le64toh(o->entry.seqnum),
2313 le64toh(o->entry.monotonic),
2314 le64toh(o->entry.realtime));
2317 case OBJECT_FIELD_HASH_TABLE:
2318 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2321 case OBJECT_DATA_HASH_TABLE:
2322 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2325 case OBJECT_ENTRY_ARRAY:
2326 printf("Type: OBJECT_ENTRY_ARRAY\n");
2330 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2331 le64toh(o->tag.seqnum),
2332 le64toh(o->tag.epoch));
2336 printf("Type: unknown (%u)\n", o->object.type);
/* Report the compression algorithm if any compression flag is set */
2340 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2341 printf("Flags: %s\n",
2342 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
/* The object at tail_object_offset is the last one */
2344 if (p == le64toh(f->header->tail_object_offset))
/* Advance by the object's size rounded up with ALIGN64() */
2347 p = p + ALIGN64(le64toh(o->object.size));
2352 log_error("File corrupt");
/* Wrapper around format_timestamp() that formats t into buf (of size
 * l). How the result x is post-processed is not visible here;
 * presumably a placeholder string is returned when formatting fails --
 * TODO confirm. */
2355 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2358 x = format_timestamp(buf, l, t);
/* Prints a human-readable summary of the journal file header to
 * stdout: IDs, state, feature flags, sizes, sequence numbers,
 * timestamps and object counts. Multi-byte header fields are converted
 * from little-endian before printing. Counters that not every header
 * carries are printed only if JOURNAL_HEADER_CONTAINS() reports them
 * present. */
2364 void journal_file_print_header(JournalFile *f) {
/* Scratch buffers for the four ID strings, the timestamps and the
 * byte-size string */
2365 char a[33], b[33], c[33], d[33];
2366 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2368 char bytes[FORMAT_BYTES_MAX];
2372 printf("File Path: %s\n"
2376 "Sequential Number ID: %s\n"
2378 "Compatible Flags:%s%s\n"
2379 "Incompatible Flags:%s%s%s\n"
2380 "Header size: %"PRIu64"\n"
2381 "Arena size: %"PRIu64"\n"
2382 "Data Hash Table Size: %"PRIu64"\n"
2383 "Field Hash Table Size: %"PRIu64"\n"
2384 "Rotate Suggested: %s\n"
2385 "Head Sequential Number: %"PRIu64"\n"
2386 "Tail Sequential Number: %"PRIu64"\n"
2387 "Head Realtime Timestamp: %s\n"
2388 "Tail Realtime Timestamp: %s\n"
2389 "Tail Monotonic Timestamp: %s\n"
2390 "Objects: %"PRIu64"\n"
2391 "Entry Objects: %"PRIu64"\n",
2393 sd_id128_to_string(f->header->file_id, a),
2394 sd_id128_to_string(f->header->machine_id, b),
2395 sd_id128_to_string(f->header->boot_id, c),
2396 sd_id128_to_string(f->header->seqnum_id, d),
2397 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2398 f->header->state == STATE_ONLINE ? "ONLINE" :
2399 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2400 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2401 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2402 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2403 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2404 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2405 le64toh(f->header->header_size),
2406 le64toh(f->header->arena_size),
2407 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2408 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2409 yes_no(journal_file_rotate_suggested(f, 0)),
2410 le64toh(f->header->head_entry_seqnum),
2411 le64toh(f->header->tail_entry_seqnum),
2412 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2413 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2414 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2415 le64toh(f->header->n_objects),
2416 le64toh(f->header->n_entries));
/* Fill level = number of objects relative to the number of hash table
 * slots (table size in bytes divided by sizeof(HashItem)) */
2418 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2419 printf("Data Objects: %"PRIu64"\n"
2420 "Data Hash Table Fill: %.1f%%\n",
2421 le64toh(f->header->n_data),
2422 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2424 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2425 printf("Field Objects: %"PRIu64"\n"
2426 "Field Hash Table Fill: %.1f%%\n",
2427 le64toh(f->header->n_fields),
2428 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2430 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2431 printf("Tag Objects: %"PRIu64"\n",
2432 le64toh(f->header->n_tags));
2433 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2434 printf("Entry Array Objects: %"PRIu64"\n",
2435 le64toh(f->header->n_entry_arrays));
/* Disk usage is derived from the allocated block count (st_blocks
 * scaled by 512) and is only printed if fstat() succeeds */
2437 if (fstat(f->fd, &st) >= 0)
2438 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
/* Opens a journal file, creating and initializing it if it is empty
 * and opened writable. Visible steps: validate the access mode and the
 * file name suffix, allocate the JournalFile, set up the mmap cache
 * and chain cache, open and stat the file, initialize a fresh file
 * (creation-time xattr, FSS state, header), map the header, verify it
 * for pre-existing files, apply the metrics, refresh the header, set
 * up HMAC, create the hash tables and first tag for new files, and
 * finally map both hash tables. journal_file_close() is called on the
 * object at the end, presumably on the failure path -- confirm. */
2441 int journal_file_open(
2447 JournalMetrics *metrics,
2448 MMapCache *mmap_cache,
2449 JournalFile *template,
2450 JournalFile **ret) {
2454 bool newly_created = false;
/* Only O_RDONLY and O_RDWR pass this check */
2459 if ((flags & O_ACCMODE) != O_RDONLY &&
2460 (flags & O_ACCMODE) != O_RDWR)
/* The file name must end in ".journal" or ".journal~" */
2463 if (!endswith(fname, ".journal") &&
2464 !endswith(fname, ".journal~"))
2467 f = new0(JournalFile, 1);
2475 f->prot = prot_from_flags(flags);
2476 f->writable = (flags & O_ACCMODE) != O_RDONLY;
/* If both compressors are compiled in, the LZ4 branch wins */
2477 #if defined(HAVE_LZ4)
2478 f->compress_lz4 = compress;
2479 #elif defined(HAVE_XZ)
2480 f->compress_xz = compress;
/* Either take a reference on the supplied mmap cache or create a new one */
2487 f->mmap = mmap_cache_ref(mmap_cache);
2489 f->mmap = mmap_cache_new();
2496 f->path = strdup(fname);
2502 f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2503 if (!f->chain_cache) {
2508 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2514 if (fstat(f->fd, &f->last_stat) < 0) {
/* A zero-sized file opened writable is treated as new and gets initialized */
2519 if (f->last_stat.st_size == 0 && f->writable) {
2522 /* Let's attach the creation time to the journal file,
2523 * so that the vacuuming code knows the age of this
2524 * file even if the file might end up corrupted one
2525 * day... Ideally we'd just use the creation time many
2526 * file systems maintain for each file, but there is
2527 * currently no usable API to query this, hence let's
2528 * emulate this via extended attributes. If extended
2529 * attributes are not supported we'll just skip this,
2530 * and rely solely on mtime/atime/ctime of the file.*/
/* Stored little-endian; the fsetxattr() result is deliberately ignored */
2532 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2533 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2536 /* Try to load the FSPRG state, and if we can't, then
2537 * just don't do sealing */
2539 r = journal_file_fss_load(f);
2545 r = journal_file_init_header(f, template);
/* Stat again after the header was initialized */
2549 if (fstat(f->fd, &f->last_stat) < 0) {
2554 newly_created = true;
/* The file has to be at least as large as the minimal header */
2557 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
/* Map the header (page-aligned size of the current Header struct) */
2562 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2563 if (f->header == MAP_FAILED) {
/* Pre-existing files get their header verified */
2569 if (!newly_created) {
2570 r = journal_file_verify_header(f);
/* Pre-existing writable files load the FSS state here; new files
 * already did so above */
2576 if (!newly_created && f->writable) {
2577 r = journal_file_fss_load(f);
/* Metrics: fill in defaults on the caller's struct and copy it, or
 * fall back to the template's metrics */
2585 journal_default_metrics(metrics, f->fd);
2586 f->metrics = *metrics;
2587 } else if (template)
2588 f->metrics = template->metrics;
2590 r = journal_file_refresh_header(f);
2596 r = journal_file_hmac_setup(f);
/* New files get both hash tables and the first tag object created */
2601 if (newly_created) {
2602 r = journal_file_setup_field_hash_table(f);
2606 r = journal_file_setup_data_hash_table(f);
2611 r = journal_file_append_first_tag(f);
2617 r = journal_file_map_field_hash_table(f);
2621 r = journal_file_map_data_hash_table(f);
2629 journal_file_close(f);
/* Rotates a journal file: renames the current file to an archive name,
 * marks its header as archived, opens a fresh file at the original
 * path and closes the old file. Only writable files whose path ends
 * in ".journal" pass the visible checks. The archive name has the
 * form "<path without .journal>@<seqnum id>-<head seqnum>-<head
 * realtime>.journal", the last two as 16-digit hex numbers. */
2634 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2635 _cleanup_free_ char *p = NULL;
2637 JournalFile *old_file, *new_file = NULL;
2645 if (!old_file->writable)
2648 if (!endswith(old_file->path, ".journal"))
2651 l = strlen(old_file->path);
/* "l - 8" cuts the 8-character ".journal" suffix off the path */
2652 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2653 (int) l - 8, old_file->path,
2654 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2655 le64toh((*f)->header->head_entry_seqnum),
2656 le64toh((*f)->header->head_entry_realtime));
2660 r = rename(old_file->path, p);
2664 old_file->header->state = STATE_ARCHIVED;
/* Open the replacement at the original path, reusing the old file's
 * flags, mode and mmap cache, and passing the old file as template;
 * no metrics are passed (NULL) */
2666 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2667 journal_file_close(old_file);
/* Variant of journal_file_open() that copes with unusable files. The
 * open is attempted normally first. If it fails with one of the error
 * codes listed below, and the visible checks on the access mode,
 * O_CREAT and the ".journal" suffix allow it, the file is renamed to a
 * ".journal~" name, a warning is logged and the open is retried
 * once. */
2673 int journal_file_open_reliably(
2679 JournalMetrics *metrics,
2680 MMapCache *mmap_cache,
2681 JournalFile *template,
2682 JournalFile **ret) {
2686 _cleanup_free_ char *p = NULL;
2688 r = journal_file_open(fname, flags, mode, compress, seal,
2689 metrics, mmap_cache, template, ret);
/* Any result other than the error codes below does not lead to the
 * rename-and-retry path */
2690 if (r != -EBADMSG && /* corrupted */
2691 r != -ENODATA && /* truncated */
2692 r != -EHOSTDOWN && /* other machine */
2693 r != -EPROTONOSUPPORT && /* incompatible feature */
2694 r != -EBUSY && /* unclean shutdown */
2695 r != -ESHUTDOWN /* already archived */)
/* Checks on read-only access, on O_CREAT and on the file name suffix;
 * presumably each bails out with the original error -- confirm */
2698 if ((flags & O_ACCMODE) == O_RDONLY)
2701 if (!(flags & O_CREAT))
2704 if (!endswith(fname, ".journal"))
2707 /* The file is corrupted. Rotate it away and try it again (but only once) */
/* The new name embeds the current realtime clock as 16 hex digits, a
 * second 64-bit value (not visible here) and the ".journal~" suffix */
2710 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2712 (unsigned long long) now(CLOCK_REALTIME),
2716 r = rename(fname, p);
2720 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2722 return journal_file_open(fname, flags, mode, compress, seal,
2723 metrics, mmap_cache, template, ret);
/* Copies one entry from journal file "from" into journal file "to".
 * o is the entry object located at offset p in "from". Each data
 * object referenced by the entry is looked up, decompressed if needed,
 * appended to "to", and recorded in a new item list; the entry itself
 * is then appended with the original timestamps and the XOR of the
 * appended data objects' hashes. seqnum, ret and offset are handed
 * through to journal_file_append_entry_internal(). */
2726 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2728 uint64_t q, xor_hash = 0;
/* Carry the original timestamps over */
2741 ts.monotonic = le64toh(o->entry.monotonic);
2742 ts.realtime = le64toh(o->entry.realtime);
2744 n = journal_file_entry_n_items(o);
2745 /* alloca() can't take 0, hence let's allocate at least one */
2746 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2748 for (i = 0; i < n; i++) {
/* Offset and hash of the i-th referenced data object; the hash is
 * kept in its stored form for the comparison below */
2755 q = le64toh(o->entry.items[i].object_offset);
2756 le_hash = o->entry.items[i].hash;
/* From here on o refers to the data object, not to the entry */
2758 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
/* The hash stored in the entry item must match the data object's hash */
2762 if (le_hash != o->data.hash)
/* Payload length = object size minus the offset of the payload member */
2765 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2768 /* We hit the limit on 32bit machines */
2769 if ((uint64_t) t != l)
2772 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2773 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
/* Decompress into the source file's reusable compression buffer */
2776 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2777 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2781 data = from->compress_buffer;
2784 return -EPROTONOSUPPORT;
2787 data = o->data.payload;
/* Append the payload to the destination; u and h receive the new data
 * object and its offset */
2789 r = journal_file_append_data(to, data, l, &u, &h);
2793 xor_hash ^= le64toh(u->data.hash);
2794 items[i].object_offset = htole64(h);
2795 items[i].hash = u->data.hash;
/* o was repointed to a data object above, so map the entry at p again */
2797 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2802 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
/* Fills in and normalizes the size limits in m. A field equal to
 * (uint64_t) -1 is treated as unset and is derived from the size of
 * the file system that fd lives on, or from fixed defaults. Values are
 * page-aligned and clamped so that min_size <= max_size and
 * 2 * max_size <= max_use hold. The result is logged at debug level. */
2805 void journal_default_metrics(JournalMetrics *m, int fd) {
/* Stays 0 if the file system size cannot be determined */
2806 uint64_t fs_size = 0;
2808 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2813 if (fstatvfs(fd, &ss) >= 0)
2814 fs_size = ss.f_frsize * ss.f_blocks;
/* max_use: derived from the file system size and clamped to
 * [DEFAULT_MAX_USE_LOWER, DEFAULT_MAX_USE_UPPER]; the lone assignment
 * of the lower bound further down is presumably the fallback for an
 * unknown file system size -- confirm */
2816 if (m->max_use == (uint64_t) -1) {
2819 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2821 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2822 m->max_use = DEFAULT_MAX_USE_UPPER;
2824 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2825 m->max_use = DEFAULT_MAX_USE_LOWER;
2827 m->max_use = DEFAULT_MAX_USE_LOWER;
2829 m->max_use = PAGE_ALIGN(m->max_use);
/* max_use is raised to at least two minimum-sized files */
2831 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2832 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
/* max_size: derived from max_use and capped at DEFAULT_MAX_SIZE_UPPER */
2835 if (m->max_size == (uint64_t) -1) {
2836 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2838 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2839 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2841 m->max_size = PAGE_ALIGN(m->max_size);
2843 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2844 m->max_size = JOURNAL_FILE_SIZE_MIN;
/* Grow max_use if needed so that it covers two files of max_size */
2846 if (m->max_size*2 > m->max_use)
2847 m->max_use = m->max_size*2;
/* min_size: defaults to JOURNAL_FILE_SIZE_MIN and is never below it */
2849 if (m->min_size == (uint64_t) -1)
2850 m->min_size = JOURNAL_FILE_SIZE_MIN;
2852 m->min_size = PAGE_ALIGN(m->min_size);
2854 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2855 m->min_size = JOURNAL_FILE_SIZE_MIN;
/* max_size is raised to min_size if it would otherwise be smaller */
2857 if (m->min_size > m->max_size)
2858 m->max_size = m->min_size;
/* keep_free: derived from the file system size and capped at
 * DEFAULT_KEEP_FREE_UPPER, or set to DEFAULT_KEEP_FREE */
2861 if (m->keep_free == (uint64_t) -1) {
2864 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2866 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2867 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2870 m->keep_free = DEFAULT_KEEP_FREE;
2873 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2874 format_bytes(a, sizeof(a), m->max_use),
2875 format_bytes(b, sizeof(b), m->max_size),
2876 format_bytes(c, sizeof(c), m->min_size),
2877 format_bytes(d, sizeof(d), m->keep_free));
/* Reports the realtime timestamps of the first and last entry of the
 * file, as recorded in the header's head_entry_realtime and
 * tail_entry_realtime fields (converted from little-endian). A header
 * field of 0 is checked for separately before each assignment; what is
 * returned in that case is not visible here. */
2880 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2885 if (f->header->head_entry_realtime == 0)
2888 *from = le64toh(f->header->head_entry_realtime);
2892 if (f->header->tail_entry_realtime == 0)
2895 *to = le64toh(f->header->tail_entry_realtime);
/* Reports the monotonic timestamps of the first and last entry that
 * belong to the given boot. The entries are taken from the list
 * attached to the boot ID's data object: the first one is found via
 * the object's entry_offset, the last one at index n_entries - 1. */
2901 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
/* p receives the data object's file offset so it can be mapped again below */
2909 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
/* n_entries is unsigned, so this tests for an empty entry list */
2913 if (le64toh(o->data.n_entries) <= 0)
/* First entry of this boot; o refers to an entry object from here on */
2917 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2921 *from = le64toh(o->entry.monotonic);
/* Map the data object again, since o was repointed to the entry above */
2925 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* Fetch the last entry in the data object's list */
2929 r = generic_array_get_plus_one(f,
2930 le64toh(o->data.entry_offset),
2931 le64toh(o->data.entry_array_offset),
2932 le64toh(o->data.n_entries)-1,
2937 *to = le64toh(o->entry.monotonic);
2943 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2946 /* If we gained new header fields we gained new features,
2947 * hence suggest a rotation */
2948 if (le64toh(f->header->header_size) < sizeof(Header)) {
2949 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2953 /* Let's check if the hash tables grew over a certain fill
2954 * level (75%, borrowing this value from Java's hash table
2955 * implementation), and if so suggest a rotation. To calculate
2956 * the fill level we need the n_data field, which only exists
2957 * in newer versions. */
2959 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2960 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2961 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2963 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2964 le64toh(f->header->n_data),
2965 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2966 (unsigned long long) f->last_stat.st_size,
2967 f->last_stat.st_size / le64toh(f->header->n_data));
2971 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2972 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2973 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2975 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2976 le64toh(f->header->n_fields),
2977 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2981 /* Are the data objects properly indexed by field objects? */
2982 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2983 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2984 le64toh(f->header->n_data) > 0 &&
2985 le64toh(f->header->n_fields) == 0)
2988 if (max_file_usec > 0) {
2991 h = le64toh(f->header->head_entry_realtime);
2992 t = now(CLOCK_REALTIME);
2994 if (h > 0 && t > h + max_file_usec)