1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
29 #include <sys/xattr.h>
31 #include "journal-def.h"
32 #include "journal-file.h"
33 #include "journal-authenticate.h"
38 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
39 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
41 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
43 /* This is the minimum journal file size */
44 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
46 /* These are the lower and upper bounds if we deduce the max_use value
47 * from the file system size */
48 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
49 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
51 /* This is the upper bound if we deduce max_size from max_use */
52 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
54 /* This is the upper bound if we deduce the keep_free value from the file system size */
56 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
58 /* This is the keep_free value when we can't determine the file system size */
60 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
62 /* n_data was the first entry we added after the initial file format design */
63 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
65 /* How many entries to keep in the entry array chain cache at max */
66 #define CHAIN_CACHE_MAX 20
68 /* How much to increase the journal file size at once each time we allocate something new. */
69 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
/* Switches the header state of a journal file to STATE_ONLINE. The visible
 * lines first test that the file has a valid fd and a mapped header, then
 * dispatch on the current header state before storing STATE_ONLINE. */
71 static int journal_file_set_online(JournalFile *f) {
77 if (!(f->fd >= 0 && f->header))
80 switch(f->header->state) {
85 f->header->state = STATE_ONLINE;
/* Switches the header state of a journal file to STATE_OFFLINE. The visible
 * lines test for a valid fd and mapped header, special-case a header that is
 * not currently STATE_ONLINE, and otherwise store STATE_OFFLINE. */
94 int journal_file_set_offline(JournalFile *f) {
100 if (!(f->fd >= 0 && f->header))
103 if (f->header->state != STATE_ONLINE)
108 f->header->state = STATE_OFFLINE;
/* Tears down a JournalFile. In order, the visible lines: append a final tag
 * when the file is sealed and writable, drop the fd from the mmap cache, mark
 * the file offline, unmap the header page(s), release the mmap cache
 * reference, free the chain cache, free the compression buffer (only built
 * with XZ or LZ4), release the FSS mapping or the fsprg state, and close the
 * HMAC handle. */
115 void journal_file_close(JournalFile *f) {
119 /* Write the final tag */
120 if (f->seal && f->writable)
121 journal_file_append_tag(f);
124 /* Sync everything to disk, before we mark the file offline */
125 if (f->mmap && f->fd >= 0)
126 mmap_cache_close_fd(f->mmap, f->fd);
128 journal_file_set_offline(f);
131 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
137 mmap_cache_unref(f->mmap);
139 hashmap_free_free(f->chain_cache);
141 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
142 free(f->compress_buffer);
147 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
148 else if (f->fsprg_state)
149 free(f->fsprg_state);
154 gcry_md_close(f->hmac);
/* Builds a fresh Header in a local variable `h` and writes it to offset 0 of
 * the file with pwrite(). The header gets the file signature, a 64-bit
 * aligned header size, incompatible flags derived from the compress_xz /
 * compress_lz4 settings, the sealed flag as compatible flag, and a random
 * file id. seqnum_id and tail_entry_seqnum are either copied from the
 * template's header or seqnum_id is set to the new file id (presumably
 * depending on whether a template was passed - the condition is not visible
 * here). */
160 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
167 memcpy(h.signature, HEADER_SIGNATURE, 8);
168 h.header_size = htole64(ALIGN64(sizeof(h)));
170 h.incompatible_flags |= htole32(
171 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
172 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
174 h.compatible_flags = htole32(
175 f->seal * HEADER_COMPATIBLE_SEALED);
177 r = sd_id128_randomize(&h.file_id);
182 h.seqnum_id = template->header->seqnum_id;
183 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
185 h.seqnum_id = h.file_id;
187 k = pwrite(f->fd, &h, sizeof(h), 0);
/* Updates the identity fields of the mapped header: stores the local machine
 * id, fetches the current boot id, marks the tail monotonic timestamp as
 * valid when the header already carries the same boot id, stores the boot id
 * and finally sets the file online. */
197 static int journal_file_refresh_header(JournalFile *f) {
203 r = sd_id128_get_machine(&f->header->machine_id);
207 r = sd_id128_get_boot(&boot_id);
211 if (sd_id128_equal(boot_id, f->header->boot_id))
212 f->tail_entry_monotonic_valid = true;
214 f->header->boot_id = boot_id;
216 journal_file_set_online(f);
218 /* Sync the online state to disk */
/* Sanity-checks the mapped header before the file is used. The visible
 * checks cover: the signature; incompatible flags outside the supported set
 * (returns -EPROTONOSUPPORT); for writable files also unsupported compatible
 * flags (returns -EPROTONOSUPPORT); the state being below _STATE_MAX; the
 * header being at least HEADER_SIZE_MIN; sealed files containing
 * n_entry_arrays; header+arena fitting inside the file size from last_stat;
 * the tail object offset lying inside header+arena; the four stored offsets
 * passing VALID64() and not pointing into the header; the machine id matching
 * the local one; and the state value itself. On success the compression and
 * sealing settings of `f` are taken over from the header flags. */
224 static int journal_file_verify_header(JournalFile *f) {
229 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
232 /* In both read and write mode we refuse to open files with
233 * incompatible flags we don't know */
234 flags = le32toh(f->header->incompatible_flags);
235 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
236 if (flags & ~HEADER_INCOMPATIBLE_ANY)
237 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
238 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
239 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
241 log_debug("Journal file %s uses incompatible flags %"PRIx32
242 " disabled at compilation time.", f->path, flags);
243 return -EPROTONOSUPPORT;
246 /* When open for writing we refuse to open files with
247 * compatible flags, too */
248 flags = le32toh(f->header->compatible_flags);
249 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
250 if (flags & ~HEADER_COMPATIBLE_ANY)
251 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
252 f->path, flags & ~HEADER_COMPATIBLE_ANY);
253 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
255 log_debug("Journal file %s uses compatible flags %"PRIx32
256 " disabled at compilation time.", f->path, flags);
257 return -EPROTONOSUPPORT;
260 if (f->header->state >= _STATE_MAX)
263 /* The first addition was n_data, so check that we are at least this large */
264 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
267 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
270 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
273 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
276 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
277 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
278 !VALID64(le64toh(f->header->tail_object_offset)) ||
279 !VALID64(le64toh(f->header->entry_array_offset)))
282 if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
283 le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
284 le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
285 le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
290 sd_id128_t machine_id;
293 r = sd_id128_get_machine(&machine_id);
297 if (!sd_id128_equal(machine_id, f->header->machine_id))
300 state = f->header->state;
302 if (state == STATE_ONLINE) {
303 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
305 } else if (state == STATE_ARCHIVED)
307 else if (state != STATE_OFFLINE) {
308 log_debug("Journal file %s has unknown state %u.", f->path, state);
313 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
314 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
316 f->seal = JOURNAL_HEADER_SEALED(f->header);
/* Makes sure the file is large enough to hold `size` bytes at `offset`.
 * The current size is taken as header_size + arena_size; the requested size
 * is page-aligned and never smaller than the header. Nothing is done when
 * the file is already large enough. Otherwise the request is checked against
 * metrics.max_size and, above metrics.min_size with keep_free set, against
 * the free space reported by fstatvfs() minus keep_free. The new size is
 * then rounded up to a multiple of FILE_SIZE_INCREASE (capped at max_size),
 * the extension is allocated with posix_fallocate(), last_stat is refreshed
 * and arena_size is updated in the header.
 *
 * NOTE(review): POSIX specifies f_bfree in units of f_frsize and offers
 * f_bavail for space available to unprivileged users - confirm that
 * f_bfree * f_bsize is the intended computation.
 * NOTE(review): offset + size is not checked for uint64_t wrap-around in
 * the visible lines - confirm callers bound these values. */
321 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
322 uint64_t old_size, new_size;
327 /* We assume that this file is not sparse, and we know that
328 * for sure, since we always call posix_fallocate()
332 le64toh(f->header->header_size) +
333 le64toh(f->header->arena_size);
335 new_size = PAGE_ALIGN(offset + size);
336 if (new_size < le64toh(f->header->header_size))
337 new_size = le64toh(f->header->header_size);
339 if (new_size <= old_size)
342 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
345 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
348 if (fstatvfs(f->fd, &svfs) >= 0) {
351 available = svfs.f_bfree * svfs.f_bsize;
353 if (available >= f->metrics.keep_free)
354 available -= f->metrics.keep_free;
358 if (new_size - old_size > available)
363 /* Increase by larger blocks at once */
364 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
365 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
366 new_size = f->metrics.max_size;
368 /* Note that the glibc fallocate() fallback is very
369 inefficient, hence we try to minimize the allocation area
371 r = posix_fallocate(f->fd, old_size, new_size - old_size);
375 if (fstat(f->fd, &f->last_stat) < 0)
378 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
/* Maps the byte range [offset, offset+size) of the file and returns a
 * pointer to it in *ret via mmap_cache_get(). Before mapping, the range is
 * compared with the cached file size; if it appears to be out of range the
 * fstat() data is refreshed once, and -EADDRNOTAVAIL is returned when fstat()
 * fails or the range is still beyond the end of the file. */
383 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
390 /* Avoid SIGBUS on invalid accesses */
391 if (offset + size > (uint64_t) f->last_stat.st_size) {
392 /* Hmm, out of range? Let's refresh the fstat() data
393 * first, before we trust that check. */
395 if (fstat(f->fd, &f->last_stat) < 0 ||
396 offset + size > (uint64_t) f->last_stat.st_size)
397 return -EADDRNOTAVAIL;
400 return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
/* Returns the smallest valid size for an object of o's type, looked up in a
 * per-type table of struct sizes. Types beyond the table, or types without
 * a table entry (zero), fall back to sizeof(ObjectHeader). */
403 static uint64_t minimum_header_size(Object *o) {
405 static const uint64_t table[] = {
406 [OBJECT_DATA] = sizeof(DataObject),
407 [OBJECT_FIELD] = sizeof(FieldObject),
408 [OBJECT_ENTRY] = sizeof(EntryObject),
409 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
410 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
411 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
412 [OBJECT_TAG] = sizeof(TagObject),
415 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
416 return sizeof(ObjectHeader);
418 return table[o->object.type];
/* Maps the object stored at `offset` and validates its header. The offset
 * must pass VALID64(). First only the ObjectHeader is mapped; the stored
 * size is then checked against sizeof(ObjectHeader) and the per-type
 * minimum, the stored type against OBJECT_UNUSED and, when `type` is
 * positive, against the requested type. Objects larger than the bare header
 * are mapped a second time with their full size. */
421 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
430 /* Objects may only be located at multiple of 64 bit */
431 if (!VALID64(offset))
435 r = journal_file_move_to(f, type_to_context(type), false, offset, sizeof(ObjectHeader), &t);
440 s = le64toh(o->object.size);
442 if (s < sizeof(ObjectHeader))
445 if (o->object.type <= OBJECT_UNUSED)
448 if (s < minimum_header_size(o))
451 if (type > 0 && o->object.type != type)
/* NOTE(review): the first mapping above passes type_to_context(type) as
 * context, this one passes o->object.type directly - confirm which is
 * intended. */
454 if (s > sizeof(ObjectHeader)) {
455 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
/* Picks the sequence number for the next entry: one more than the header's
 * tail_entry_seqnum, reconciled with the optional external counter as the
 * comment below describes. The result is stored as the new
 * tail_entry_seqnum, and also as head_entry_seqnum if that is still 0. */
466 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
471 r = le64toh(f->header->tail_entry_seqnum) + 1;
474 /* If an external seqnum counter was passed, we update
475 * both the local and the external one, and set it to
476 * the maximum of both */
484 f->header->tail_entry_seqnum = htole64(r);
486 if (f->header->head_entry_seqnum == 0)
487 f->header->head_entry_seqnum = htole64(r);
/* Appends a new, empty object of the given type and size at the end of the
 * file. The file is set online, the position `p` is derived from the current
 * tail object (tail offset plus its 64-bit aligned size) or from the header
 * size (presumably when there is no tail object yet - the condition is not
 * visible here), space is allocated, the range is mapped, and type and size
 * are written into the new object header. Finally tail_object_offset and
 * n_objects in the file header are updated. */
492 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
499 assert(type > 0 && type < _OBJECT_TYPE_MAX);
500 assert(size >= sizeof(ObjectHeader));
504 r = journal_file_set_online(f);
508 p = le64toh(f->header->tail_object_offset);
510 p = le64toh(f->header->header_size);
512 r = journal_file_move_to_object(f, -1, p, &tail);
516 p += ALIGN64(le64toh(tail->object.size));
519 r = journal_file_allocate(f, p, size);
523 r = journal_file_move_to(f, type, false, p, size, &t);
530 o->object.type = type;
531 o->object.size = htole64(size);
533 f->header->tail_object_offset = htole64(p);
534 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
/* Creates the data hash table object in a new file. The table size in bytes
 * is derived from metrics.max_size (see the estimate below) with
 * DEFAULT_DATA_HASH_TABLE_SIZE as lower bound. The object is appended, its
 * items are zeroed, and the offset of the items array plus the table size
 * are recorded in the file header. */
542 static int journal_file_setup_data_hash_table(JournalFile *f) {
549 /* We estimate that we need 1 hash table entry per 768 of
550 journal file and we want to make sure we never get beyond
551 75% fill level. Calculate the hash table size for the
552 maximum file size based on these metrics. */
554 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
555 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
556 s = DEFAULT_DATA_HASH_TABLE_SIZE;
558 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
560 r = journal_file_append_object(f,
561 OBJECT_DATA_HASH_TABLE,
562 offsetof(Object, hash_table.items) + s,
567 memzero(o->hash_table.items, s);
569 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
570 f->header->data_hash_table_size = htole64(s);
/* Creates the field hash table object in a new file, always using
 * DEFAULT_FIELD_HASH_TABLE_SIZE bytes. The object is appended, its items
 * are zeroed, and the offset of the items array plus the table size are
 * recorded in the file header. */
575 static int journal_file_setup_field_hash_table(JournalFile *f) {
582 /* We use a fixed size hash table for the fields as this
583 * number should grow very slowly only */
585 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
586 r = journal_file_append_object(f,
587 OBJECT_FIELD_HASH_TABLE,
588 offsetof(Object, hash_table.items) + s,
593 memzero(o->hash_table.items, s);
595 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
596 f->header->field_hash_table_size = htole64(s);
/* Maps the data hash table, using the offset and size stored in the file
 * header, and remembers the mapping in f->data_hash_table. */
601 static int journal_file_map_data_hash_table(JournalFile *f) {
608 p = le64toh(f->header->data_hash_table_offset);
609 s = le64toh(f->header->data_hash_table_size);
611 r = journal_file_move_to(f,
612 OBJECT_DATA_HASH_TABLE,
619 f->data_hash_table = t;
/* Maps the field hash table, using the offset and size stored in the file
 * header, and remembers the mapping in f->field_hash_table. */
623 static int journal_file_map_field_hash_table(JournalFile *f) {
630 p = le64toh(f->header->field_hash_table_offset);
631 s = le64toh(f->header->field_hash_table_size);
633 r = journal_file_move_to(f,
634 OBJECT_FIELD_HASH_TABLE,
641 f->field_hash_table = t;
/* Hooks a freshly written field object into the field hash table. The
 * object must be of type OBJECT_FIELD; its chain pointers are reset, the
 * bucket is selected as hash modulo the number of HashItems, and the object
 * becomes either the bucket head or the successor of the previous bucket
 * tail (which is re-mapped to patch its next_hash_offset). The bucket tail
 * is then set to the new object and n_fields is incremented if the header
 * contains that field. */
645 static int journal_file_link_field(
658 if (o->object.type != OBJECT_FIELD)
661 /* This might alter the window we are looking at */
663 o->field.next_hash_offset = o->field.head_data_offset = 0;
665 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
666 p = le64toh(f->field_hash_table[h].tail_hash_offset);
668 f->field_hash_table[h].head_hash_offset = htole64(offset);
670 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
674 o->field.next_hash_offset = htole64(offset);
677 f->field_hash_table[h].tail_hash_offset = htole64(offset);
679 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
680 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
/* Hooks a freshly written data object into the data hash table. The object
 * must be of type OBJECT_DATA; its chain pointers, entry links and entry
 * counter are reset, the bucket is selected as hash modulo the number of
 * HashItems, and the object becomes either the bucket head or the successor
 * of the previous bucket tail (which is re-mapped to patch its
 * next_hash_offset). The bucket tail is then set to the new object and
 * n_data is incremented if the header contains that field. */
685 static int journal_file_link_data(
698 if (o->object.type != OBJECT_DATA)
701 /* This might alter the window we are looking at */
703 o->data.next_hash_offset = o->data.next_field_offset = 0;
704 o->data.entry_offset = o->data.entry_array_offset = 0;
705 o->data.n_entries = 0;
707 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
708 p = le64toh(f->data_hash_table[h].tail_hash_offset);
710 /* Only entry in the hash table is easy */
711 f->data_hash_table[h].head_hash_offset = htole64(offset);
713 /* Move back to the previous data object, to patch in
716 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
720 o->data.next_hash_offset = htole64(offset);
723 f->data_hash_table[h].tail_hash_offset = htole64(offset);
725 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
726 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
/* Looks up the field object whose payload equals `field` (`size` bytes,
 * non-empty) given its precomputed hash. An empty field hash table is
 * special-cased. Otherwise the bucket chain selected by hash modulo the
 * number of HashItems is walked via next_hash_offset, and a candidate
 * matches when its stored hash, its object size and its payload bytes all
 * agree. */
731 int journal_file_find_field_object_with_hash(
733 const void *field, uint64_t size, uint64_t hash,
734 Object **ret, uint64_t *offset) {
736 uint64_t p, osize, h;
740 assert(field && size > 0);
742 osize = offsetof(Object, field.payload) + size;
744 if (f->header->field_hash_table_size == 0)
747 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
748 p = le64toh(f->field_hash_table[h].head_hash_offset);
753 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
757 if (le64toh(o->field.hash) == hash &&
758 le64toh(o->object.size) == osize &&
759 memcmp(o->field.payload, field, size) == 0) {
769 p = le64toh(o->field.next_hash_offset);
/* Convenience wrapper: hashes the (non-empty) field name with hash64() and
 * forwards to journal_file_find_field_object_with_hash(). */
775 int journal_file_find_field_object(
777 const void *field, uint64_t size,
778 Object **ret, uint64_t *offset) {
783 assert(field && size > 0);
785 hash = hash64(field, size);
787 return journal_file_find_field_object_with_hash(f,
/* Looks up the data object whose payload equals `data` (`size` bytes, may be
 * empty) given its precomputed hash. An empty data hash table is
 * special-cased. Otherwise the bucket chain selected by hash modulo the
 * number of HashItems is walked via next_hash_offset. Candidates with a
 * different stored hash are passed over. Compressed candidates (any
 * OBJECT_COMPRESSION_MASK flag set) are decompressed into f->compress_buffer
 * and compared there; uncompressed ones match when the object size and the
 * payload bytes agree. */
792 int journal_file_find_data_object_with_hash(
794 const void *data, uint64_t size, uint64_t hash,
795 Object **ret, uint64_t *offset) {
797 uint64_t p, osize, h;
801 assert(data || size == 0);
803 osize = offsetof(Object, data.payload) + size;
805 if (f->header->data_hash_table_size == 0)
808 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
809 p = le64toh(f->data_hash_table[h].head_hash_offset);
814 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
818 if (le64toh(o->data.hash) != hash)
821 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
824 l = le64toh(o->object.size);
825 if (l <= offsetof(Object, data.payload))
828 l -= offsetof(Object, data.payload);
830 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
831 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
836 memcmp(f->compress_buffer, data, size) == 0) {
847 } else if (le64toh(o->object.size) == osize &&
848 memcmp(o->data.payload, data, size) == 0) {
860 p = le64toh(o->data.next_hash_offset);
/* Convenience wrapper: hashes the data blob with hash64() and forwards to
 * journal_file_find_data_object_with_hash(). `data` may only be NULL when
 * `size` is 0. */
866 int journal_file_find_data_object(
868 const void *data, uint64_t size,
869 Object **ret, uint64_t *offset) {
874 assert(data || size == 0);
876 hash = hash64(data, size);
878 return journal_file_find_data_object_with_hash(f,
/* Returns the field object for the given (non-empty) field name, creating it
 * if necessary. The name is hashed and looked up first; when a new object is
 * needed it is appended, filled with hash and payload, linked into the field
 * hash table, re-mapped (linking may have moved the window) and fed to the
 * HMAC. */
883 static int journal_file_append_field(
885 const void *field, uint64_t size,
886 Object **ret, uint64_t *offset) {
894 assert(field && size > 0);
896 hash = hash64(field, size);
898 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
912 osize = offsetof(Object, field.payload) + size;
913 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
917 o->field.hash = htole64(hash);
918 memcpy(o->field.payload, field, size);
920 r = journal_file_link_field(f, o, p, hash);
924 /* The linking might have altered the window, so let's
925 * refresh our pointer */
926 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
931 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
/* Returns the data object for the given blob, creating it if necessary. The
 * blob is hashed and looked up first; when a new object is needed it is
 * appended with room for the uncompressed payload and its hash is stored.
 * With compression support built in, blobs of at least
 * COMPRESSION_SIZE_THRESHOLD bytes are compressed straight into the payload
 * and the object size/flags are adjusted; otherwise the payload is copied
 * verbatim. The object is then linked into the data hash table and
 * re-mapped. If the blob contains '=' after at least one byte, the part
 * before it is registered as a field object and the data object is put at
 * the head of that field's data list. Finally the object is fed to the
 * HMAC. */
945 static int journal_file_append_data(
947 const void *data, uint64_t size,
948 Object **ret, uint64_t *offset) {
953 int r, compression = 0;
957 assert(data || size == 0);
959 hash = hash64(data, size);
961 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
975 osize = offsetof(Object, data.payload) + size;
976 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
980 o->data.hash = htole64(hash);
982 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
/* NOTE(review): only compress_xz is tested although this section is also
 * built for HAVE_LZ4 - confirm that files with only compress_lz4 set are
 * meant to skip compression. */
983 if (f->compress_xz &&
984 size >= COMPRESSION_SIZE_THRESHOLD) {
987 compression = compress_blob(data, size, o->data.payload, &rsize);
990 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
991 o->object.flags |= compression;
993 log_debug("Compressed data object %"PRIu64" -> %"PRIu64" using %s",
994 size, rsize, object_compressed_to_string(compression));
999 if (!compression && size > 0)
1000 memcpy(o->data.payload, data, size);
1002 r = journal_file_link_data(f, o, p, hash);
1006 /* The linking might have altered the window, so let's
1007 * refresh our pointer */
1008 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1015 eq = memchr(data, '=', size);
1016 if (eq && eq > data) {
1020 /* Create field object ... */
1021 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1025 /* ... and link it in. */
1026 o->data.next_field_offset = fo->field.head_data_offset;
/* NOTE(review): p is a host-order offset stored into an on-disk field;
 * other offset stores in this file use htole64() - confirm le64toh() is
 * not a typo. */
1027 fo->field.head_data_offset = le64toh(p);
1031 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
/* Number of EntryItems in an entry object: the object size minus the fixed
 * part up to entry.items, divided by sizeof(EntryItem). Objects of another
 * type are special-cased. */
1045 uint64_t journal_file_entry_n_items(Object *o) {
1048 if (o->object.type != OBJECT_ENTRY)
1051 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
/* Number of 64-bit slots in an entry array object: the object size minus
 * the fixed part up to entry_array.items, divided by sizeof(uint64_t).
 * Objects of another type are special-cased. */
1054 uint64_t journal_file_entry_array_n_items(Object *o) {
1057 if (o->object.type != OBJECT_ENTRY_ARRAY)
1060 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
/* Number of HashItems in a data or field hash table object: the object size
 * minus the fixed part up to hash_table.items, divided by sizeof(HashItem).
 * Objects of any other type are special-cased. */
1063 uint64_t journal_file_hash_table_n_items(Object *o) {
1066 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1067 o->object.type != OBJECT_FIELD_HASH_TABLE)
1070 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
/* Stores entry offset `p` at logical index *idx of the entry array chain
 * starting at *first (both kept in little-endian form). The chain is walked
 * via next_entry_array_offset; when the index falls into an existing array
 * the slot is written and *idx incremented. Otherwise a new entry array
 * object is appended, fed to the HMAC, given `p` in slot i, and linked
 * either as the chain head (*first) or behind the previous array `ap`.
 * n_entry_arrays is incremented if the header has that field, and *idx is
 * incremented. */
1073 static int link_entry_into_array(JournalFile *f,
1078 uint64_t n = 0, ap = 0, q, i, a, hidx;
1086 a = le64toh(*first);
1087 i = hidx = le64toh(*idx);
1090 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1094 n = journal_file_entry_array_n_items(o);
1096 o->entry_array.items[i] = htole64(p);
1097 *idx = htole64(hidx + 1);
1103 a = le64toh(o->entry_array.next_entry_array_offset);
1114 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1115 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1121 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1126 o->entry_array.items[i] = htole64(p);
1129 *first = htole64(q);
1131 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1135 o->entry_array.next_entry_array_offset = htole64(q);
1138 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1139 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1141 *idx = htole64(hidx + 1);
/* Variant of link_entry_into_array() for chains that keep one offset inline
 * in *extra in addition to the array chain. The visible lines either store
 * `p` in *extra, or link it into the chain at index *idx - 1; afterwards
 * *idx is incremented. The condition choosing between the two is not
 * visible here. */
1146 static int link_entry_into_array_plus_one(JournalFile *f,
1161 *extra = htole64(p);
1165 i = htole64(le64toh(*idx) - 1);
1166 r = link_entry_into_array(f, first, &i, p);
1171 *idx = htole64(le64toh(*idx) + 1);
/* Links an entry to the data object referenced by its i-th item: reads the
 * item's object offset, maps that data object and records the entry in the
 * data object's entry list via link_entry_into_array_plus_one(). */
1175 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1182 p = le64toh(o->entry.items[i].object_offset);
1186 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1190 return link_entry_into_array_plus_one(f,
1191 &o->data.entry_offset,
1192 &o->data.entry_array_offset,
/* Makes a freshly written entry object reachable. After a memory barrier
 * the entry is added to the file-wide entry array (header entry_array_offset
 * / n_entries), the header's head/tail realtime and tail monotonic
 * timestamps are updated, the tail monotonic timestamp is flagged valid, and
 * the entry is linked to each of the data objects it references. */
1197 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1205 if (o->object.type != OBJECT_ENTRY)
1208 __sync_synchronize();
1210 /* Link up the entry itself */
1211 r = link_entry_into_array(f,
1212 &f->header->entry_array_offset,
1213 &f->header->n_entries,
1218 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1220 if (f->header->head_entry_realtime == 0)
1221 f->header->head_entry_realtime = o->entry.realtime;
1223 f->header->tail_entry_realtime = o->entry.realtime;
1224 f->header->tail_entry_monotonic = o->entry.monotonic;
1226 f->tail_entry_monotonic_valid = true;
1228 /* Link up the items */
1229 n = journal_file_entry_n_items(o);
1230 for (i = 0; i < n; i++) {
1231 r = journal_file_link_entry_item(f, o, offset, i);
/* Writes one entry object for an already prepared list of items. The object
 * is appended with room for n_items EntryItems, then filled with a fresh
 * sequence number, the items, both timestamps, the XOR hash and the header's
 * boot id. It is fed to the HMAC and finally linked into the entry arrays
 * via journal_file_link_entry(). */
1239 static int journal_file_append_entry_internal(
1241 const dual_timestamp *ts,
1243 const EntryItem items[], unsigned n_items,
1245 Object **ret, uint64_t *offset) {
1252 assert(items || n_items == 0);
1255 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1257 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1261 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1262 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1263 o->entry.realtime = htole64(ts->realtime);
1264 o->entry.monotonic = htole64(ts->monotonic);
1265 o->entry.xor_hash = htole64(xor_hash);
1266 o->entry.boot_id = f->header->boot_id;
1269 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1274 r = journal_file_link_entry(f, o, np);
/* Notifies inotify watchers about changes made through the memory mapping:
 * issues a full memory barrier and then ftruncate()s the file to its cached
 * size. A failing ftruncate() is only logged. */
1287 void journal_file_post_change(JournalFile *f) {
1290 /* inotify() does not receive IN_MODIFY events from file
1291 * accesses done via mmap(). After each access we hence
1292 * trigger IN_MODIFY by truncating the journal file to its
1293 * current size which triggers IN_MODIFY. */
1295 __sync_synchronize();
1297 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1298 log_error("Failed to truncate file to its own size: %m");
/* qsort-style comparator for EntryItems that orders by object_offset,
 * compared in host byte order. */
1301 static int entry_item_cmp(const void *_a, const void *_b) {
1302 const EntryItem *a = _a, *b = _b;
1304 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1306 if (le64toh(a->object_offset) > le64toh(b->object_offset))
/* Appends a complete log entry built from n_iovec data blobs. A local
 * timestamp is obtained with dual_timestamp_get() (presumably when no `ts`
 * was passed - the condition is not visible here). An entry whose monotonic
 * timestamp is older than the file's valid tail timestamp is special-cased.
 * A tag may be appended first. Each iovec is stored (or found) as a data
 * object; its offset and hash are recorded in a stack-allocated item array
 * and its hash is XORed into xor_hash. The items are sorted by on-disk
 * offset, the entry object is written, and watchers are notified via
 * journal_file_post_change(). */
1311 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1315 uint64_t xor_hash = 0;
1316 struct dual_timestamp _ts;
1319 assert(iovec || n_iovec == 0);
1322 dual_timestamp_get(&_ts);
1326 if (f->tail_entry_monotonic_valid &&
1327 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1331 r = journal_file_maybe_append_tag(f, ts->realtime);
1336 /* alloca() can't take 0, hence let's allocate at least one */
1337 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1339 for (i = 0; i < n_iovec; i++) {
1343 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1347 xor_hash ^= le64toh(o->data.hash);
1348 items[i].object_offset = htole64(p);
1349 items[i].hash = o->data.hash;
1352 /* Order by the position on disk, in order to improve seek
1353 * times for rotating media. */
1354 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1356 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1358 journal_file_post_change(f);
/* One cached position inside an entry array chain; instances are stored in
 * f->chain_cache keyed by the `first` member. */
1363 typedef struct ChainCacheItem {
1364 uint64_t first; /* the array at the beginning of the chain */
1365 uint64_t array; /* the cached array */
1366 uint64_t begin; /* the first item in the cached array */
1367 uint64_t total; /* the total number of items in all arrays before this one in the chain */
1368 uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
/* Records a chain position in the chain cache hashmap `h`. When the map
 * already holds CHAIN_CACHE_MAX items the first one is stolen for reuse,
 * otherwise a new ChainCacheItem is allocated; the item is inserted keyed by
 * its `first` member and its fields, including last_index, are filled in. */
1371 static void chain_cache_put(
1378 uint64_t last_index) {
1381 /* If the chain item to cache for this chain is the
1382 * first one it's not worth caching anything */
1386 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1387 ci = hashmap_steal_first(h);
1389 ci = new(ChainCacheItem, 1);
1396 if (hashmap_put(h, &ci->first, ci) < 0) {
1401 assert(ci->first == first);
1406 ci->last_index = last_index;
/* Fetches the entry at logical index `i` of the entry array chain starting
 * at `first`. The chain cache is consulted first and used when `i` lies
 * beyond the cached total. The chain is then walked array by array via
 * next_entry_array_offset until the array holding index `i` is found. The
 * position is cached for the next call and the entry object at the stored
 * offset is mapped. */
1409 static int generic_array_get(
1413 Object **ret, uint64_t *offset) {
1416 uint64_t p = 0, a, t = 0;
1424 /* Try the chain cache first */
1425 ci = hashmap_get(f->chain_cache, &first);
1426 if (ci && i > ci->total) {
1435 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1439 k = journal_file_entry_array_n_items(o);
1441 p = le64toh(o->entry_array.items[i]);
1447 a = le64toh(o->entry_array.next_entry_array_offset);
1453 /* Let's cache this item for the next invocation */
1454 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1456 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Like generic_array_get(), for chains that keep one entry offset inline
 * (`extra`) ahead of the array chain: the visible lines either map the entry
 * at `extra` directly, or look up index i-1 in the chain. */
1469 static int generic_array_get_plus_one(
1474 Object **ret, uint64_t *offset) {
1483 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1496 return generic_array_get(f, first, i-1, ret, offset);
/* Bisects the entry array chain starting at `first` (holding `n` entries)
 * for `needle`, using the caller-supplied test_object() callback to compare
 * a stored offset against the needle. A TEST_FOUND result is folded into
 * TEST_RIGHT for DIRECTION_DOWN and TEST_LEFT for DIRECTION_UP. The chain
 * cache may be used to skip ahead to a previously visited array, and a
 * cached last_index is probed together with its direct neighbours before
 * the regular midpoint bisection runs. For DIRECTION_UP the result may be
 * moved back by one slot (subtract_one). The final position is stored in
 * the chain cache, the entry object is mapped, and *idx receives the logical
 * index t + i (minus one when subtract_one is set). */
1505 static int generic_array_bisect(
1510 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1511 direction_t direction,
1516 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1517 bool subtract_one = false;
1518 Object *o, *array = NULL;
1523 assert(test_object);
1525 /* Start with the first array in the chain */
1528 ci = hashmap_get(f->chain_cache, &first);
1529 if (ci && n > ci->total) {
1530 /* Ah, we have iterated this bisection array chain
1531 * previously! Let's see if we can skip ahead in the
1532 * chain, as far as the last time. But we can't jump
1533 * backwards in the chain, so let's check that
1536 r = test_object(f, ci->begin, needle);
1540 if (r == TEST_LEFT) {
1541 /* OK, what we are looking for is right of the
1542 * begin of this EntryArray, so let's jump
1543 * straight to previously cached array in the
1549 last_index = ci->last_index;
1554 uint64_t left, right, k, lp;
1556 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1560 k = journal_file_entry_array_n_items(array);
1566 lp = p = le64toh(array->entry_array.items[i]);
1570 r = test_object(f, p, needle);
1574 if (r == TEST_FOUND)
1575 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1577 if (r == TEST_RIGHT) {
1581 if (last_index != (uint64_t) -1) {
1582 assert(last_index <= right);
1584 /* If we cached the last index we
1585 * looked at, let's try to not to jump
1586 * too wildly around and see if we can
1587 * limit the range to look at early to
1588 * the immediate neighbors of the last
1589 * index we looked at. */
1591 if (last_index > 0) {
1592 uint64_t x = last_index - 1;
1594 p = le64toh(array->entry_array.items[x]);
1598 r = test_object(f, p, needle);
1602 if (r == TEST_FOUND)
1603 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1605 if (r == TEST_RIGHT)
1611 if (last_index < right) {
1612 uint64_t y = last_index + 1;
1614 p = le64toh(array->entry_array.items[y]);
1618 r = test_object(f, p, needle);
1622 if (r == TEST_FOUND)
1623 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1625 if (r == TEST_RIGHT)
1633 if (left == right) {
1634 if (direction == DIRECTION_UP)
1635 subtract_one = true;
1641 assert(left < right);
1642 i = (left + right) / 2;
1644 p = le64toh(array->entry_array.items[i]);
1648 r = test_object(f, p, needle);
1652 if (r == TEST_FOUND)
1653 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1655 if (r == TEST_RIGHT)
1663 if (direction == DIRECTION_UP) {
1665 subtract_one = true;
1676 last_index = (uint64_t) -1;
1677 a = le64toh(array->entry_array.next_entry_array_offset);
1683 if (subtract_one && t == 0 && i == 0)
1686 /* Let's cache this item for the next invocation */
1687 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1689 if (subtract_one && i == 0)
1691 else if (subtract_one)
1692 p = le64toh(array->entry_array.items[i-1]);
1694 p = le64toh(array->entry_array.items[i]);
1696 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1707 *idx = t + i + (subtract_one ? -1 : 0);
/* Bisection for chains that keep one entry offset inline (`extra`) ahead of
 * the array chain. The inline entry is tested first, with TEST_FOUND folded
 * into TEST_RIGHT / TEST_LEFT depending on the direction. For DIRECTION_UP
 * the inline entry is remembered as a fallback (step_back). The remaining
 * n-1 entries are bisected with generic_array_bisect(); when that finds
 * nothing and step_back is set, the inline entry is mapped instead. */
1713 static int generic_array_bisect_plus_one(
1719 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1720 direction_t direction,
1726 bool step_back = false;
1730 assert(test_object);
1735 /* This bisects the array in object 'first', but first checks
1737 r = test_object(f, extra, needle);
1741 if (r == TEST_FOUND)
1742 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1744 /* if we are looking with DIRECTION_UP then we need to first
1745 see if in the actual array there is a matching entry, and
1746 return the last one of that. But if there isn't any we need
1747 to return this one. Hence remember this, and return it
1750 step_back = direction == DIRECTION_UP;
1752 if (r == TEST_RIGHT) {
1753 if (direction == DIRECTION_DOWN)
1759 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1761 if (r == 0 && step_back)
1770 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
/* Bisection predicate that compares the offset `p` itself with `needle`;
 * it does not touch the file. */
1786 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1792 else if (p < needle)
/* Seeks to an entry by file offset: bisects the file-wide entry array
 * (header entry_array_offset, n_entries) with generic_array_bisect(). */
1798 int journal_file_move_to_entry_by_offset(
1801 direction_t direction,
1805 return generic_array_bisect(f,
1806 le64toh(f->header->entry_array_offset),
1807 le64toh(f->header->n_entries),
/* Bisection predicate: maps the entry object at `p` and compares its
 * sequence number (host byte order) with `needle`. */
1815 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1822 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1826 if (le64toh(o->entry.seqnum) == needle)
1828 else if (le64toh(o->entry.seqnum) < needle)
/* Seeks to an entry by sequence number: bisects the file-wide entry array
 * (header entry_array_offset, n_entries) with generic_array_bisect(). */
1834 int journal_file_move_to_entry_by_seqnum(
1837 direction_t direction,
1841 return generic_array_bisect(f,
1842 le64toh(f->header->entry_array_offset),
1843 le64toh(f->header->n_entries),
/* Bisection predicate: maps the entry object at `p` and compares its
 * realtime timestamp (host byte order) with `needle`. */
1850 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1857 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1861 if (le64toh(o->entry.realtime) == needle)
1863 else if (le64toh(o->entry.realtime) < needle)
/* Seeks to an entry by realtime timestamp: bisects the file-wide entry array
 * (header entry_array_offset, n_entries) using test_object_realtime as
 * predicate. */
1869 int journal_file_move_to_entry_by_realtime(
1872 direction_t direction,
1876 return generic_array_bisect(f,
1877 le64toh(f->header->entry_array_offset),
1878 le64toh(f->header->n_entries),
1880 test_object_realtime,
/* Bisection predicate: maps the entry object at `p` and compares its
 * monotonic timestamp (host byte order) with `needle`. */
1885 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1892 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1896 if (le64toh(o->entry.monotonic) == needle)
1898 else if (le64toh(o->entry.monotonic) < needle)
/* Looks up the data object for "_BOOT_ID=<id>". The buffer holds the 9-byte
 * prefix, room for 32 more characters and one trailing byte; the id string
 * is written right after the prefix, and the lookup uses the buffer without
 * its last byte. */
1904 static inline int find_data_object_by_boot_id(
1909 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1911 sd_id128_to_string(boot_id, t + 9);
1912 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
/* Seeks to an entry by monotonic timestamp within one boot. Monotonic
 * timestamps are looked up per boot ID here: the _BOOT_ID= data object is
 * located first and then only the entries linked from it are bisected. */
1915 int journal_file_move_to_entry_by_monotonic(
1919 direction_t direction,
/* The offset of the data object is not needed, hence NULL for the last argument. */
1928 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
/* The "plus one" variant takes the data object's entry_offset separately,
 * in addition to its entry array (entry_array_offset) and n_entries. */
1934 return generic_array_bisect_plus_one(f,
1935 le64toh(o->data.entry_offset),
1936 le64toh(o->data.entry_array_offset),
1937 le64toh(o->data.n_entries),
1939 test_object_monotonic,
/* Steps from the entry (o, p) to the neighbouring entry in the given
 * direction within the file-wide entry array, and fetches it with
 * generic_array_get().
 * NOTE(review): many lines of this definition (error checks, index
 * adjustment, returns) are not visible in this listing; the comments below
 * describe only what the visible lines do. */
1944 int journal_file_next_entry(
1946 Object *o, uint64_t p,
1947 direction_t direction,
1948 Object **ret, uint64_t *offset) {
/* An object may only be passed together with a non-zero offset. */
1954 assert(p > 0 || !o);
1956 n = le64toh(f->header->n_entries);
/* Starting index: the first entry when moving down, otherwise the last one. */
1961 i = direction == DIRECTION_DOWN ? 0 : n - 1;
/* Only entry objects are acceptable as the current position. */
1963 if (o->object.type != OBJECT_ENTRY)
/* Find the index of the current entry in the file-wide entry array. */
1966 r = generic_array_bisect(f,
1967 le64toh(f->header->entry_array_offset),
1968 le64toh(f->header->n_entries),
1977 if (direction == DIRECTION_DOWN) {
1990 /* And jump to it */
1991 r = generic_array_get(f,
1992 le64toh(f->header->entry_array_offset),
/* Sanity check: the fetched offset must lie strictly beyond p in the
 * direction of travel; otherwise the entry array is reported as corrupted. */
1999 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2000 log_debug("%s: entry array corrupted at entry %"PRIu64,
/* Moves from the entry (o, p) by a signed number of entries (skip) within
 * the file-wide entry array and fetches the resulting entry with
 * generic_array_get().
 * NOTE(review): several lines (declarations, error checks, clamping of the
 * index against n) are not visible in this listing. */
2011 int journal_file_skip_entry(
2013 Object *o, uint64_t p,
2015 Object **ret, uint64_t *offset) {
/* Only entry objects are acceptable as the current position. */
2024 if (o->object.type != OBJECT_ENTRY)
/* Find the index i of the current entry in the file-wide entry array. */
2027 r = generic_array_bisect(f,
2028 le64toh(f->header->entry_array_offset),
2029 le64toh(f->header->n_entries),
2038 /* Calculate new index */
/* For a backwards skip the magnitude is compared with i first, because i is
 * unsigned and the subtraction below must not wrap around. */
2040 if ((uint64_t) -skip >= i)
2043 i = i - (uint64_t) -skip;
2045 i += (uint64_t) skip;
2047 n = le64toh(f->header->n_entries);
2054 return generic_array_get(f,
2055 le64toh(f->header->entry_array_offset),
/* Like stepping to the next entry, but restricted to the entries linked from
 * the data object at data_offset: the entry list of that data object
 * (entry_offset, entry_array_offset, n_entries) is used instead of the
 * file-wide entry array.
 * NOTE(review): error checks, index adjustment and returns are not visible
 * in this listing. */
2060 int journal_file_next_entry_for_data(
2062 Object *o, uint64_t p,
2063 uint64_t data_offset,
2064 direction_t direction,
2065 Object **ret, uint64_t *offset) {
/* An object may only be passed together with a non-zero offset. */
2072 assert(p > 0 || !o);
/* Map the data object whose entry list is to be walked. */
2074 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2078 n = le64toh(d->data.n_entries);
/* Starting index: the first entry when moving down, otherwise the last one. */
2083 i = direction == DIRECTION_DOWN ? 0 : n - 1;
/* Only entry objects are acceptable as the current position. */
2085 if (o->object.type != OBJECT_ENTRY)
/* Find the index of the current entry within this data object's entry list. */
2088 r = generic_array_bisect_plus_one(f,
2089 le64toh(d->data.entry_offset),
2090 le64toh(d->data.entry_array_offset),
2091 le64toh(d->data.n_entries),
2101 if (direction == DIRECTION_DOWN) {
2115 return generic_array_get_plus_one(f,
2116 le64toh(d->data.entry_offset),
2117 le64toh(d->data.entry_array_offset),
/* Seeks by entry offset, restricted to the entries linked from the data
 * object at data_offset: that data object is mapped and its entry list is
 * bisected with generic_array_bisect_plus_one().
 * NOTE(review): the needle, comparator and output arguments of the call are
 * not visible in this listing. */
2122 int journal_file_move_to_entry_by_offset_for_data(
2124 uint64_t data_offset,
2126 direction_t direction,
2127 Object **ret, uint64_t *offset) {
2134 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2138 return generic_array_bisect_plus_one(f,
2139 le64toh(d->data.entry_offset),
2140 le64toh(d->data.entry_array_offset),
2141 le64toh(d->data.n_entries),
/* Seeks by monotonic timestamp within one boot, restricted to the entries
 * linked from the data object at data_offset. Two entry lists are involved:
 * the one of the _BOOT_ID= data object (offset kept in b) and the one of the
 * requested data object. The existing comments describe the strategy: seek
 * by time first, then keep seeking until an entry is found that exists in
 * both lists.
 * NOTE(review): the loop construct, its termination conditions and the
 * returns are not visible in this listing. */
2148 int journal_file_move_to_entry_by_monotonic_for_data(
2150 uint64_t data_offset,
2153 direction_t direction,
2154 Object **ret, uint64_t *offset) {
2162 /* First, seek by time */
/* b receives the offset of the boot ID data object so that it can be mapped
 * again further down. */
2163 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2169 r = generic_array_bisect_plus_one(f,
2170 le64toh(o->data.entry_offset),
2171 le64toh(o->data.entry_array_offset),
2172 le64toh(o->data.n_entries),
2174 test_object_monotonic,
2180 /* And now, continue seeking until we find an entry that
2181 * exists in both bisection arrays */
/* Bisect the entry list of the requested data object ... */
2187 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2191 r = generic_array_bisect_plus_one(f,
2192 le64toh(d->data.entry_offset),
2193 le64toh(d->data.entry_array_offset),
2194 le64toh(d->data.n_entries),
/* ... then map the boot ID data object again via its saved offset b and
 * bisect its entry list. */
2202 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2206 r = generic_array_bisect_plus_one(f,
2207 le64toh(o->data.entry_offset),
2208 le64toh(o->data.entry_array_offset),
2209 le64toh(o->data.n_entries),
/* Seeks by sequence number, restricted to the entries linked from the data
 * object at data_offset: that data object is mapped and its entry list is
 * bisected with generic_array_bisect_plus_one().
 * NOTE(review): the needle, comparator and output arguments of the call are
 * not visible in this listing. */
2231 int journal_file_move_to_entry_by_seqnum_for_data(
2233 uint64_t data_offset,
2235 direction_t direction,
2236 Object **ret, uint64_t *offset) {
2243 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2247 return generic_array_bisect_plus_one(f,
2248 le64toh(d->data.entry_offset),
2249 le64toh(d->data.entry_array_offset),
2250 le64toh(d->data.n_entries),
/* Seeks by realtime timestamp, restricted to the entries linked from the
 * data object at data_offset: that data object is mapped and its entry list
 * is bisected with test_object_realtime as the comparator. */
2257 int journal_file_move_to_entry_by_realtime_for_data(
2259 uint64_t data_offset,
2261 direction_t direction,
2262 Object **ret, uint64_t *offset) {
2269 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2273 return generic_array_bisect_plus_one(f,
2274 le64toh(d->data.entry_offset),
2275 le64toh(d->data.entry_array_offset),
2276 le64toh(d->data.n_entries),
2278 test_object_realtime,
/* Debug helper: prints the file header and then walks the objects of the
 * file one after the other, printing one "Type: ..." line per object to
 * stdout (plus a "Flags:" line for compressed objects). An error path logs
 * "File corrupt".
 * NOTE(review): the loop construct, the case labels for the first object
 * types and the error jumps are not visible in this listing. */
2283 void journal_file_dump(JournalFile *f) {
2290 journal_file_print_header(f);
/* The walk starts at the offset given by the header's header_size field. */
2292 p = le64toh(f->header->header_size);
/* NOTE(review): -1 as the type argument presumably means "accept any object
 * type" - confirm against journal_file_move_to_object(). */
2294 r = journal_file_move_to_object(f, -1, p, &o);
2298 switch (o->object.type) {
2301 printf("Type: OBJECT_UNUSED\n");
2305 printf("Type: OBJECT_DATA\n");
2309 printf("Type: OBJECT_FIELD\n");
2313 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2314 le64toh(o->entry.seqnum),
2315 le64toh(o->entry.monotonic),
2316 le64toh(o->entry.realtime));
2319 case OBJECT_FIELD_HASH_TABLE:
2320 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2323 case OBJECT_DATA_HASH_TABLE:
2324 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2327 case OBJECT_ENTRY_ARRAY:
2328 printf("Type: OBJECT_ENTRY_ARRAY\n");
2332 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2333 le64toh(o->tag.seqnum),
2334 le64toh(o->tag.epoch));
2338 printf("Type: unknown (%u)\n", o->object.type);
/* Only the compression bits of the flags byte are reported. */
2342 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2343 printf("Flags: %s\n",
2344 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
/* The object at tail_object_offset is the last one to be visited. */
2346 if (p == le64toh(f->header->tail_object_offset))
/* Advance by the object's size rounded up with ALIGN64(). */
2349 p = p + ALIGN64(le64toh(o->object.size));
2354 log_error("File corrupt");
/* Wrapper around format_timestamp() that formats t into buf (of size l).
 * NOTE(review): the handling of the result x is not visible in this listing;
 * judging by the name, a fallback string is presumably returned when
 * format_timestamp() yields nothing - confirm against the full definition. */
2357 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2360 x = format_timestamp(buf, l, t);
/* Prints a human-readable summary of the journal file header to stdout:
 * IDs, state, feature flags, sizes, sequence numbers, timestamps and object
 * counts, followed by optional counters that only exist in newer headers and
 * finally the disk usage of the file.
 * NOTE(review): some format lines and arguments of the big printf() are not
 * visible in this listing. */
2366 void journal_file_print_header(JournalFile *f) {
/* a..d receive the string forms of the four 128-bit IDs printed below. */
2367 char a[33], b[33], c[33], d[33];
/* x, y, z receive the formatted head/tail timestamps. */
2368 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2370 char bytes[FORMAT_BYTES_MAX];
2374 printf("File Path: %s\n"
2378 "Sequential Number ID: %s\n"
2380 "Compatible Flags:%s%s\n"
2381 "Incompatible Flags:%s%s%s\n"
2382 "Header size: %"PRIu64"\n"
2383 "Arena size: %"PRIu64"\n"
2384 "Data Hash Table Size: %"PRIu64"\n"
2385 "Field Hash Table Size: %"PRIu64"\n"
2386 "Rotate Suggested: %s\n"
2387 "Head Sequential Number: %"PRIu64"\n"
2388 "Tail Sequential Number: %"PRIu64"\n"
2389 "Head Realtime Timestamp: %s\n"
2390 "Tail Realtime Timestamp: %s\n"
2391 "Tail Monotonic Timestamp: %s\n"
2392 "Objects: %"PRIu64"\n"
2393 "Entry Objects: %"PRIu64"\n",
2395 sd_id128_to_string(f->header->file_id, a),
2396 sd_id128_to_string(f->header->machine_id, b),
2397 sd_id128_to_string(f->header->boot_id, c),
2398 sd_id128_to_string(f->header->seqnum_id, d),
2399 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2400 f->header->state == STATE_ONLINE ? "ONLINE" :
2401 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2402 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2403 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2404 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2405 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2406 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2407 le64toh(f->header->header_size),
2408 le64toh(f->header->arena_size),
2409 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2410 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2411 yes_no(journal_file_rotate_suggested(f, 0)),
2412 le64toh(f->header->head_entry_seqnum),
2413 le64toh(f->header->tail_entry_seqnum),
2414 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2415 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2416 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2417 le64toh(f->header->n_objects),
2418 le64toh(f->header->n_entries));
/* The counters below were added to the header after the initial format
 * design, so each one is printed only if this file's header contains it.
 * The fill level is the counter divided by the number of hash table items
 * (table size in bytes / sizeof(HashItem)), as a percentage. */
2420 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2421 printf("Data Objects: %"PRIu64"\n"
2422 "Data Hash Table Fill: %.1f%%\n",
2423 le64toh(f->header->n_data),
2424 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2426 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2427 printf("Field Objects: %"PRIu64"\n"
2428 "Field Hash Table Fill: %.1f%%\n",
2429 le64toh(f->header->n_fields),
2430 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2432 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2433 printf("Tag Objects: %"PRIu64"\n",
2434 le64toh(f->header->n_tags));
2435 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2436 printf("Entry Array Objects: %"PRIu64"\n",
2437 le64toh(f->header->n_entry_arrays));
/* Disk usage is st_blocks * 512 bytes; the line is simply omitted if
 * fstat() fails. */
2439 if (fstat(f->fd, &st) >= 0)
2440 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
/* Opens (and, for an empty writable file, initializes) a journal file and
 * sets up the JournalFile object for it. Visible steps, in order: validate
 * access mode and file name suffix, allocate the object, attach an mmap
 * cache, open and stat the file, initialize a fresh file (creation-time
 * xattr, sealing state, header), map and verify the header, set up metrics,
 * refresh the header, set up HMAC, and set up or map the two hash tables.
 * On the failure path the object is released with journal_file_close().
 * NOTE(review): the leading parameters, most error checks and the final
 * return are not visible in this listing. */
2443 int journal_file_open(
2449 JournalMetrics *metrics,
2450 MMapCache *mmap_cache,
2451 JournalFile *template,
2452 JournalFile **ret) {
2456 bool newly_created = false;
/* Only the O_RDONLY and O_RDWR access modes pass this check. */
2461 if ((flags & O_ACCMODE) != O_RDONLY &&
2462 (flags & O_ACCMODE) != O_RDWR)
/* The file name must end in ".journal" or ".journal~". */
2465 if (!endswith(fname, ".journal") &&
2466 !endswith(fname, ".journal~"))
2469 f = new0(JournalFile, 1);
2477 f->prot = prot_from_flags(flags);
2478 f->writable = (flags & O_ACCMODE) != O_RDONLY;
/* NOTE(review): this tests HAVE_LZ, while the field being set and the header
 * macro used elsewhere are named ...lz4 / ...LZ4. If the build system defines
 * HAVE_LZ4 rather than HAVE_LZ, this branch is never compiled and the XZ
 * branch is used instead - confirm the macro name. */
2479 #if defined(HAVE_LZ)
2480 f->compress_lz4 = compress;
2481 #elif defined(HAVE_XZ)
2482 f->compress_xz = compress;
/* Either take a reference on the caller's mmap cache or create a new one. */
2489 f->mmap = mmap_cache_ref(mmap_cache);
2491 f->mmap = mmap_cache_new();
2498 f->path = strdup(fname);
2504 f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2505 if (!f->chain_cache) {
2510 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2516 if (fstat(f->fd, &f->last_stat) < 0) {
/* A writable file of size zero is treated as new and gets initialized. */
2521 if (f->last_stat.st_size == 0 && f->writable) {
2524 /* Let's attach the creation time to the journal file,
2525 * so that the vacuuming code knows the age of this
2526 * file even if the file might end up corrupted one
2527 * day... Ideally we'd just use the creation time many
2528 * file systems maintain for each file, but there is
2529 * currently no usable API to query this, hence let's
2530 * emulate this via extended attributes. If extended
2531 * attributes are not supported we'll just skip this,
2532 * and rely solely on mtime/atime/ctime of the file.*/
/* The timestamp is stored little-endian; the result of fsetxattr() is
 * deliberately not checked (see the comment above). */
2534 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2535 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2538 /* Try to load the FSPRG state, and if we can't, then
2539 * just don't do sealing */
2541 r = journal_file_fss_load(f);
2547 r = journal_file_init_header(f, template);
/* Stat again so that last_stat reflects the state after initialization. */
2551 if (fstat(f->fd, &f->last_stat) < 0) {
2556 newly_created = true;
/* Files smaller than the minimal header size are not accepted. */
2559 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
/* Map the header as a shared mapping at file offset 0. */
2564 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2565 if (f->header == MAP_FAILED) {
/* Headers of pre-existing files are verified; fresh ones were just written. */
2571 if (!newly_created) {
2572 r = journal_file_verify_header(f);
/* For a new file the sealing state was already loaded above. */
2578 if (!newly_created && f->writable) {
2579 r = journal_file_fss_load(f);
/* Metrics: fill in defaults for the caller's metrics and copy them, or else
 * inherit the metrics of the template file. */
2587 journal_default_metrics(metrics, f->fd);
2588 f->metrics = *metrics;
2589 } else if (template)
2590 f->metrics = template->metrics;
2592 r = journal_file_refresh_header(f);
2598 r = journal_file_hmac_setup(f);
/* A new file needs its hash tables (and first tag) created before they can
 * be mapped. */
2603 if (newly_created) {
2604 r = journal_file_setup_field_hash_table(f);
2608 r = journal_file_setup_data_hash_table(f);
2613 r = journal_file_append_first_tag(f);
2619 r = journal_file_map_field_hash_table(f);
2623 r = journal_file_map_data_hash_table(f);
2631 journal_file_close(f);
/* Rotates a journal file: the current file is renamed to an archive name
 * derived from its seqnum ID, head sequence number and head realtime
 * timestamp, marked STATE_ARCHIVED, and a new file is opened under the
 * original path, using the old file as template. The old file is closed
 * afterwards.
 * NOTE(review): error checks, the assignment of old_file and the update of
 * *f are not visible in this listing. */
2636 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2637 _cleanup_free_ char *p = NULL;
2639 JournalFile *old_file, *new_file = NULL;
/* Only writable files whose name ends in ".journal" are rotated. */
2647 if (!old_file->writable)
2650 if (!endswith(old_file->path, ".journal"))
2653 l = strlen(old_file->path);
/* l - 8 strips the 8-character ".journal" suffix before "@..." is appended. */
2654 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2655 (int) l - 8, old_file->path,
2656 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2657 le64toh((*f)->header->head_entry_seqnum),
2658 le64toh((*f)->header->head_entry_realtime));
2662 r = rename(old_file->path, p);
2666 old_file->header->state = STATE_ARCHIVED;
/* The new file reuses the old file's path, flags, mode and mmap cache; NULL
 * metrics means the metrics come from the template (the old file). */
2668 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2669 journal_file_close(old_file);
/* Opens a journal file like journal_file_open(), but if the first attempt
 * fails with one of the error codes listed below, and the file was opened
 * for writing with O_CREAT under a ".journal" name, the broken file is
 * renamed to a ".journal~" name and the open is attempted once more.
 * NOTE(review): the leading parameters and the early returns taken when a
 * condition does not hold are not visible in this listing. */
2675 int journal_file_open_reliably(
2681 JournalMetrics *metrics,
2682 MMapCache *mmap_cache,
2683 JournalFile *template,
2684 JournalFile **ret) {
2688 _cleanup_free_ char *p = NULL;
2690 r = journal_file_open(fname, flags, mode, compress, seal,
2691 metrics, mmap_cache, template, ret);
/* Only these error codes are candidates for the rename-and-retry path. */
2692 if (r != -EBADMSG && /* corrupted */
2693 r != -ENODATA && /* truncated */
2694 r != -EHOSTDOWN && /* other machine */
2695 r != -EPROTONOSUPPORT && /* incompatible feature */
2696 r != -EBUSY && /* unclean shutdown */
2697 r != -ESHUTDOWN /* already archived */)
/* Further preconditions for the retry: writable, O_CREAT, ".journal" name. */
2700 if ((flags & O_ACCMODE) == O_RDONLY)
2703 if (!(flags & O_CREAT))
2706 if (!endswith(fname, ".journal"))
2709 /* The file is corrupted. Rotate it away and try it again (but only once) */
/* The replacement name embeds the current realtime clock in hex and ends in
 * ".journal~". */
2712 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2714 (unsigned long long) now(CLOCK_REALTIME),
2718 r = rename(fname, p);
2722 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2724 return journal_file_open(fname, flags, mode, compress, seal,
2725 metrics, mmap_cache, template, ret);
/* Copies the entry o (located at offset p in "from") into the file "to":
 * every data object referenced by the entry is read from the source
 * (decompressing it if needed), appended to the destination, and the
 * resulting offsets and hashes are collected into a new item list that is
 * finally appended as an entry with the source entry's timestamps.
 * NOTE(review): error checks and some declarations are not visible in this
 * listing. */
2728 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
/* xor_hash accumulates the XOR of the hashes of all copied data objects. */
2730 uint64_t q, xor_hash = 0;
2743 ts.monotonic = le64toh(o->entry.monotonic);
2744 ts.realtime = le64toh(o->entry.realtime);
2746 n = journal_file_entry_n_items(o);
2747 /* alloca() can't take 0, hence let's allocate at least one */
/* NOTE(review): n is derived from the entry object in the file and sizes a
 * stack allocation; whether it is bounded elsewhere is not visible here. */
2748 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2750 for (i = 0; i < n; i++) {
2757 q = le64toh(o->entry.items[i].object_offset);
2758 le_hash = o->entry.items[i].hash;
/* From here on o points at the data object, no longer at the entry. */
2760 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
/* The hash recorded in the entry item must match the data object's hash
 * (both are compared in their stored little-endian form). */
2764 if (le_hash != o->data.hash)
/* Payload length = object size minus the offset of the payload field. */
2767 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2770 /* We hit the limit on 32bit machines */
2771 if ((uint64_t) t != l)
2774 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
/* Decompress into the source file's reusable compress_buffer. */
2777 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2778 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2782 data = from->compress_buffer;
2785 data = o->data.payload;
2787 r = journal_file_append_data(to, data, l, &u, &h);
2791 xor_hash ^= le64toh(u->data.hash);
2792 items[i].object_offset = htole64(h);
2793 items[i].hash = u->data.hash;
/* o was overwritten with the data object above, so the source entry at p is
 * mapped again. */
2795 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2800 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
/* Fills in and normalizes the size limits in *m for the file system that fd
 * lives on. A field equal to (uint64_t) -1 is treated as "unset" and gets a
 * default derived from the file system size (or a fixed fallback); the
 * values are then page-aligned and clamped so that they are consistent with
 * each other and with JOURNAL_FILE_SIZE_MIN. The result is logged at debug
 * level.
 * NOTE(review): several else branches and braces are not visible in this
 * listing. */
2803 void journal_default_metrics(JournalMetrics *m, int fd) {
/* Stays 0 if the file system size cannot be determined. */
2804 uint64_t fs_size = 0;
2806 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
/* NOTE(review): the product is computed in the types of f_frsize and
 * f_blocks before being widened to uint64_t; on platforms where those are
 * 32 bit wide it could wrap - confirm. */
2811 if (fstatvfs(fd, &ss) >= 0)
2812 fs_size = ss.f_frsize * ss.f_blocks;
/* max_use: default derived from the file system size, kept within
 * [DEFAULT_MAX_USE_LOWER, DEFAULT_MAX_USE_UPPER]. */
2814 if (m->max_use == (uint64_t) -1) {
2817 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2819 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2820 m->max_use = DEFAULT_MAX_USE_UPPER;
2822 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2823 m->max_use = DEFAULT_MAX_USE_LOWER;
2825 m->max_use = DEFAULT_MAX_USE_LOWER;
2827 m->max_use = PAGE_ALIGN(m->max_use);
/* max_use is raised to at least two minimal-size files. */
2829 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2830 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
/* max_size: default is an eighth of max_use, capped at
 * DEFAULT_MAX_SIZE_UPPER. */
2833 if (m->max_size == (uint64_t) -1) {
2834 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2836 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2837 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2839 m->max_size = PAGE_ALIGN(m->max_size);
2841 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2842 m->max_size = JOURNAL_FILE_SIZE_MIN;
/* max_use is raised if needed so that it covers two files of max_size. */
2844 if (m->max_size*2 > m->max_use)
2845 m->max_use = m->max_size*2;
/* min_size: defaults to, and is never smaller than, JOURNAL_FILE_SIZE_MIN. */
2847 if (m->min_size == (uint64_t) -1)
2848 m->min_size = JOURNAL_FILE_SIZE_MIN;
2850 m->min_size = PAGE_ALIGN(m->min_size);
2852 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2853 m->min_size = JOURNAL_FILE_SIZE_MIN;
/* A min_size above max_size wins: max_size is raised to match. */
2855 if (m->min_size > m->max_size)
2856 m->max_size = m->min_size;
/* keep_free: default derived from the file system size, capped at
 * DEFAULT_KEEP_FREE_UPPER, with DEFAULT_KEEP_FREE as the fixed fallback. */
2859 if (m->keep_free == (uint64_t) -1) {
2862 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2864 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2865 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2868 m->keep_free = DEFAULT_KEEP_FREE;
2871 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2872 format_bytes(a, sizeof(a), m->max_use),
2873 format_bytes(b, sizeof(b), m->max_size),
2874 format_bytes(c, sizeof(c), m->min_size),
2875 format_bytes(d, sizeof(d), m->keep_free));
/* Reports the realtime range covered by the file: *from is taken from the
 * header's head_entry_realtime and *to from tail_entry_realtime.
 * NOTE(review): the guards around the assignments and the return values are
 * not visible in this listing. */
2878 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
/* A zero timestamp is special-cased; zero is the same in either byte order,
 * so no conversion is needed for this test. */
2883 if (f->header->head_entry_realtime == 0)
2886 *from = le64toh(f->header->head_entry_realtime);
2890 if (f->header->tail_entry_realtime == 0)
2893 *to = le64toh(f->header->tail_entry_realtime);
/* Reports the monotonic timestamp range that the file covers for one boot:
 * *from is the monotonic timestamp of the first entry linked from the
 * _BOOT_ID= data object, *to that of the last one.
 * NOTE(review): error checks, the guards around the assignments and the
 * return values are not visible in this listing. */
2899 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
/* p receives the offset of the data object so it can be mapped again below. */
2907 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
/* A data object without entries is special-cased. */
2911 if (le64toh(o->data.n_entries) <= 0)
/* The first entry is referenced directly by entry_offset; note that o is
 * reused and now points at that entry. */
2915 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2919 *from = le64toh(o->entry.monotonic);
/* o was repointed at an entry above, so the data object is mapped again. */
2923 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* Index n_entries-1 selects the last entry linked from the data object. */
2927 r = generic_array_get_plus_one(f,
2928 le64toh(o->data.entry_offset),
2929 le64toh(o->data.entry_array_offset),
2930 le64toh(o->data.n_entries)-1,
2935 *to = le64toh(o->entry.monotonic);
2941 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2944 /* If we gained new header fields we gained new features,
2945 * hence suggest a rotation */
2946 if (le64toh(f->header->header_size) < sizeof(Header)) {
2947 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2951 /* Let's check if the hash tables grew over a certain fill
2952 * level (75%, borrowing this value from Java's hash table
2953 * implementation), and if so suggest a rotation. To calculate
2954 * the fill level we need the n_data field, which only exists
2955 * in newer versions. */
2957 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2958 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2959 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2961 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2962 le64toh(f->header->n_data),
2963 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2964 (unsigned long long) f->last_stat.st_size,
2965 f->last_stat.st_size / le64toh(f->header->n_data));
2969 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2970 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2971 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2973 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2974 le64toh(f->header->n_fields),
2975 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2979 /* Are the data objects properly indexed by field objects? */
2980 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2981 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2982 le64toh(f->header->n_data) > 0 &&
2983 le64toh(f->header->n_fields) == 0)
2986 if (max_file_usec > 0) {
2989 h = le64toh(f->header->head_entry_realtime);
2990 t = now(CLOCK_REALTIME);
2992 if (h > 0 && t > h + max_file_usec)