1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
29 #include <sys/xattr.h>
31 #include "journal-def.h"
32 #include "journal-file.h"
33 #include "journal-authenticate.h"
38 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
39 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
41 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
43 /* This is the minimum journal file size */
44 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
46 /* These are the lower and upper bounds if we deduce the max_use value
47 * from the file system size */
48 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
49 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
51 /* This is the upper bound if we deduce max_size from max_use */
52 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
54 /* This is the upper bound if we deduce the keep_free value from the file system size */
56 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
58 /* This is the keep_free value when we can't determine the file system size */
60 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MiB */
62 /* n_data was the first entry we added after the initial file format design */
63 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
65 /* How many entries to keep in the entry array chain cache at max */
66 #define CHAIN_CACHE_MAX 20
68 /* How much to increase the journal file size at once each time we allocate something new. */
69 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8 MiB */
/* Switches the header state of the journal file to STATE_ONLINE.
 *
 * The file must have a valid fd and a header; the current header state is
 * dispatched on with a switch before STATE_ONLINE is stored. (The individual
 * switch cases and the return values are not visible in this excerpt.) */
71 static int journal_file_set_online(JournalFile *f) {
77 if (!(f->fd >= 0 && f->header))
80 switch(f->header->state) {
85 f->header->state = STATE_ONLINE;
/* Stores STATE_OFFLINE in the header of the journal file.
 *
 * The file must have a valid fd and a header. The current state is compared
 * against STATE_ONLINE before the write; what happens for any other state is
 * not visible in this excerpt (presumably nothing needs to be done — confirm). */
94 int journal_file_set_offline(JournalFile *f) {
100 if (!(f->fd >= 0 && f->header))
103 if (f->header->state != STATE_ONLINE)
108 f->header->state = STATE_OFFLINE;
/* Shuts down a JournalFile and releases what it holds.
 *
 * In order: appends a final tag if the file is sealed and writable, drops the
 * fd from the mmap cache, marks the header offline, unmaps the header, drops
 * the mmap cache reference, frees the chain cache, the compression buffer (if
 * compression support is compiled in), the FSS file mapping or FSPRG state,
 * and closes the HMAC context.
 *
 * The results of journal_file_append_tag() and journal_file_set_offline() are
 * not checked here; the function returns void. */
115 void journal_file_close(JournalFile *f) {
119 /* Write the final tag */
120 if (f->seal && f->writable)
121 journal_file_append_tag(f);
124 /* Sync everything to disk, before we mark the file offline */
125 if (f->mmap && f->fd >= 0)
126 mmap_cache_close_fd(f->mmap, f->fd);
128 journal_file_set_offline(f);
/* The header mapping is one page-aligned Header in size */
131 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
137 mmap_cache_unref(f->mmap);
/* Frees the map together with the cached items stored in it */
139 hashmap_free_free(f->chain_cache);
141 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
142 free(f->compress_buffer);
147 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
148 else if (f->fsprg_state)
149 free(f->fsprg_state);
154 gcry_md_close(f->hmac);
/* Builds a fresh file header in a local Header 'h' and writes it to the start
 * of the file.
 *
 * Fills in the signature, the 64-bit-aligned header size, the incompatible
 * flags derived from f->compress_xz / f->compress_lz4, the compatible flags
 * derived from f->seal, and a randomized file ID. The sequence number ID and
 * tail sequence number are either taken from 'template' or the sequence
 * number ID is set to the new file ID (the condition selecting between the
 * two is not visible in this excerpt — presumably whether a template was
 * passed). All multi-byte header fields are stored little-endian. */
160 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
167 memcpy(h.signature, HEADER_SIGNATURE, 8);
168 h.header_size = htole64(ALIGN64(sizeof(h)));
/* The boolean members act as 0/1 multipliers selecting each flag bit */
170 h.incompatible_flags |= htole32(
171 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
172 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
174 h.compatible_flags = htole32(
175 f->seal * HEADER_COMPATIBLE_SEALED);
177 r = sd_id128_randomize(&h.file_id);
182 h.seqnum_id = template->header->seqnum_id;
183 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
185 h.seqnum_id = h.file_id;
/* Write the whole header at file offset 0 */
187 k = pwrite(f->fd, &h, sizeof(h), 0);
/* Updates the mapped header for the current machine and boot and marks the
 * file online.
 *
 * Stores the machine ID directly into the header, fetches the current boot
 * ID, and, if it equals the boot ID already in the header, flags the tail
 * entry's monotonic timestamp as valid. The header boot ID is then
 * overwritten with the current one.
 *
 * NOTE(review): the result of journal_file_set_online() is not checked here,
 * whereas journal_file_append_object() does store it in 'r' — confirm that
 * ignoring it is intended. */
197 static int journal_file_refresh_header(JournalFile *f) {
203 r = sd_id128_get_machine(&f->header->machine_id);
207 r = sd_id128_get_boot(&boot_id);
211 if (sd_id128_equal(boot_id, f->header->boot_id))
212 f->tail_entry_monotonic_valid = true;
214 f->header->boot_id = boot_id;
216 journal_file_set_online(f);
218 /* Sync the online state to disk */
/* Validates the mapped header of a journal file and imports its feature
 * flags into the JournalFile.
 *
 * Checks, in order: the signature, unknown or unsupported incompatible flags
 * (always), unknown or unsupported compatible flags (only when opened
 * writable), the state value, the minimum header size, that sealed files
 * contain the n_entry_arrays field, that header plus arena fit into the file
 * size from the last fstat(), that the tail object lies inside the arena, and
 * that the recorded offsets are 64-bit aligned and not located inside the
 * header. A machine ID comparison and a state check follow (the condition
 * guarding them is not visible in this excerpt).
 *
 * Returns -EPROTONOSUPPORT for unsupported flags; the error codes of the
 * other checks are not visible in this excerpt. On success f->compress_xz,
 * f->compress_lz4 and f->seal are set from the header. */
224 static int journal_file_verify_header(JournalFile *f) {
229 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
232 /* In both read and write mode we refuse to open files with
233 * incompatible flags we don't know */
234 flags = le32toh(f->header->incompatible_flags);
235 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
236 if (flags & ~HEADER_INCOMPATIBLE_ANY)
237 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
238 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
/* Reduce to the flags that are known but not supported by this build */
239 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
241 log_debug("Journal file %s uses incompatible flags %"PRIx32
242 " disabled at compilation time.", f->path, flags);
243 return -EPROTONOSUPPORT;
246 /* When open for writing we refuse to open files with
247 * unknown or unsupported compatible flags, too */
248 flags = le32toh(f->header->compatible_flags);
249 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
250 if (flags & ~HEADER_COMPATIBLE_ANY)
251 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
252 f->path, flags & ~HEADER_COMPATIBLE_ANY);
/* Reduce to the flags that are known but not supported by this build */
253 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
255 log_debug("Journal file %s uses compatible flags %"PRIx32
256 " disabled at compilation time.", f->path, flags);
257 return -EPROTONOSUPPORT;
260 if (f->header->state >= _STATE_MAX)
263 /* The first addition was n_data, so check that we are at least this large */
264 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
267 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
/* NOTE(review): header_size and arena_size both come from the file and are
 * added as unsigned 64-bit values; a sum that wraps around would pass this
 * and the following check. No overflow guard is visible in this excerpt. */
270 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
273 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
/* All recorded offsets must be 64-bit aligned ... */
276 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
277 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
278 !VALID64(le64toh(f->header->tail_object_offset)) ||
279 !VALID64(le64toh(f->header->entry_array_offset)))
/* ... and must not point into the header itself */
282 if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
283 le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
284 le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
285 le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
290 sd_id128_t machine_id;
293 r = sd_id128_get_machine(&machine_id);
297 if (!sd_id128_equal(machine_id, f->header->machine_id))
300 state = f->header->state;
302 if (state == STATE_ONLINE) {
303 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
305 } else if (state == STATE_ARCHIVED)
307 else if (state != STATE_OFFLINE) {
308 log_debug("Journal file %s has unknown state %u.", f->path, state);
/* Import the feature flags recorded in the header */
313 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
314 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
316 f->seal = JOURNAL_HEADER_SEALED(f->header);
321 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
322 uint64_t old_size, new_size;
327 /* We assume that this file is not sparse, and we know that
328 * for sure, since we always call posix_fallocate() ourselves */
332 le64toh(f->header->header_size) +
333 le64toh(f->header->arena_size);
335 new_size = PAGE_ALIGN(offset + size);
336 if (new_size < le64toh(f->header->header_size))
337 new_size = le64toh(f->header->header_size);
339 if (new_size <= old_size)
342 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
345 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
348 if (fstatvfs(f->fd, &svfs) >= 0) {
351 available = svfs.f_bfree * svfs.f_bsize;
353 if (available >= f->metrics.keep_free)
354 available -= f->metrics.keep_free;
358 if (new_size - old_size > available)
363 /* Increase by larger blocks at once */
364 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
365 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
366 new_size = f->metrics.max_size;
368 /* Note that the glibc fallocate() fallback is very
369 inefficient, hence we try to minimize the allocation area */
371 r = posix_fallocate(f->fd, old_size, new_size - old_size);
375 if (fstat(f->fd, &f->last_stat) < 0)
378 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
383 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
390 /* Avoid SIGBUS on invalid accesses */
/* Maps the byte range [offset, offset+size) of the file through the mmap
 * cache and returns the address in *ret. The range is first checked against
 * the cached file size; if it appears to be out of range the fstat() data is
 * refreshed once and the check repeated, failing with -EADDRNOTAVAIL if the
 * range still does not fit.
 *
 * NOTE(review): 'offset + size' is an unsigned 64-bit addition and wraps on
 * overflow, so a very large size read from a corrupted file could slip past
 * both range checks. No overflow guard is visible in this excerpt. */
391 if (offset + size > (uint64_t) f->last_stat.st_size) {
392 /* Hmm, out of range? Let's refresh the fstat() data
393 * first, before we trust that check. */
395 if (fstat(f->fd, &f->last_stat) < 0 ||
396 offset + size > (uint64_t) f->last_stat.st_size)
397 return -EADDRNOTAVAIL;
400 return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
/* Returns the smallest valid size of object 'o', based on its type.
 *
 * The table is indexed by object type. Types beyond the end of the table, and
 * slots without an explicit initializer (which are zero), fall back to the
 * size of the generic ObjectHeader. */
403 static uint64_t minimum_header_size(Object *o) {
405 static const uint64_t table[] = {
406 [OBJECT_DATA] = sizeof(DataObject),
407 [OBJECT_FIELD] = sizeof(FieldObject),
408 [OBJECT_ENTRY] = sizeof(EntryObject),
409 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
410 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
411 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
412 [OBJECT_TAG] = sizeof(TagObject),
/* The table is unsigned, so "<= 0" means "no entry for this type" */
415 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
416 return sizeof(ObjectHeader);
418 return table[o->object.type];
/* Maps the object located at 'offset' and validates its header.
 *
 * First only the ObjectHeader is mapped. The recorded size must be at least
 * the ObjectHeader size and at least the minimum for the object's type, the
 * type must be greater than OBJECT_UNUSED, and, if 'type' is positive, the
 * object's type must equal it. If the object is larger than its header, the
 * full object is mapped in a second step. (Error codes and the assignment to
 * *ret are not visible in this excerpt.)
 *
 * NOTE(review): the first mapping passes type_to_context(type) as the mmap
 * context while the second passes o->object.type unconverted — confirm the
 * two are meant to differ. */
421 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
430 /* Objects may only be located at multiple of 64 bit */
431 if (!VALID64(offset))
435 r = journal_file_move_to(f, type_to_context(type), false, offset, sizeof(ObjectHeader), &t);
440 s = le64toh(o->object.size);
442 if (s < sizeof(ObjectHeader))
445 if (o->object.type <= OBJECT_UNUSED)
448 if (s < minimum_header_size(o))
451 if (type > 0 && o->object.type != type)
/* Re-map with the full object size now that we know it */
454 if (s > sizeof(ObjectHeader)) {
455 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
/* Computes the sequence number for the next entry and records it in the
 * header.
 *
 * The candidate is the header's tail sequence number plus one; an optional
 * external counter in *seqnum is taken into account as described below. The
 * result becomes the new tail sequence number and, if the header has no head
 * sequence number yet (zero), the head sequence number too. */
466 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
471 r = le64toh(f->header->tail_entry_seqnum) + 1;
474 /* If an external seqnum counter was passed, we update
475 * both the local and the external one, and set it to
476 * the maximum of both */
484 f->header->tail_entry_seqnum = htole64(r);
486 if (f->header->head_entry_seqnum == 0)
487 f->header->head_entry_seqnum = htole64(r);
/* Appends a new, empty object of the given type and size to the end of the
 * journal file.
 *
 * 'type' must be a valid object type and 'size' must cover at least an
 * ObjectHeader (both enforced by assert). The file is switched online, the
 * position after the current tail object is computed (64-bit aligned), or the
 * end of the header is used instead (the condition choosing between the two
 * is not visible here — presumably whether a tail object exists yet). Space
 * is allocated, the new object is mapped, its type and size are written, and
 * the header's tail object offset and object count are updated. */
492 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
499 assert(type > 0 && type < _OBJECT_TYPE_MAX);
500 assert(size >= sizeof(ObjectHeader));
504 r = journal_file_set_online(f);
508 p = le64toh(f->header->tail_object_offset);
510 p = le64toh(f->header->header_size);
/* -1: accept a tail object of any type */
512 r = journal_file_move_to_object(f, -1, p, &tail);
516 p += ALIGN64(le64toh(tail->object.size));
519 r = journal_file_allocate(f, p, size);
523 r = journal_file_move_to(f, type, false, p, size, &t);
530 o->object.type = type;
531 o->object.size = htole64(size);
533 f->header->tail_object_offset = htole64(p);
534 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
/* Creates the data hash table object in a new journal file.
 *
 * The table size in bytes is derived from the configured maximum file size,
 * with DEFAULT_DATA_HASH_TABLE_SIZE as the lower bound. The object is
 * appended, its items are zeroed, and the header records the offset of the
 * items array (not of the object itself) together with the table size in
 * bytes. */
542 static int journal_file_setup_data_hash_table(JournalFile *f) {
549 /* We estimate that we need 1 hash table entry per 768 bytes of
550 journal file and we want to make sure we never get beyond
551 75% fill level. Calculate the hash table size for the
552 maximum file size based on these metrics. */
554 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
555 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
556 s = DEFAULT_DATA_HASH_TABLE_SIZE;
558 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
560 r = journal_file_append_object(f,
561 OBJECT_DATA_HASH_TABLE,
562 offsetof(Object, hash_table.items) + s,
567 memzero(o->hash_table.items, s);
569 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
570 f->header->data_hash_table_size = htole64(s);
/* Creates the field hash table object in a new journal file.
 *
 * The table always has DEFAULT_FIELD_HASH_TABLE_SIZE bytes. The object is
 * appended, its items are zeroed, and the header records the offset of the
 * items array (not of the object itself) together with the table size in
 * bytes. */
575 static int journal_file_setup_field_hash_table(JournalFile *f) {
582 /* We use a fixed size hash table for the fields as this
583 * number should grow very slowly only */
585 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
586 r = journal_file_append_object(f,
587 OBJECT_FIELD_HASH_TABLE,
588 offsetof(Object, hash_table.items) + s,
593 memzero(o->hash_table.items, s);
595 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
596 f->header->field_hash_table_size = htole64(s);
/* Maps the data hash table, using the offset and size recorded in the
 * header, and stores the resulting pointer in f->data_hash_table. */
601 static int journal_file_map_data_hash_table(JournalFile *f) {
608 p = le64toh(f->header->data_hash_table_offset);
609 s = le64toh(f->header->data_hash_table_size);
611 r = journal_file_move_to(f,
612 OBJECT_DATA_HASH_TABLE,
619 f->data_hash_table = t;
/* Maps the field hash table, using the offset and size recorded in the
 * header, and stores the resulting pointer in f->field_hash_table. */
623 static int journal_file_map_field_hash_table(JournalFile *f) {
630 p = le64toh(f->header->field_hash_table_offset);
631 s = le64toh(f->header->field_hash_table_size);
633 r = journal_file_move_to(f,
634 OBJECT_FIELD_HASH_TABLE,
641 f->field_hash_table = t;
/* Links a freshly appended field object into the field hash table.
 *
 * The object must be of type OBJECT_FIELD. Its chain pointers are reset, the
 * bucket is selected as hash modulo the number of HashItems in the table, and
 * the object becomes either the head of the bucket or the successor of the
 * bucket's current tail (the condition choosing between the two is not
 * visible in this excerpt). The bucket's tail pointer is then set to the new
 * object, and n_fields is incremented if the header has that field.
 *
 * NOTE(review): the modulo divisor is field_hash_table_size / sizeof(HashItem);
 * no zero check is visible in this excerpt — confirm the table size is
 * validated before this is reached. */
645 static int journal_file_link_field(
658 if (o->object.type != OBJECT_FIELD)
661 /* This might alter the window we are looking at */
663 o->field.next_hash_offset = o->field.head_data_offset = 0;
665 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
666 p = le64toh(f->field_hash_table[h].tail_hash_offset);
668 f->field_hash_table[h].head_hash_offset = htole64(offset);
/* Re-map the previous tail object; 'o' no longer refers to the new object */
670 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
674 o->field.next_hash_offset = htole64(offset);
677 f->field_hash_table[h].tail_hash_offset = htole64(offset);
679 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
680 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
685 static int journal_file_link_data(
698 if (o->object.type != OBJECT_DATA)
701 /* This might alter the window we are looking at */
703 o->data.next_hash_offset = o->data.next_field_offset = 0;
704 o->data.entry_offset = o->data.entry_array_offset = 0;
705 o->data.n_entries = 0;
707 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
708 p = le64toh(f->data_hash_table[h].tail_hash_offset);
710 /* Only entry in the hash table is easy */
711 f->data_hash_table[h].head_hash_offset = htole64(offset);
713 /* Move back to the previous data object, to patch in the pointer to the new object */
716 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
720 o->data.next_hash_offset = htole64(offset);
723 f->data_hash_table[h].tail_hash_offset = htole64(offset);
725 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
726 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
731 int journal_file_find_field_object_with_hash(
733 const void *field, uint64_t size, uint64_t hash,
734 Object **ret, uint64_t *offset) {
736 uint64_t p, osize, h;
740 assert(field && size > 0);
742 osize = offsetof(Object, field.payload) + size;
744 if (f->header->field_hash_table_size == 0)
747 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
748 p = le64toh(f->field_hash_table[h].head_hash_offset);
753 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
757 if (le64toh(o->field.hash) == hash &&
758 le64toh(o->object.size) == osize &&
759 memcmp(o->field.payload, field, size) == 0) {
769 p = le64toh(o->field.next_hash_offset);
775 int journal_file_find_field_object(
777 const void *field, uint64_t size,
778 Object **ret, uint64_t *offset) {
783 assert(field && size > 0);
785 hash = hash64(field, size);
787 return journal_file_find_field_object_with_hash(f,
792 int journal_file_find_data_object_with_hash(
794 const void *data, uint64_t size, uint64_t hash,
795 Object **ret, uint64_t *offset) {
797 uint64_t p, osize, h;
801 assert(data || size == 0);
803 osize = offsetof(Object, data.payload) + size;
805 if (f->header->data_hash_table_size == 0)
808 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
809 p = le64toh(f->data_hash_table[h].head_hash_offset);
814 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
818 if (le64toh(o->data.hash) != hash)
821 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
822 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
825 l = le64toh(o->object.size);
826 if (l <= offsetof(Object, data.payload))
829 l -= offsetof(Object, data.payload);
831 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
832 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
837 memcmp(f->compress_buffer, data, size) == 0) {
848 return -EPROTONOSUPPORT;
850 } else if (le64toh(o->object.size) == osize &&
851 memcmp(o->data.payload, data, size) == 0) {
863 p = le64toh(o->data.next_hash_offset);
869 int journal_file_find_data_object(
871 const void *data, uint64_t size,
872 Object **ret, uint64_t *offset) {
877 assert(data || size == 0);
879 hash = hash64(data, size);
881 return journal_file_find_data_object_with_hash(f,
886 static int journal_file_append_field(
888 const void *field, uint64_t size,
889 Object **ret, uint64_t *offset) {
897 assert(field && size > 0);
899 hash = hash64(field, size);
901 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
915 osize = offsetof(Object, field.payload) + size;
916 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
920 o->field.hash = htole64(hash);
921 memcpy(o->field.payload, field, size);
923 r = journal_file_link_field(f, o, p, hash);
927 /* The linking might have altered the window, so let's
928 * refresh our pointer */
929 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
934 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
/* Returns the data object holding 'data', appending a new one if needed.
 *
 * The payload is hashed and looked up first (what happens on a hit is not
 * visible in this excerpt — presumably the existing object is returned). For
 * a new object the hash is stored, the payload is either compressed or
 * copied verbatim, and the object is linked into the data hash table. If the
 * payload contains a '=' that is not its first byte, the part before it is
 * appended as a field object and the data object is put at the head of that
 * field's data list. Finally the object is fed into the HMAC. */
948 static int journal_file_append_data(
950 const void *data, uint64_t size,
951 Object **ret, uint64_t *offset) {
956 int r, compression = 0;
960 assert(data || size == 0);
962 hash = hash64(data, size);
964 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
978 osize = offsetof(Object, data.payload) + size;
979 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
983 o->data.hash = htole64(hash);
985 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
/* NOTE(review): this block is compiled for XZ or LZ4, but the condition only
 * tests f->compress_xz, so a file with only f->compress_lz4 set never takes
 * this path — confirm whether f->compress_lz4 should be tested as well. */
986 if (f->compress_xz &&
987 size >= COMPRESSION_SIZE_THRESHOLD) {
990 compression = compress_blob(data, size, o->data.payload, &rsize);
/* Shrink the recorded object size to the compressed payload */
993 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
994 o->object.flags |= compression;
996 log_debug("Compressed data object %"PRIu64" -> %"PRIu64" using %s",
997 size, rsize, object_compressed_to_string(compression));
/* Store the payload uncompressed if compression was not applied */
1002 if (!compression && size > 0)
1003 memcpy(o->data.payload, data, size);
1005 r = journal_file_link_data(f, o, p, hash);
1009 /* The linking might have altered the window, so let's
1010 * refresh our pointer */
1011 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1018 eq = memchr(data, '=', size);
1019 if (eq && eq > data) {
1023 /* Create field object ... */
1024 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1028 /* ... and link it in. */
1029 o->data.next_field_offset = fo->field.head_data_offset;
/* NOTE(review): 'p' is a host-order offset being stored into an on-disk
 * field; every other store in this file uses htole64() for that direction.
 * le64toh() performs the same byte operation, so the stored value is the
 * same, but htole64() would state the intent. */
1030 fo->field.head_data_offset = le64toh(p);
1034 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
/* Returns the number of EntryItems stored in entry object 'o', computed from
 * the object size minus the offset of the items array. The object type is
 * checked first (the result for other types is not visible in this excerpt). */
1048 uint64_t journal_file_entry_n_items(Object *o) {
1051 if (o->object.type != OBJECT_ENTRY)
1054 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
/* Returns the number of 64-bit offsets stored in entry array object 'o',
 * computed from the object size minus the offset of the items array. The
 * object type is checked first (the result for other types is not visible in
 * this excerpt). */
1057 uint64_t journal_file_entry_array_n_items(Object *o) {
1060 if (o->object.type != OBJECT_ENTRY_ARRAY)
1063 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
/* Returns the number of HashItems stored in hash table object 'o', computed
 * from the object size minus the offset of the items array. Both the data and
 * the field hash table types are accepted (the result for other types is not
 * visible in this excerpt). */
1066 uint64_t journal_file_hash_table_n_items(Object *o) {
1069 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1070 o->object.type != OBJECT_FIELD_HASH_TABLE)
1073 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
/* Stores entry offset 'p' at position *idx of the entry array chain that
 * starts at *first, growing the chain if necessary.
 *
 * *first and *idx are little-endian values, typically pointing into the
 * mapped file; both are read here and written back in little-endian form.
 * The chain is walked array by array; if the index falls into an existing
 * array the slot is written and *idx is incremented. Otherwise a new entry
 * array object is appended, fed into the HMAC, filled with 'p', and hooked
 * either into *first or into the next pointer of the last array visited
 * ('ap'); the header's n_entry_arrays is incremented if present. (Loop
 * structure and the sizing rule for new arrays are only partly visible in
 * this excerpt.) */
1076 static int link_entry_into_array(JournalFile *f,
1081 uint64_t n = 0, ap = 0, q, i, a, hidx;
1089 a = le64toh(*first);
1090 i = hidx = le64toh(*idx);
1093 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1097 n = journal_file_entry_array_n_items(o);
1099 o->entry_array.items[i] = htole64(p);
1100 *idx = htole64(hidx + 1);
1106 a = le64toh(o->entry_array.next_entry_array_offset);
1117 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1118 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1124 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1129 o->entry_array.items[i] = htole64(p);
1132 *first = htole64(q);
1134 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1138 o->entry_array.next_entry_array_offset = htole64(q);
1141 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1142 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1144 *idx = htole64(hidx + 1);
/* Variant of link_entry_into_array() for lists that keep one entry outside
 * the array chain, in *extra.
 *
 * The offset is either stored directly in *extra, or linked into the chain
 * at index *idx - 1 (the condition choosing between the two is not visible
 * in this excerpt — presumably whether *idx is still zero). A temporary
 * little-endian index 'i' is passed down so that *idx itself is only
 * incremented here. */
1149 static int link_entry_into_array_plus_one(JournalFile *f,
1164 *extra = htole64(p);
1168 i = htole64(le64toh(*idx) - 1);
1169 r = link_entry_into_array(f, first, &i, p);
1174 *idx = htole64(le64toh(*idx) + 1);
/* Links the entry at 'offset' into the entry list of the data object
 * referenced by item 'i' of entry object 'o'.
 *
 * The item's data object is mapped (reusing 'o' for it), and the entry is
 * added through link_entry_into_array_plus_one() using the data object's
 * entry_offset as the extra slot and entry_array_offset as the chain head. */
1178 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1185 p = le64toh(o->entry.items[i].object_offset);
1189 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1193 return link_entry_into_array_plus_one(f,
1194 &o->data.entry_offset,
1195 &o->data.entry_array_offset,
/* Links a freshly written entry object into the file's global entry array
 * and into the entry lists of all data objects it references.
 *
 * The object must be of type OBJECT_ENTRY. A full memory barrier is issued
 * before linking. After the global link the header's head/tail realtime and
 * tail monotonic timestamps are updated (copied as stored, i.e. without byte
 * order conversion) and the tail monotonic timestamp is flagged valid. Then
 * each item of the entry is linked via journal_file_link_entry_item(). */
1200 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1208 if (o->object.type != OBJECT_ENTRY)
1211 __sync_synchronize();
1213 /* Link up the entry itself */
1214 r = link_entry_into_array(f,
1215 &f->header->entry_array_offset,
1216 &f->header->n_entries,
1221 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1223 if (f->header->head_entry_realtime == 0)
1224 f->header->head_entry_realtime = o->entry.realtime;
1226 f->header->tail_entry_realtime = o->entry.realtime;
1227 f->header->tail_entry_monotonic = o->entry.monotonic;
1229 f->tail_entry_monotonic_valid = true;
1231 /* Link up the items */
1232 n = journal_file_entry_n_items(o);
1233 for (i = 0; i < n; i++) {
1234 r = journal_file_link_entry_item(f, o, offset, i);
/* Appends an entry object referencing the given, already written data items.
 *
 * The object is sized for n_items EntryItems, appended, and filled with the
 * next sequence number, the items, both timestamps from 'ts', the xor hash
 * and the header's boot ID. It is then fed into the HMAC and linked into the
 * entry arrays.
 *
 * NOTE(review): the assert allows items == NULL when n_items == 0, in which
 * case memcpy() is called with a NULL source and length 0 — confirm this is
 * acceptable for the targeted C library. */
1242 static int journal_file_append_entry_internal(
1244 const dual_timestamp *ts,
1246 const EntryItem items[], unsigned n_items,
1248 Object **ret, uint64_t *offset) {
1255 assert(items || n_items == 0);
1258 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1260 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1264 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1265 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1266 o->entry.realtime = htole64(ts->realtime);
1267 o->entry.monotonic = htole64(ts->monotonic);
1268 o->entry.xor_hash = htole64(xor_hash);
1269 o->entry.boot_id = f->header->boot_id;
1272 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1277 r = journal_file_link_entry(f, o, np);
/* Notifies inotify watchers that the journal file changed.
 *
 * Issues a full memory barrier and then truncates the file to the size
 * recorded in f->last_stat. A failing ftruncate() is only logged. */
1290 void journal_file_post_change(JournalFile *f) {
1293 /* inotify() does not receive IN_MODIFY events from file
1294 * accesses done via mmap(). After each access we hence
1295 * truncate the journal file to its current size, which
1296 * triggers IN_MODIFY. */
1298 __sync_synchronize();
1300 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1301 log_error("Failed to truncate file to its own size: %m");
/* Comparison callback for sorting EntryItems by the (host-order) value of
 * their object offset. The return values of the individual branches are not
 * visible in this excerpt. */
1304 static int entry_item_cmp(const void *_a, const void *_b) {
1305 const EntryItem *a = _a, *b = _b;
1307 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1309 if (le64toh(a->object_offset) > le64toh(b->object_offset))
/* Appends a complete journal entry made up of the given iovec fields.
 *
 * A timestamp is taken into a local if needed (the condition is not visible
 * here — presumably when 'ts' is NULL). The monotonic timestamp is compared
 * against the header's tail timestamp when that is known to be valid. A tag
 * may be appended first, then every iovec element is written as a data
 * object, collecting its offset and hash and xor-ing the hashes together.
 * The items are sorted by file offset, the entry object is appended, and
 * journal_file_post_change() is called to notify watchers.
 *
 * NOTE(review): the items array is allocated with alloca() and its size
 * scales with the caller-supplied n_iovec; no upper bound is visible in this
 * excerpt. */
1314 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1318 uint64_t xor_hash = 0;
1319 struct dual_timestamp _ts;
1322 assert(iovec || n_iovec == 0);
1325 dual_timestamp_get(&_ts);
1329 if (f->tail_entry_monotonic_valid &&
1330 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1334 r = journal_file_maybe_append_tag(f, ts->realtime);
1339 /* alloca() can't take 0, hence let's allocate at least one */
1340 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1342 for (i = 0; i < n_iovec; i++) {
1346 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
/* The hash is kept in on-disk byte order in the item, host order in the xor */
1350 xor_hash ^= le64toh(o->data.hash);
1351 items[i].object_offset = htole64(p);
1352 items[i].hash = o->data.hash;
1355 /* Order by the position on disk, in order to improve seek
1356 * times for rotating media. */
1357 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1359 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1361 journal_file_post_change(f);
/* Remembers how far into an entry array chain a previous lookup got, so that
 * later lookups on the same chain can resume there. Items are stored in a
 * hashmap keyed by the 'first' member. */
1366 typedef struct ChainCacheItem {
1367 uint64_t first; /* the array at the beginning of the chain */
1368 uint64_t array; /* the cached array */
1369 uint64_t begin; /* the first item in the cached array */
1370 uint64_t total; /* the total number of items in all arrays before this one in the chain */
1371 uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
/* Records a position within an entry array chain in the chain cache 'h'.
 *
 * When the cache already holds CHAIN_CACHE_MAX items, an existing item is
 * stolen from the hashmap and reused; otherwise a new ChainCacheItem is
 * allocated. The item is stored under its 'first' member as key. (The
 * conditions separating the insert and update paths are only partly visible
 * in this excerpt.) */
1374 static void chain_cache_put(
1381 uint64_t last_index) {
1384 /* If the chain item to cache for this chain is the
1385 * first one it's not worth caching anything */
1389 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1390 ci = hashmap_steal_first(h);
1392 ci = new(ChainCacheItem, 1);
1399 if (hashmap_put(h, &ci->first, ci) < 0) {
1404 assert(ci->first == first);
1409 ci->last_index = last_index;
/* Looks up the i-th entry of the entry array chain starting at 'first' and
 * maps the entry object it refers to.
 *
 * The chain cache is consulted first and used when the requested index lies
 * beyond the cached array's starting total. The chain is then walked array
 * by array until the index falls inside one; the position found is stored in
 * the chain cache and the referenced entry object is mapped. (Loop structure
 * and return values are only partly visible in this excerpt.) */
1412 static int generic_array_get(
1416 Object **ret, uint64_t *offset) {
1419 uint64_t p = 0, a, t = 0;
1427 /* Try the chain cache first */
1428 ci = hashmap_get(f->chain_cache, &first);
1429 if (ci && i > ci->total) {
1438 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1442 k = journal_file_entry_array_n_items(o);
1444 p = le64toh(o->entry_array.items[i]);
1450 a = le64toh(o->entry_array.next_entry_array_offset);
1456 /* Let's cache this item for the next invocation */
1457 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1459 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Variant of generic_array_get() for lists that keep one entry ('extra')
 * outside the array chain.
 *
 * Either the extra entry is mapped directly, or the lookup is forwarded to
 * the chain with the index shifted down by one (the condition choosing
 * between the two is not visible in this excerpt — presumably i == 0). */
1472 static int generic_array_get_plus_one(
1477 Object **ret, uint64_t *offset) {
1486 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1499 return generic_array_get(f, first, i-1, ret, offset);
1508 static int generic_array_bisect(
1513 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1514 direction_t direction,
1519 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1520 bool subtract_one = false;
1521 Object *o, *array = NULL;
1526 assert(test_object);
1528 /* Start with the first array in the chain */
1531 ci = hashmap_get(f->chain_cache, &first);
1532 if (ci && n > ci->total) {
1533 /* Ah, we have iterated this bisection array chain
1534 * previously! Let's see if we can skip ahead in the
1535 * chain, as far as the last time. But we can't jump
1536 * backwards in the chain, so let's check that first. */
1539 r = test_object(f, ci->begin, needle);
1543 if (r == TEST_LEFT) {
1544 /* OK, what we are looking for is right of the
1545 * begin of this EntryArray, so let's jump
1546 * straight to previously cached array in the chain. */
1552 last_index = ci->last_index;
1557 uint64_t left, right, k, lp;
1559 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1563 k = journal_file_entry_array_n_items(array);
1569 lp = p = le64toh(array->entry_array.items[i]);
1573 r = test_object(f, p, needle);
1577 if (r == TEST_FOUND)
1578 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1580 if (r == TEST_RIGHT) {
1584 if (last_index != (uint64_t) -1) {
1585 assert(last_index <= right);
1587 /* If we cached the last index we
1588 * looked at, let's try to not to jump
1589 * too wildly around and see if we can
1590 * limit the range to look at early to
1591 * the immediate neighbors of the last
1592 * index we looked at. */
1594 if (last_index > 0) {
1595 uint64_t x = last_index - 1;
1597 p = le64toh(array->entry_array.items[x]);
1601 r = test_object(f, p, needle);
1605 if (r == TEST_FOUND)
1606 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1608 if (r == TEST_RIGHT)
1614 if (last_index < right) {
1615 uint64_t y = last_index + 1;
1617 p = le64toh(array->entry_array.items[y]);
1621 r = test_object(f, p, needle);
1625 if (r == TEST_FOUND)
1626 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1628 if (r == TEST_RIGHT)
1636 if (left == right) {
1637 if (direction == DIRECTION_UP)
1638 subtract_one = true;
1644 assert(left < right);
1645 i = (left + right) / 2;
1647 p = le64toh(array->entry_array.items[i]);
1651 r = test_object(f, p, needle);
1655 if (r == TEST_FOUND)
1656 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1658 if (r == TEST_RIGHT)
1666 if (direction == DIRECTION_UP) {
1668 subtract_one = true;
1679 last_index = (uint64_t) -1;
1680 a = le64toh(array->entry_array.next_entry_array_offset);
1686 if (subtract_one && t == 0 && i == 0)
1689 /* Let's cache this item for the next invocation */
1690 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1692 if (subtract_one && i == 0)
1694 else if (subtract_one)
1695 p = le64toh(array->entry_array.items[i-1]);
1697 p = le64toh(array->entry_array.items[i]);
1699 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1710 *idx = t + i + (subtract_one ? -1 : 0);
1716 static int generic_array_bisect_plus_one(
1722 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1723 direction_t direction,
1729 bool step_back = false;
1733 assert(test_object);
1738 /* This bisects the array in object 'first', but first checks the extra entry */
1740 r = test_object(f, extra, needle);
1744 if (r == TEST_FOUND)
1745 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1747 /* if we are looking with DIRECTION_UP then we need to first
1748 see if in the actual array there is a matching entry, and
1749 return the last one of that. But if there isn't any we need
1750 to return this one. Hence remember this, and return it below. */
1753 step_back = direction == DIRECTION_UP;
1755 if (r == TEST_RIGHT) {
1756 if (direction == DIRECTION_DOWN)
1762 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1764 if (r == 0 && step_back)
1773 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1789 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1795 else if (p < needle)
1801 int journal_file_move_to_entry_by_offset(
1804 direction_t direction,
1808 return generic_array_bisect(f,
1809 le64toh(f->header->entry_array_offset),
1810 le64toh(f->header->n_entries),
1818 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1825 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1829 if (le64toh(o->entry.seqnum) == needle)
1831 else if (le64toh(o->entry.seqnum) < needle)
1837 int journal_file_move_to_entry_by_seqnum(
1840 direction_t direction,
1844 return generic_array_bisect(f,
1845 le64toh(f->header->entry_array_offset),
1846 le64toh(f->header->n_entries),
1853 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1860 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1864 if (le64toh(o->entry.realtime) == needle)
1866 else if (le64toh(o->entry.realtime) < needle)
1872 int journal_file_move_to_entry_by_realtime(
1875 direction_t direction,
1879 return generic_array_bisect(f,
1880 le64toh(f->header->entry_array_offset),
1881 le64toh(f->header->n_entries),
1883 test_object_realtime,
1888 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1895 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1899 if (le64toh(o->entry.monotonic) == needle)
1901 else if (le64toh(o->entry.monotonic) < needle)
/* Looks up the data object holding the field "_BOOT_ID=<boot id>" and
 * returns the result of journal_file_find_data_object(). */
1907 static inline int find_data_object_by_boot_id(
/* Buffer layout: the 9-character prefix "_BOOT_ID=", 32 characters for
 * the formatted ID, and one terminating NUL byte. */
1912 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
/* Write the formatted boot ID right behind the prefix (9 == strlen("_BOOT_ID=")). */
1914 sd_id128_to_string(boot_id, t + 9);
/* sizeof(t) - 1 passes the payload length without the trailing NUL. */
1915 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
/* Positions on an entry by monotonic timestamp. The search is restricted
 * to the entries linked from the "_BOOT_ID=" data object of boot_id. */
1918 int journal_file_move_to_entry_by_monotonic(
1922 direction_t direction,
/* Find the data object for this boot ID; its offset is not needed, hence NULL. */
1931 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
/* Bisect the entries referenced by that data object: the directly stored
 * entry_offset plus the chained entry array, n_entries in total, compared
 * via test_object_monotonic. */
1937 return generic_array_bisect_plus_one(f,
1938 le64toh(o->data.entry_offset),
1939 le64toh(o->data.entry_array_offset),
1940 le64toh(o->data.n_entries),
1942 test_object_monotonic,
/* Moves from the entry o (located at offset p) to a neighbouring entry of
 * the file-wide entry array, in the given direction. The entry found is
 * handed back through ret / offset.
 * NOTE(review): when no current entry is supplied the walk presumably
 * starts at the first (down) or last (up) array slot -- confirm. */
1947 int journal_file_next_entry(
1949 Object *o, uint64_t p,
1950 direction_t direction,
1951 Object **ret, uint64_t *offset) {
/* If an object is passed in, its offset has to be passed in as well. */
1957 assert(p > 0 || !o);
1959 n = le64toh(f->header->n_entries);
/* Starting index: slot 0 when going down, the last slot when going up. */
1964 i = direction == DIRECTION_DOWN ? 0 : n - 1;
/* The current position must refer to an entry object. */
1966 if (o->object.type != OBJECT_ENTRY)
/* Bisect the file-wide entry array (presumably to obtain the index of p). */
1969 r = generic_array_bisect(f,
1970 le64toh(f->header->entry_array_offset),
1971 le64toh(f->header->n_entries),
1980 if (direction == DIRECTION_DOWN) {
1993 /* And jump to it */
1994 r = generic_array_get(f,
1995 le64toh(f->header->entry_array_offset),
/* Sanity check on the offset that was found: going down it must be
 * larger than p, going up it must be smaller; otherwise the entry array
 * is reported as corrupted. */
2002 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2003 log_debug("%s: entry array corrupted at entry %"PRIu64,
/* Moves a signed number of entries (skip) away from the entry o at offset
 * p within the file-wide entry array, and returns the entry at the new
 * index via generic_array_get(). */
2014 int journal_file_skip_entry(
2016 Object *o, uint64_t p,
2018 Object **ret, uint64_t *offset) {
/* The current position must refer to an entry object. */
2027 if (o->object.type != OBJECT_ENTRY)
/* Bisect the file-wide entry array (presumably to obtain the index i of p). */
2030 r = generic_array_bisect(f,
2031 le64toh(f->header->entry_array_offset),
2032 le64toh(f->header->n_entries),
2041 /* Calculate new index */
/* Backward skip whose magnitude reaches or exceeds the current index.
 * NOTE(review): if skip is a signed 64-bit integer, -skip is undefined
 * for its minimum value -- confirm the declared type and caller range. */
2043 if ((uint64_t) -skip >= i)
/* Backward skip that stays inside the array. */
2046 i = i - (uint64_t) -skip;
/* Forward skip. */
2048 i += (uint64_t) skip;
2050 n = le64toh(f->header->n_entries);
/* Fetch the entry stored at the computed index. */
2057 return generic_array_get(f,
2058 le64toh(f->header->entry_array_offset),
/* Like journal_file_next_entry(), but walks only the entries that are
 * linked from the data object located at data_offset. */
2063 int journal_file_next_entry_for_data(
2065 Object *o, uint64_t p,
2066 uint64_t data_offset,
2067 direction_t direction,
2068 Object **ret, uint64_t *offset) {
/* If an object is passed in, its offset has to be passed in as well. */
2075 assert(p > 0 || !o);
/* Load the data object whose entry list is to be walked. */
2077 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2081 n = le64toh(d->data.n_entries);
/* Starting index: slot 0 when going down, the last slot when going up. */
2086 i = direction == DIRECTION_DOWN ? 0 : n - 1;
/* The current position must refer to an entry object. */
2088 if (o->object.type != OBJECT_ENTRY)
/* Bisect the data object's entries: the directly stored entry_offset plus
 * the chained entry array, n_entries in total. */
2091 r = generic_array_bisect_plus_one(f,
2092 le64toh(d->data.entry_offset),
2093 le64toh(d->data.entry_array_offset),
2094 le64toh(d->data.n_entries),
2104 if (direction == DIRECTION_DOWN) {
/* Fetch the entry at the resulting index from the same entry list. */
2118 return generic_array_get_plus_one(f,
2119 le64toh(d->data.entry_offset),
2120 le64toh(d->data.entry_array_offset),
/* Positions on an entry, considering only the entries linked from the
 * data object at data_offset. The result of
 * generic_array_bisect_plus_one() is returned unchanged. */
2125 int journal_file_move_to_entry_by_offset_for_data(
2127 uint64_t data_offset,
2129 direction_t direction,
2130 Object **ret, uint64_t *offset) {
/* Load the data object whose entry list is searched. */
2137 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
/* Bisect its entries: entry_offset plus the chained entry array. */
2141 return generic_array_bisect_plus_one(f,
2142 le64toh(d->data.entry_offset),
2143 le64toh(d->data.entry_array_offset),
2144 le64toh(d->data.n_entries),
/* Positions on an entry by monotonic timestamp while also requiring that
 * the entry is linked from the data object at data_offset. Two entry
 * lists are involved: the one of the "_BOOT_ID=" data object (remembered
 * by its offset b) and the one of the data object at data_offset. */
2151 int journal_file_move_to_entry_by_monotonic_for_data(
2153 uint64_t data_offset,
2156 direction_t direction,
2157 Object **ret, uint64_t *offset) {
2165 /* First, seek by time */
/* b receives the offset of the boot ID data object so that it can be
 * loaded again further down. */
2166 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2172 r = generic_array_bisect_plus_one(f,
2173 le64toh(o->data.entry_offset),
2174 le64toh(o->data.entry_array_offset),
2175 le64toh(o->data.n_entries),
2177 test_object_monotonic,
2183 /* And now, continue seeking until we find an entry that
2184 * exists in both bisection arrays */
/* Search the entry list of the data object at data_offset. */
2190 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2194 r = generic_array_bisect_plus_one(f,
2195 le64toh(d->data.entry_offset),
2196 le64toh(d->data.entry_array_offset),
2197 le64toh(d->data.n_entries),
/* Load the boot ID data object again via its saved offset b, then search
 * its entry list. */
2205 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2209 r = generic_array_bisect_plus_one(f,
2210 le64toh(o->data.entry_offset),
2211 le64toh(o->data.entry_array_offset),
2212 le64toh(o->data.n_entries),
/* Positions on an entry, considering only the entries linked from the
 * data object at data_offset. The result of
 * generic_array_bisect_plus_one() is returned unchanged.
 * NOTE(review): the comparator is presumably test_object_seqnum --
 * confirm in the full argument list. */
2234 int journal_file_move_to_entry_by_seqnum_for_data(
2236 uint64_t data_offset,
2238 direction_t direction,
2239 Object **ret, uint64_t *offset) {
/* Load the data object whose entry list is searched. */
2246 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
/* Bisect its entries: entry_offset plus the chained entry array. */
2250 return generic_array_bisect_plus_one(f,
2251 le64toh(d->data.entry_offset),
2252 le64toh(d->data.entry_array_offset),
2253 le64toh(d->data.n_entries),
/* Positions on an entry by realtime timestamp, considering only the
 * entries linked from the data object at data_offset. The result of
 * generic_array_bisect_plus_one() is returned unchanged. */
2260 int journal_file_move_to_entry_by_realtime_for_data(
2262 uint64_t data_offset,
2264 direction_t direction,
2265 Object **ret, uint64_t *offset) {
/* Load the data object whose entry list is searched. */
2272 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
/* Bisect its entries (entry_offset plus the chained entry array) with
 * test_object_realtime as the comparator. */
2276 return generic_array_bisect_plus_one(f,
2277 le64toh(d->data.entry_offset),
2278 le64toh(d->data.entry_array_offset),
2279 le64toh(d->data.n_entries),
2281 test_object_realtime,
/* Debugging aid: prints the file header via journal_file_print_header()
 * and then a line for each object, walking the file from the end of the
 * header up to the tail object. Output goes to stdout via printf(). */
2286 void journal_file_dump(JournalFile *f) {
2293 journal_file_print_header(f);
/* Objects start right behind the header. */
2295 p = le64toh(f->header->header_size);
/* NOTE(review): the type argument -1 presumably means "accept any object
 * type" -- confirm against journal_file_move_to_object(). */
2297 r = journal_file_move_to_object(f, -1, p, &o);
/* Print a type line; entry and tag objects also get some of their fields. */
2301 switch (o->object.type) {
2304 printf("Type: OBJECT_UNUSED\n");
2308 printf("Type: OBJECT_DATA\n");
2312 printf("Type: OBJECT_FIELD\n");
2316 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2317 le64toh(o->entry.seqnum),
2318 le64toh(o->entry.monotonic),
2319 le64toh(o->entry.realtime));
2322 case OBJECT_FIELD_HASH_TABLE:
2323 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2326 case OBJECT_DATA_HASH_TABLE:
2327 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2330 case OBJECT_ENTRY_ARRAY:
2331 printf("Type: OBJECT_ENTRY_ARRAY\n");
2335 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2336 le64toh(o->tag.seqnum),
2337 le64toh(o->tag.epoch));
2341 printf("Type: unknown (%u)\n", o->object.type);
/* Name the compression flag if one of the compression bits is set. */
2345 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2346 printf("Flags: %s\n",
2347 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
/* The object just printed is the tail object recorded in the header. */
2349 if (p == le64toh(f->header->tail_object_offset))
/* Advance by the object's size rounded up with ALIGN64(). */
2352 p = p + ALIGN64(le64toh(o->object.size));
2357 log_error("File corrupt");
/* Wrapper around format_timestamp() that formats t into buf (of size l).
 * NOTE(review): presumably substitutes a fallback string when
 * format_timestamp() yields no result, so the return value is always
 * printable -- confirm. */
2360 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2363 x = format_timestamp(buf, l, t);
/* Prints a human-readable summary of the journal file header to stdout:
 * IDs, state, flags, sizes, sequence numbers, timestamps and object
 * counters, followed by optional counters and the file's disk usage. */
2369 void journal_file_print_header(JournalFile *f) {
/* Scratch buffers for the four 128-bit IDs formatted below. */
2370 char a[33], b[33], c[33], d[33];
2371 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2373 char bytes[FORMAT_BYTES_MAX];
2377 printf("File Path: %s\n"
2381 "Sequential Number ID: %s\n"
2383 "Compatible Flags:%s%s\n"
2384 "Incompatible Flags:%s%s%s\n"
2385 "Header size: %"PRIu64"\n"
2386 "Arena size: %"PRIu64"\n"
2387 "Data Hash Table Size: %"PRIu64"\n"
2388 "Field Hash Table Size: %"PRIu64"\n"
2389 "Rotate Suggested: %s\n"
2390 "Head Sequential Number: %"PRIu64"\n"
2391 "Tail Sequential Number: %"PRIu64"\n"
2392 "Head Realtime Timestamp: %s\n"
2393 "Tail Realtime Timestamp: %s\n"
2394 "Tail Monotonic Timestamp: %s\n"
2395 "Objects: %"PRIu64"\n"
2396 "Entry Objects: %"PRIu64"\n",
2398 sd_id128_to_string(f->header->file_id, a),
2399 sd_id128_to_string(f->header->machine_id, b),
2400 sd_id128_to_string(f->header->boot_id, c),
2401 sd_id128_to_string(f->header->seqnum_id, d),
2402 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2403 f->header->state == STATE_ONLINE ? "ONLINE" :
2404 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2405 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
/* Flag bits outside the known set are shown as " ???". */
2406 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2407 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2408 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2409 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2410 le64toh(f->header->header_size),
2411 le64toh(f->header->arena_size),
/* Hash table sizes are stored in bytes; print them as item counts. */
2412 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2413 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2414 yes_no(journal_file_rotate_suggested(f, 0)),
2415 le64toh(f->header->head_entry_seqnum),
2416 le64toh(f->header->tail_entry_seqnum),
2417 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2418 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
/* The monotonic timestamp is printed as a time span, using z as buffer. */
2419 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2420 le64toh(f->header->n_objects),
2421 le64toh(f->header->n_entries));
/* The counters below are printed only if JOURNAL_HEADER_CONTAINS()
 * reports the respective field as present in this file's header. */
2423 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2424 printf("Data Objects: %"PRIu64"\n"
2425 "Data Hash Table Fill: %.1f%%\n",
2426 le64toh(f->header->n_data),
2427 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2429 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2430 printf("Field Objects: %"PRIu64"\n"
2431 "Field Hash Table Fill: %.1f%%\n",
2432 le64toh(f->header->n_fields),
2433 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2435 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2436 printf("Tag Objects: %"PRIu64"\n",
2437 le64toh(f->header->n_tags));
2438 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2439 printf("Entry Array Objects: %"PRIu64"\n",
2440 le64toh(f->header->n_entry_arrays));
/* Disk usage is derived from the allocated block count, taking one
 * st_blocks unit as 512 bytes; it is skipped if fstat() fails. */
2442 if (fstat(f->fd, &st) >= 0)
2443 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
/* Opens the journal file fname and hands the resulting JournalFile back
 * through ret. An empty file opened for writing is initialized as a new
 * journal file; an existing one has its header verified. metrics,
 * mmap_cache and template are optional sources of size limits, a shared
 * mmap cache and inherited settings. */
2446 int journal_file_open(
2452 JournalMetrics *metrics,
2453 MMapCache *mmap_cache,
2454 JournalFile *template,
2455 JournalFile **ret) {
2459 bool newly_created = false;
/* Check that the access mode is either O_RDONLY or O_RDWR. */
2464 if ((flags & O_ACCMODE) != O_RDONLY &&
2465 (flags & O_ACCMODE) != O_RDWR)
/* Check that the file name ends in ".journal" or ".journal~". */
2468 if (!endswith(fname, ".journal") &&
2469 !endswith(fname, ".journal~"))
2472 f = new0(JournalFile, 1);
2480 f->prot = prot_from_flags(flags);
2481 f->writable = (flags & O_ACCMODE) != O_RDONLY;
/* If LZ4 support is compiled in it is preferred over XZ. */
2482 #if defined(HAVE_LZ4)
2483 f->compress_lz4 = compress;
2484 #elif defined(HAVE_XZ)
2485 f->compress_xz = compress;
/* Take a reference on a shared mmap cache, or create a private one. */
2492 f->mmap = mmap_cache_ref(mmap_cache);
2494 f->mmap = mmap_cache_new();
2501 f->path = strdup(fname);
2507 f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2508 if (!f->chain_cache) {
/* O_CLOEXEC is always added to the caller's open flags. */
2513 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2519 if (fstat(f->fd, &f->last_stat) < 0) {
/* A zero-sized file opened for writing is set up as a new journal file. */
2524 if (f->last_stat.st_size == 0 && f->writable) {
2527 /* Let's attach the creation time to the journal file,
2528 * so that the vacuuming code knows the age of this
2529 * file even if the file might end up corrupted one
2530 * day... Ideally we'd just use the creation time many
2531 * file systems maintain for each file, but there is
2532 * currently no usable API to query this, hence let's
2533 * emulate this via extended attributes. If extended
2534 * attributes are not supported we'll just skip this,
2535 * and rely solely on mtime/atime/ctime of the file.*/
/* Stored little-endian; the fsetxattr() result is deliberately ignored,
 * as explained above. */
2537 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2538 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2541 /* Try to load the FSPRG state, and if we can't, then
2542 * just don't do sealing */
2544 r = journal_file_fss_load(f);
2550 r = journal_file_init_header(f, template);
/* Refresh the cached stat data after the header has been initialized. */
2554 if (fstat(f->fd, &f->last_stat) < 0) {
2559 newly_created = true;
/* Check whether the file is too small to hold even the minimal header. */
2562 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
/* Map the header with a shared mapping at file offset 0. */
2567 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2568 if (f->header == MAP_FAILED) {
/* Pre-existing files get their header verified. */
2574 if (!newly_created) {
2575 r = journal_file_verify_header(f);
/* For pre-existing writable files the FSS state is loaded at this point. */
2581 if (!newly_created && f->writable) {
2582 r = journal_file_fss_load(f);
/* Size limits: metrics are completed with defaults (this writes to
 * *metrics) and copied; otherwise they are inherited from the template. */
2590 journal_default_metrics(metrics, f->fd);
2591 f->metrics = *metrics;
2592 } else if (template)
2593 f->metrics = template->metrics;
2595 r = journal_file_refresh_header(f);
2601 r = journal_file_hmac_setup(f);
/* A new file gets its hash tables and first tag set up. */
2606 if (newly_created) {
2607 r = journal_file_setup_field_hash_table(f);
2611 r = journal_file_setup_data_hash_table(f);
2616 r = journal_file_append_first_tag(f);
2622 r = journal_file_map_field_hash_table(f);
2626 r = journal_file_map_data_hash_table(f);
/* NOTE(review): presumably the common failure path, releasing the
 * partially set up object -- confirm. */
2634 journal_file_close(f);
/* Rotates a writable journal file: the current file is renamed to an
 * archive name, marked STATE_ARCHIVED and closed, and a new file is
 * opened under the original path. */
2639 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
/* p holds the archive file name and is freed automatically on return. */
2640 _cleanup_free_ char *p = NULL;
2642 JournalFile *old_file, *new_file = NULL;
/* Checks on the old file: it must be writable and end in ".journal". */
2650 if (!old_file->writable)
2653 if (!endswith(old_file->path, ".journal"))
2656 l = strlen(old_file->path);
/* Archive name: the old path without its 8-character ".journal" suffix,
 * followed by "@<seqnum id>-<head seqnum>-<head realtime>.journal".
 * NOTE(review): (*f) and old_file presumably refer to the same file
 * here -- confirm. */
2657 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2658 (int) l - 8, old_file->path,
2659 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2660 le64toh((*f)->header->head_entry_seqnum),
2661 le64toh((*f)->header->head_entry_realtime));
2665 r = rename(old_file->path, p);
2669 old_file->header->state = STATE_ARCHIVED;
/* Open a new file under the old path, reusing the old file's flags, mode
 * and mmap cache, with the old file serving as template. The old file is
 * closed afterwards regardless of the outcome stored in r. */
2671 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2672 journal_file_close(old_file);
/* Opens a journal file like journal_file_open(). If that fails with one
 * of the listed errors, the file is renamed out of the way (suffix
 * ".journal~") and the open is attempted a second time. */
2678 int journal_file_open_reliably(
2684 JournalMetrics *metrics,
2685 MMapCache *mmap_cache,
2686 JournalFile *template,
2687 JournalFile **ret) {
/* p holds the name the file is renamed to; freed automatically on return. */
2691 _cleanup_free_ char *p = NULL;
2693 r = journal_file_open(fname, flags, mode, compress, seal,
2694 metrics, mmap_cache, template, ret);
/* Any result other than the errors listed here is not handled by the
 * rename-and-retry logic below. */
2695 if (r != -EBADMSG && /* corrupted */
2696 r != -ENODATA && /* truncated */
2697 r != -EHOSTDOWN && /* other machine */
2698 r != -EPROTONOSUPPORT && /* incompatible feature */
2699 r != -EBUSY && /* unclean shutdown */
2700 r != -ESHUTDOWN /* already archived */)
/* Further conditions checked before replacing the file: read-only access,
 * absence of O_CREAT, and a name not ending in ".journal". */
2703 if ((flags & O_ACCMODE) == O_RDONLY)
2706 if (!(flags & O_CREAT))
2709 if (!endswith(fname, ".journal"))
2712 /* The file is corrupted. Rotate it away and try it again (but only once) */
/* New name: "<prefix>@<current realtime, hex>-<64-bit hex value>.journal~". */
2715 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2717 (unsigned long long) now(CLOCK_REALTIME),
2721 r = rename(fname, p);
2725 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
/* Second attempt, with the same arguments as the first. */
2727 return journal_file_open(fname, flags, mode, compress, seal,
2728 metrics, mmap_cache, template, ret);
/* Copies the entry object o, located at offset p in the file "from", into
 * the file "to": each data object the entry references is appended to
 * "to", and a new entry referencing the copies is appended afterwards.
 * seqnum, ret and offset are passed on to
 * journal_file_append_entry_internal(). */
2731 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
/* xor_hash accumulates the XOR of the hashes of all copied data objects. */
2733 uint64_t q, xor_hash = 0;
/* Carry the original timestamps over to the copy. */
2746 ts.monotonic = le64toh(o->entry.monotonic);
2747 ts.realtime = le64toh(o->entry.realtime);
2749 n = journal_file_entry_n_items(o);
2750 /* alloca() can't take 0, hence let's allocate at least one */
/* NOTE(review): the item count comes from the source entry and sizes a
 * stack allocation -- confirm that n is bounded for untrusted files. */
2751 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2753 for (i = 0; i < n; i++) {
/* Offset and (still little-endian) hash of the i-th referenced data object. */
2760 q = le64toh(o->entry.items[i].object_offset);
2761 le_hash = o->entry.items[i].hash;
/* From here on o refers to the data object, no longer to the entry. */
2763 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
/* The hash recorded in the entry is compared with the data object's own. */
2767 if (le_hash != o->data.hash)
/* Payload length: object size minus the header that precedes data.payload. */
2770 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2773 /* We hit the limit on 32bit machines */
2774 if ((uint64_t) t != l)
/* Compressed payloads are decompressed into the source file's
 * compress_buffer. -EPROTONOSUPPORT is presumably returned when neither
 * XZ nor LZ4 support is compiled in -- confirm the #else branch. */
2777 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2778 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
2781 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2782 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2786 data = from->compress_buffer;
2789 return -EPROTONOSUPPORT;
2792 data = o->data.payload;
/* Append the payload to the destination; u / h receive the resulting data
 * object and its offset. */
2794 r = journal_file_append_data(to, data, l, &u, &h);
2798 xor_hash ^= le64toh(u->data.hash);
2799 items[i].object_offset = htole64(h);
2800 items[i].hash = u->data.hash;
/* o was repointed at a data object above, so load the source entry at p
 * again. */
2802 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2807 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
/* Completes and sanitizes the size limits in *m. A field equal to
 * (uint64_t) -1 is treated as unset and receives a default, which for
 * max_use and keep_free is derived from the size of the file system
 * behind fd. The resulting values are logged at debug level. */
2810 void journal_default_metrics(JournalMetrics *m, int fd) {
/* Stays 0 if the file system size cannot be determined. */
2811 uint64_t fs_size = 0;
2813 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
/* Total file system size in bytes.
 * NOTE(review): the product is computed in the operands' own type before
 * being stored in 64 bits -- confirm that type is 64-bit on all targets. */
2818 if (fstatvfs(fd, &ss) >= 0)
2819 fs_size = ss.f_frsize * ss.f_blocks;
/* max_use unset: 10% of the file system, clamped to the range
 * DEFAULT_MAX_USE_LOWER..DEFAULT_MAX_USE_UPPER. */
2821 if (m->max_use == (uint64_t) -1) {
2824 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2826 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2827 m->max_use = DEFAULT_MAX_USE_UPPER;
2829 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2830 m->max_use = DEFAULT_MAX_USE_LOWER;
/* Fallback to the lower bound (presumably when fs_size is unknown). */
2832 m->max_use = DEFAULT_MAX_USE_LOWER;
/* max_use page-aligned and raised to at least twice the minimum journal
 * file size (presumably the branch for a caller-supplied value). */
2834 m->max_use = PAGE_ALIGN(m->max_use);
2836 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2837 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
/* max_size unset: one eighth of max_use, capped at DEFAULT_MAX_SIZE_UPPER. */
2840 if (m->max_size == (uint64_t) -1) {
2841 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2843 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2844 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2846 m->max_size = PAGE_ALIGN(m->max_size);
/* A single file may not be smaller than JOURNAL_FILE_SIZE_MIN. */
2848 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2849 m->max_size = JOURNAL_FILE_SIZE_MIN;
/* max_use is raised so that it covers at least two files of max_size. */
2851 if (m->max_size*2 > m->max_use)
2852 m->max_use = m->max_size*2;
/* min_size unset: use JOURNAL_FILE_SIZE_MIN. */
2854 if (m->min_size == (uint64_t) -1)
2855 m->min_size = JOURNAL_FILE_SIZE_MIN;
2857 m->min_size = PAGE_ALIGN(m->min_size);
2859 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2860 m->min_size = JOURNAL_FILE_SIZE_MIN;
/* Keep max_size at or above min_size. */
2862 if (m->min_size > m->max_size)
2863 m->max_size = m->min_size;
/* keep_free unset: 15% of the file system, capped at
 * DEFAULT_KEEP_FREE_UPPER; DEFAULT_KEEP_FREE is the fallback (presumably
 * when fs_size is unknown). */
2866 if (m->keep_free == (uint64_t) -1) {
2869 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2871 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2872 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2875 m->keep_free = DEFAULT_KEEP_FREE;
2878 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2879 format_bytes(a, sizeof(a), m->max_use),
2880 format_bytes(b, sizeof(b), m->max_size),
2881 format_bytes(c, sizeof(c), m->min_size),
2882 format_bytes(d, sizeof(d), m->keep_free));
/* Reports the realtime range covered by the file: *from receives the
 * header's head_entry_realtime and *to its tail_entry_realtime, both
 * converted from little-endian. */
2885 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
/* A zero timestamp is special-cased; comparing against 0 needs no byte
 * swap. */
2890 if (f->header->head_entry_realtime == 0)
2893 *from = le64toh(f->header->head_entry_realtime);
2897 if (f->header->tail_entry_realtime == 0)
2900 *to = le64toh(f->header->tail_entry_realtime);
/* Reports the monotonic time range covered for one boot: *from is taken
 * from the first and *to from the last entry linked from the "_BOOT_ID="
 * data object of boot_id. */
2906 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
/* p remembers the data object's offset so it can be loaded again below. */
2914 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
/* Special case: no entries are linked from this boot ID. */
2918 if (le64toh(o->data.n_entries) <= 0)
/* First entry: the one stored directly in the data object's entry_offset.
 * After this call o refers to that entry, not to the data object. */
2922 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2926 *from = le64toh(o->entry.monotonic);
/* Load the data object again, since o was repointed above. */
2930 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* Last entry: index n_entries - 1 of the data object's entry list. */
2934 r = generic_array_get_plus_one(f,
2935 le64toh(o->data.entry_offset),
2936 le64toh(o->data.entry_array_offset),
2937 le64toh(o->data.n_entries)-1,
2942 *to = le64toh(o->entry.monotonic);
2948 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2951 /* If we gained new header fields we gained new features,
2952 * hence suggest a rotation */
2953 if (le64toh(f->header->header_size) < sizeof(Header)) {
2954 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2958 /* Let's check if the hash tables grew over a certain fill
2959 * level (75%, borrowing this value from Java's hash table
2960 * implementation), and if so suggest a rotation. To calculate
2961 * the fill level we need the n_data field, which only exists
2962 * in newer versions. */
2964 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2965 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2966 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2968 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2969 le64toh(f->header->n_data),
2970 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2971 (unsigned long long) f->last_stat.st_size,
2972 f->last_stat.st_size / le64toh(f->header->n_data));
2976 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2977 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2978 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2980 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2981 le64toh(f->header->n_fields),
2982 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2986 /* Are the data objects properly indexed by field objects? */
2987 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2988 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2989 le64toh(f->header->n_data) > 0 &&
2990 le64toh(f->header->n_fields) == 0)
2993 if (max_file_usec > 0) {
2996 h = le64toh(f->header->head_entry_realtime);
2997 t = now(CLOCK_REALTIME);
2999 if (h > 0 && t > h + max_file_usec)