1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
29 #include <sys/xattr.h>
31 #include "journal-def.h"
32 #include "journal-file.h"
33 #include "journal-authenticate.h"
/* Default size in bytes of the data hash table (2047 HashItem slots). It is
 * used as the lower bound when the data hash table of a new file is sized. */
38 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
/* Fixed size in bytes of the field hash table (333 HashItem slots). */
39 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
/* Data payloads of at least this many bytes are candidates for compression. */
41 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
43 /* This is the minimum journal file size */
44 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
46 /* These are the lower and upper bounds if we deduce the max_use value
47 * from the file system size */
48 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
49 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
51 /* This is the upper bound if we deduce max_size from max_use */
52 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
54 /* This is the upper bound if we deduce the keep_free value from the
 * file system size */
56 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
58 /* This is the keep_free value when we can't determine the file system
 * size */
60 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
62 /* Smallest header we accept: everything up to n_data, which was the first entry we added after the initial file format design */
63 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
65 /* How many items to keep in the entry array chain cache at most; once reached, an existing item is recycled */
66 #define CHAIN_CACHE_MAX 20
68 /* Granularity to which the journal file size is rounded up each time we allocate something new. */
69 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8 MiB */
/* Switches the header state of the journal file to STATE_ONLINE.
 *
 * The file must have an open fd and a mapped header (checked first). What
 * happens next depends on the current header state, dispatched by the switch
 * below. */
71 static int journal_file_set_online(JournalFile *f) {
77 if (!(f->fd >= 0 && f->header))
80 switch(f->header->state) {
85 f->header->state = STATE_ONLINE;
/* Switches the header state of the journal file to STATE_OFFLINE.
 *
 * The file must have an open fd and a mapped header (checked first), and the
 * current state is compared against STATE_ONLINE before STATE_OFFLINE is
 * stored. */
94 int journal_file_set_offline(JournalFile *f) {
100 if (!(f->fd >= 0 && f->header))
103 if (f->header->state != STATE_ONLINE)
108 f->header->state = STATE_OFFLINE;
/* Tears down a JournalFile.
 *
 * In order: appends a final tag if sealing is enabled on a writable file,
 * drops the fd from the mmap cache, marks the file offline, then releases the
 * header mapping, the mmap cache reference, the chain cache, the compression
 * buffer (only when built with XZ or LZ4 support) and the sealing state (FSS
 * file mapping or FSPRG state, and the HMAC handle).
 *
 * The results of journal_file_append_tag() and journal_file_set_offline() are
 * not checked here; closing is best-effort. */
115 void journal_file_close(JournalFile *f) {
119 /* Write the final tag */
120 if (f->seal && f->writable)
121 journal_file_append_tag(f);
124 /* Sync everything to disk, before we mark the file offline */
125 if (f->mmap && f->fd >= 0)
126 mmap_cache_close_fd(f->mmap, f->fd);
128 journal_file_set_offline(f);
131 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
137 mmap_cache_unref(f->mmap);
139 hashmap_free_free(f->chain_cache);
141 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
142 free(f->compress_buffer);
147 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
148 else if (f->fsprg_state)
149 free(f->fsprg_state);
154 gcry_md_close(f->hmac);
/* Builds a fresh Header in the local 'h' and writes it to offset 0 of f->fd
 * with pwrite().
 *
 * The header gets the file signature, its own 64-bit aligned size, the
 * incompatible/compatible flag bits derived from f->compress_xz,
 * f->compress_lz4 and f->seal, and a random file_id.
 *
 * In one branch seqnum_id and tail_entry_seqnum are inherited from
 * template's header; in the other seqnum_id is set to the new file_id.
 * NOTE(review): presumably the branch tests whether 'template' is non-NULL;
 * the condition is not visible here, confirm. */
160 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
167 memcpy(h.signature, HEADER_SIGNATURE, 8);
168 h.header_size = htole64(ALIGN64(sizeof(h)));
170 h.incompatible_flags |= htole32(
171 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
172 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
174 h.compatible_flags = htole32(
175 f->seal * HEADER_COMPATIBLE_SEALED);
177 r = sd_id128_randomize(&h.file_id);
182 h.seqnum_id = template->header->seqnum_id;
183 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
185 h.seqnum_id = h.file_id;
187 k = pwrite(f->fd, &h, sizeof(h), 0);
/* Stamps the mapped header with the current machine ID and boot ID and brings
 * the file online.
 *
 * If the boot ID already stored in the header equals the current one, the
 * tail entry's monotonic timestamp is flagged as valid, which allows it to be
 * compared against the timestamps of newly appended entries. */
197 static int journal_file_refresh_header(JournalFile *f) {
203 r = sd_id128_get_machine(&f->header->machine_id);
207 r = sd_id128_get_boot(&boot_id);
211 if (sd_id128_equal(boot_id, f->header->boot_id))
212 f->tail_entry_monotonic_valid = true;
214 f->header->boot_id = boot_id;
216 journal_file_set_online(f);
218 /* Sync the online state to disk */
/* Validates the mapped header of an existing journal file and copies the
 * compression/seal settings found in it into 'f'.
 *
 * Checks performed: signature, incompatible flags (always), compatible flags
 * (only when writable), state range, minimum header size, presence of
 * n_entry_arrays in sealed files, header+arena fitting into the file, the
 * tail object lying inside header+arena, 64-bit validity of the main offsets,
 * the machine ID and the recorded state.
 *
 * Returns -EPROTONOSUPPORT when the file uses flags this build does not
 * support. */
224 static int journal_file_verify_header(JournalFile *f) {
229 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
232 /* In both read and write mode we refuse to open files with
233 * incompatible flags we don't know */
234 flags = le32toh(f->header->incompatible_flags);
235 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
236 if (flags & ~HEADER_INCOMPATIBLE_ANY)
237 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
238 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
/* Narrow 'flags' down to the bits that are known but not compiled in */
239 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
241 log_debug("Journal file %s uses incompatible flags %"PRIx32
242 " disabled at compilation time.", f->path, flags);
243 return -EPROTONOSUPPORT;
246 /* When open for writing we refuse to open files with
247 * compatible flags, too */
248 flags = le32toh(f->header->compatible_flags);
249 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
250 if (flags & ~HEADER_COMPATIBLE_ANY)
251 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
252 f->path, flags & ~HEADER_COMPATIBLE_ANY);
/* Narrow 'flags' down to the bits that are known but not compiled in */
253 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
255 log_debug("Journal file %s uses compatible flags %"PRIx32
256 " disabled at compilation time.", f->path, flags);
257 return -EPROTONOSUPPORT;
260 if (f->header->state >= _STATE_MAX)
263 /* The first addition was n_data, so check that we are at least this large */
264 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
/* A sealed file must be new enough to carry the n_entry_arrays field */
267 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
/* NOTE(review): header_size and arena_size both come from the file; their
 * 64-bit sum could wrap around for hostile values and slip past the two
 * range checks below. Confirm whether an explicit overflow check is wanted. */
270 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
273 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
276 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
277 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
278 !VALID64(le64toh(f->header->tail_object_offset)) ||
279 !VALID64(le64toh(f->header->entry_array_offset)))
284 sd_id128_t machine_id;
287 r = sd_id128_get_machine(&machine_id);
291 if (!sd_id128_equal(machine_id, f->header->machine_id))
294 state = f->header->state;
296 if (state == STATE_ONLINE) {
297 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
299 } else if (state == STATE_ARCHIVED)
301 else if (state != STATE_OFFLINE) {
302 log_debug("Journal file %s has unknown state %u.", f->path, state);
/* Adopt the on-disk compression and sealing settings */
307 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
308 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
310 f->seal = JOURNAL_HEADER_SEALED(f->header);
/* Makes sure the file is large enough to hold 'size' bytes at 'offset',
 * growing it with posix_fallocate() if necessary, and updates the header's
 * arena_size and the cached stat data.
 *
 * The current size is taken as header_size + arena_size. The target size is
 * offset + size rounded up to a page and never smaller than the header.
 * Nothing needs to grow if the target does not exceed the current size. The
 * target is checked against metrics.max_size (when set) and, for files above
 * metrics.min_size with metrics.keep_free set, against the free space reported
 * by fstatvfs() minus keep_free. The target is then rounded up to a multiple
 * of FILE_SIZE_INCREASE and clamped to max_size.
 *
 * NOTE(review): 'available' is computed from f_bfree, which counts blocks
 * that may be reserved for privileged users; f_bavail may be the intended
 * field. Confirm.
 * NOTE(review): offset + size is unsigned 64-bit arithmetic and can wrap for
 * very large arguments. Confirm callers bound both.
 * NOTE(review): posix_fallocate() reports failure through its return value,
 * not errno; the handling of 'r' is not visible here. Confirm. */
315 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
316 uint64_t old_size, new_size;
321 /* We assume that this file is not sparse, and we know that
322 * for sure, since we always call posix_fallocate()
326 le64toh(f->header->header_size) +
327 le64toh(f->header->arena_size);
329 new_size = PAGE_ALIGN(offset + size);
330 if (new_size < le64toh(f->header->header_size))
331 new_size = le64toh(f->header->header_size);
333 if (new_size <= old_size)
336 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
339 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
342 if (fstatvfs(f->fd, &svfs) >= 0) {
345 available = svfs.f_bfree * svfs.f_bsize;
347 if (available >= f->metrics.keep_free)
348 available -= f->metrics.keep_free;
352 if (new_size - old_size > available)
357 /* Increase by larger blocks at once */
358 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
359 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
360 new_size = f->metrics.max_size;
362 /* Note that the glibc fallocate() fallback is very
363 inefficient, hence we try to minimize the allocation area
365 r = posix_fallocate(f->fd, old_size, new_size - old_size);
369 if (fstat(f->fd, &f->last_stat) < 0)
372 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
/* Maps 'size' bytes at 'offset' of the journal file through the mmap cache
 * and hands the mapping back via 'ret'.
 *
 * The range is first checked against the cached stat size. If it looks out of
 * range the stat data is refreshed once; -EADDRNOTAVAIL is returned if
 * fstat() fails or the range still lies beyond the end of the file.
 * Otherwise the result of mmap_cache_get() is returned.
 *
 * NOTE(review): offset + size is unsigned 64-bit arithmetic and wraps for
 * offsets near UINT64_MAX, which would defeat the range check. Confirm that
 * callers bound both values. */
377 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
384 /* Avoid SIGBUS on invalid accesses */
385 if (offset + size > (uint64_t) f->last_stat.st_size) {
386 /* Hmm, out of range? Let's refresh the fstat() data
387 * first, before we trust that check. */
389 if (fstat(f->fd, &f->last_stat) < 0 ||
390 offset + size > (uint64_t) f->last_stat.st_size)
391 return -EADDRNOTAVAIL;
394 return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
/* Returns the smallest valid on-disk size of an object of o's type.
 *
 * Known types map to the size of their type-specific struct. Types beyond the
 * end of the table, or without a table entry, fall back to
 * sizeof(ObjectHeader). */
397 static uint64_t minimum_header_size(Object *o) {
399 static const uint64_t table[] = {
400 [OBJECT_DATA] = sizeof(DataObject),
401 [OBJECT_FIELD] = sizeof(FieldObject),
402 [OBJECT_ENTRY] = sizeof(EntryObject),
403 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
404 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
405 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
406 [OBJECT_TAG] = sizeof(TagObject),
409 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
410 return sizeof(ObjectHeader);
412 return table[o->object.type];
/* Maps the object stored at 'offset' and validates its header.
 *
 * The offset must be 64-bit aligned. Only the ObjectHeader is mapped first;
 * the object is then checked for a size of at least sizeof(ObjectHeader), a
 * type above OBJECT_UNUSED, a size of at least the minimum for its type and,
 * if 'type' is positive, a matching type. Objects larger than the bare header
 * are mapped again in full.
 *
 * NOTE(review): the first mapping passes type_to_context(type) as the cache
 * context while the second passes o->object.type directly. Confirm whether
 * the second call should also go through type_to_context(). */
415 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
424 /* Objects may only be located at multiple of 64 bit */
425 if (!VALID64(offset))
428 r = journal_file_move_to(f, type_to_context(type), false, offset, sizeof(ObjectHeader), &t);
433 s = le64toh(o->object.size);
435 if (s < sizeof(ObjectHeader))
438 if (o->object.type <= OBJECT_UNUSED)
441 if (s < minimum_header_size(o))
444 if (type > 0 && o->object.type != type)
447 if (s > sizeof(ObjectHeader)) {
448 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
/* Picks the sequence number for the next entry.
 *
 * The candidate is one past the header's tail_entry_seqnum, reconciled with
 * the optional external counter 'seqnum' as described in the comment below.
 * The result is stored as the new tail_entry_seqnum and, if the header has no
 * head_entry_seqnum yet, as head_entry_seqnum as well. */
459 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
464 r = le64toh(f->header->tail_entry_seqnum) + 1;
467 /* If an external seqnum counter was passed, we update
468 * both the local and the external one, and set it to
469 * the maximum of both */
477 f->header->tail_entry_seqnum = htole64(r);
479 if (f->header->head_entry_seqnum == 0)
480 f->header->head_entry_seqnum = htole64(r);
/* Appends a new object of the given type and size at the end of the arena.
 *
 * 'type' must be a valid object type and 'size' at least
 * sizeof(ObjectHeader). The file is brought online first. The new object goes
 * directly after the header in one branch, or after the current tail object
 * (at the 64-bit aligned end of it) in the other. Space is allocated, the
 * region is mapped, the object's type and size are filled in, and the header's
 * tail_object_offset and n_objects are updated.
 *
 * NOTE(review): the mapping call below passes the raw 'type' as the cache
 * context, whereas journal_file_move_to_object() uses type_to_context() for
 * its first mapping. Confirm that this is intended. */
485 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
492 assert(type > 0 && type < _OBJECT_TYPE_MAX);
493 assert(size >= sizeof(ObjectHeader));
497 r = journal_file_set_online(f);
501 p = le64toh(f->header->tail_object_offset);
503 p = le64toh(f->header->header_size);
505 r = journal_file_move_to_object(f, -1, p, &tail);
509 p += ALIGN64(le64toh(tail->object.size));
512 r = journal_file_allocate(f, p, size);
516 r = journal_file_move_to(f, type, false, p, size, &t);
523 o->object.type = type;
524 o->object.size = htole64(size);
526 f->header->tail_object_offset = htole64(p);
527 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
/* Creates the data hash table object of a new journal file.
 *
 * The table size in bytes is derived from metrics.max_size, with
 * DEFAULT_DATA_HASH_TABLE_SIZE as the lower bound. The table is appended as
 * an OBJECT_DATA_HASH_TABLE object, its items are zeroed, and the header
 * records the offset of the items array and the table size in bytes.
 *
 * NOTE(review): max_size * 4 is computed before the divisions and could wrap
 * for a max_size above UINT64_MAX / 4. Confirm max_size is bounded. */
535 static int journal_file_setup_data_hash_table(JournalFile *f) {
542 /* We estimate that we need 1 hash table entry per 768 bytes of
543 journal file and we want to make sure we never get beyond
544 75% fill level. Calculate the hash table size for the
545 maximum file size based on these metrics. */
547 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
548 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
549 s = DEFAULT_DATA_HASH_TABLE_SIZE;
551 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
553 r = journal_file_append_object(f,
554 OBJECT_DATA_HASH_TABLE,
555 offsetof(Object, hash_table.items) + s,
560 memzero(o->hash_table.items, s);
562 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
563 f->header->data_hash_table_size = htole64(s);
/* Creates the field hash table object of a new journal file.
 *
 * The table always has DEFAULT_FIELD_HASH_TABLE_SIZE bytes. It is appended as
 * an OBJECT_FIELD_HASH_TABLE object, its items are zeroed, and the header
 * records the offset of the items array and the table size in bytes. */
568 static int journal_file_setup_field_hash_table(JournalFile *f) {
575 /* We use a fixed size hash table for the fields as this
576 * number should grow very slowly only */
578 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
579 r = journal_file_append_object(f,
580 OBJECT_FIELD_HASH_TABLE,
581 offsetof(Object, hash_table.items) + s,
586 memzero(o->hash_table.items, s);
588 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
589 f->header->field_hash_table_size = htole64(s);
/* Maps the data hash table, using the offset and size recorded in the header,
 * and stores the mapping in f->data_hash_table. */
594 static int journal_file_map_data_hash_table(JournalFile *f) {
601 p = le64toh(f->header->data_hash_table_offset);
602 s = le64toh(f->header->data_hash_table_size);
604 r = journal_file_move_to(f,
605 OBJECT_DATA_HASH_TABLE,
612 f->data_hash_table = t;
/* Maps the field hash table, using the offset and size recorded in the
 * header, and stores the mapping in f->field_hash_table. */
616 static int journal_file_map_field_hash_table(JournalFile *f) {
623 p = le64toh(f->header->field_hash_table_offset);
624 s = le64toh(f->header->field_hash_table_size);
626 r = journal_file_move_to(f,
627 OBJECT_FIELD_HASH_TABLE,
634 f->field_hash_table = t;
/* Links a freshly appended field object into the field hash table.
 *
 * The object must be of type OBJECT_FIELD. Its chain pointers are reset, the
 * bucket is selected as hash modulo the number of HashItems in the table, and
 * the object becomes the bucket's new tail: either also its head, or the
 * successor (next_hash_offset) of the previous tail. The header's n_fields
 * counter is incremented if the header has that field.
 *
 * NOTE(review): the modulo divides by field_hash_table_size / sizeof(HashItem)
 * and would divide by zero for an empty table. Confirm that callers only get
 * here with a table set up. */
638 static int journal_file_link_field(
651 if (o->object.type != OBJECT_FIELD)
654 /* This might alter the window we are looking at */
656 o->field.next_hash_offset = o->field.head_data_offset = 0;
658 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
659 p = le64toh(f->field_hash_table[h].tail_hash_offset);
661 f->field_hash_table[h].head_hash_offset = htole64(offset);
663 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
667 o->field.next_hash_offset = htole64(offset);
670 f->field_hash_table[h].tail_hash_offset = htole64(offset);
672 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
673 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
/* Links a freshly appended data object into the data hash table.
 *
 * The object must be of type OBJECT_DATA. Its chain pointers and entry
 * bookkeeping are reset, the bucket is selected as hash modulo the number of
 * HashItems in the table, and the object becomes the bucket's new tail:
 * either also its head, or the successor (next_hash_offset) of the previous
 * tail. The header's n_data counter is incremented if the header has that
 * field.
 *
 * NOTE(review): the modulo divides by data_hash_table_size / sizeof(HashItem)
 * and would divide by zero for an empty table. Confirm that callers only get
 * here with a table set up. */
678 static int journal_file_link_data(
691 if (o->object.type != OBJECT_DATA)
694 /* This might alter the window we are looking at */
696 o->data.next_hash_offset = o->data.next_field_offset = 0;
697 o->data.entry_offset = o->data.entry_array_offset = 0;
698 o->data.n_entries = 0;
700 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
701 p = le64toh(f->data_hash_table[h].tail_hash_offset);
703 /* Only entry in the hash table is easy */
704 f->data_hash_table[h].head_hash_offset = htole64(offset);
706 /* Move back to the previous data object, to patch in
709 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
713 o->data.next_hash_offset = htole64(offset);
716 f->data_hash_table[h].tail_hash_offset = htole64(offset);
718 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
719 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
/* Looks up the field object whose payload equals 'field' ('size' bytes, must
 * be non-empty), given its precomputed hash.
 *
 * An empty field hash table is handled up front. Otherwise the bucket chain
 * selected by the hash is walked via next_hash_offset; an object matches when
 * its stored hash, its total object size and its payload bytes all agree. */
724 int journal_file_find_field_object_with_hash(
726 const void *field, uint64_t size, uint64_t hash,
727 Object **ret, uint64_t *offset) {
729 uint64_t p, osize, h;
733 assert(field && size > 0);
735 osize = offsetof(Object, field.payload) + size;
737 if (f->header->field_hash_table_size == 0)
740 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
741 p = le64toh(f->field_hash_table[h].head_hash_offset);
746 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
750 if (le64toh(o->field.hash) == hash &&
751 le64toh(o->object.size) == osize &&
752 memcmp(o->field.payload, field, size) == 0) {
762 p = le64toh(o->field.next_hash_offset);
/* Convenience wrapper: hashes 'field' with hash64() and delegates to
 * journal_file_find_field_object_with_hash(). 'field' must be non-empty. */
768 int journal_file_find_field_object(
770 const void *field, uint64_t size,
771 Object **ret, uint64_t *offset) {
776 assert(field && size > 0);
778 hash = hash64(field, size);
780 return journal_file_find_field_object_with_hash(f,
/* Looks up the data object whose payload equals 'data' ('size' bytes, may be
 * empty), given its precomputed hash.
 *
 * An empty data hash table is handled up front. Otherwise the bucket chain
 * selected by the hash is walked via next_hash_offset, and the stored hash is
 * compared first for every object. Compressed objects are decompressed into
 * f->compress_buffer and compared there; if compression support is not
 * compiled in, -EPROTONOSUPPORT is returned for them. Uncompressed objects
 * match when their total object size and payload bytes agree. */
785 int journal_file_find_data_object_with_hash(
787 const void *data, uint64_t size, uint64_t hash,
788 Object **ret, uint64_t *offset) {
790 uint64_t p, osize, h;
794 assert(data || size == 0);
796 osize = offsetof(Object, data.payload) + size;
798 if (f->header->data_hash_table_size == 0)
801 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
802 p = le64toh(f->data_hash_table[h].head_hash_offset);
807 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
811 if (le64toh(o->data.hash) != hash)
814 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
815 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
819 l = le64toh(o->object.size);
820 if (l <= offsetof(Object, data.payload))
823 l -= offsetof(Object, data.payload);
825 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
826 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
831 memcmp(f->compress_buffer, data, size) == 0) {
842 return -EPROTONOSUPPORT;
844 } else if (le64toh(o->object.size) == osize &&
845 memcmp(o->data.payload, data, size) == 0) {
857 p = le64toh(o->data.next_hash_offset);
/* Convenience wrapper: hashes 'data' with hash64() and delegates to
 * journal_file_find_data_object_with_hash(). 'data' may be NULL only when
 * 'size' is 0. */
863 int journal_file_find_data_object(
865 const void *data, uint64_t size,
866 Object **ret, uint64_t *offset) {
871 assert(data || size == 0);
873 hash = hash64(data, size);
875 return journal_file_find_data_object_with_hash(f,
/* Adds a field object for 'field' ('size' bytes, must be non-empty).
 *
 * The field is hashed and looked up first (presumably so that an existing
 * object can be reused; the handling of the lookup result is not visible
 * here). A new OBJECT_FIELD object is then appended, filled with the hash and
 * payload, linked into the field hash table, re-mapped (linking may have
 * moved the mapping window) and fed into the HMAC. */
880 static int journal_file_append_field(
882 const void *field, uint64_t size,
883 Object **ret, uint64_t *offset) {
891 assert(field && size > 0);
893 hash = hash64(field, size);
895 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
909 osize = offsetof(Object, field.payload) + size;
910 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
914 o->field.hash = htole64(hash);
915 memcpy(o->field.payload, field, size);
917 r = journal_file_link_field(f, o, p, hash);
921 /* The linking might have altered the window, so let's
922 * refresh our pointer */
923 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
928 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
/* Adds a data object for 'data' ('size' bytes; 'data' may be NULL only when
 * 'size' is 0).
 *
 * The data is hashed and looked up first (presumably so that an existing
 * object can be reused; the handling of the lookup result is not visible
 * here). A new OBJECT_DATA object is then appended. When compression support
 * is compiled in, f->compress_xz is set and the payload is at least
 * COMPRESSION_SIZE_THRESHOLD bytes, the payload is compressed in place and
 * the object size and flags are adjusted; otherwise the payload is copied
 * verbatim. The object is linked into the data hash table and re-mapped. If
 * the payload contains a '=' that is not its first byte, the prefix before it
 * is registered as a field object and the data object is pushed onto the
 * front of that field's data list. Finally the object is fed into the HMAC.
 *
 * NOTE(review): only f->compress_xz is tested although the guard also covers
 * HAVE_LZ4. Confirm whether LZ4-only files are meant to be compressed here.
 * NOTE(review): memchr() is called with 'data' even when size == 0, in which
 * case 'data' may be NULL per the assertion above. Confirm this is acceptable. */
942 static int journal_file_append_data(
944 const void *data, uint64_t size,
945 Object **ret, uint64_t *offset) {
950 int r, compression = 0;
954 assert(data || size == 0);
956 hash = hash64(data, size);
958 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
972 osize = offsetof(Object, data.payload) + size;
973 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
977 o->data.hash = htole64(hash);
979 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
980 if (f->compress_xz &&
981 size >= COMPRESSION_SIZE_THRESHOLD) {
984 compression = compress_blob(data, size, o->data.payload, &rsize);
987 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
988 o->object.flags |= compression;
990 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
991 size, rsize, object_compressed_to_string(compression));
996 if (!compression && size > 0)
997 memcpy(o->data.payload, data, size);
999 r = journal_file_link_data(f, o, p, hash);
1003 /* The linking might have altered the window, so let's
1004 * refresh our pointer */
1005 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1012 eq = memchr(data, '=', size);
1013 if (eq && eq > data) {
1017 /* Create field object ... */
1018 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1022 /* ... and link it in. */
1023 o->data.next_field_offset = fo->field.head_data_offset;
/* NOTE(review): 'p' is a host-order offset stored into an on-disk
 * little-endian field, so htole64() would express the intent; le64toh()
 * happens to perform the same byte swap on the usual platforms. Confirm. */
1024 fo->field.head_data_offset = le64toh(p);
1028 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
/* Returns the number of EntryItems stored in an entry object, derived from
 * the object size minus the fixed part before entry.items. Objects of any
 * other type are handled by the type check below. */
1042 uint64_t journal_file_entry_n_items(Object *o) {
1045 if (o->object.type != OBJECT_ENTRY)
1048 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
/* Returns the number of 64-bit slots in an entry array object, derived from
 * the object size minus the fixed part before entry_array.items. Objects of
 * any other type are handled by the type check below. */
1051 uint64_t journal_file_entry_array_n_items(Object *o) {
1054 if (o->object.type != OBJECT_ENTRY_ARRAY)
1057 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
/* Returns the number of HashItems in a data or field hash table object,
 * derived from the object size minus the fixed part before hash_table.items.
 * Objects of any other type are handled by the type check below. */
1060 uint64_t journal_file_hash_table_n_items(Object *o) {
1063 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1064 o->object.type != OBJECT_FIELD_HASH_TABLE)
1067 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
/* Stores entry offset 'p' at logical index *idx of the entry array chain that
 * starts at *first. Both *first and *idx are kept in little-endian order.
 *
 * The chain is walked from *first. If the target slot lies in an existing
 * array, 'p' is written there and *idx is incremented. Otherwise a new
 * OBJECT_ENTRY_ARRAY object is appended and fed into the HMAC, 'p' is stored
 * in it, and the new array is hooked up either as *first or as the
 * next_entry_array_offset of the previous array ('ap'). The header's
 * n_entry_arrays counter is incremented if the header has that field, and
 * *idx is incremented. */
1070 static int link_entry_into_array(JournalFile *f,
1075 uint64_t n = 0, ap = 0, q, i, a, hidx;
1083 a = le64toh(*first);
1084 i = hidx = le64toh(*idx);
1087 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1091 n = journal_file_entry_array_n_items(o);
1093 o->entry_array.items[i] = htole64(p);
1094 *idx = htole64(hidx + 1);
1100 a = le64toh(o->entry_array.next_entry_array_offset);
1111 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1112 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1118 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1123 o->entry_array.items[i] = htole64(p);
1126 *first = htole64(q);
1128 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1132 o->entry_array.next_entry_array_offset = htole64(q);
1135 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1136 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1138 *idx = htole64(hidx + 1);
/* Variant of link_entry_into_array() for lists that keep one item inline in
 * '*extra' in addition to the array chain.
 *
 * In one branch 'p' is stored directly in *extra; in the other it goes into
 * the array chain at index *idx - 1. *idx is incremented afterwards.
 * NOTE(review): presumably the inline slot is used when *idx is 0; the
 * condition is not visible here, confirm. */
1143 static int link_entry_into_array_plus_one(JournalFile *f,
1158 *extra = htole64(p);
1162 i = htole64(le64toh(*idx) - 1);
1163 r = link_entry_into_array(f, first, &i, p);
1168 *idx = htole64(le64toh(*idx) + 1);
/* Registers the entry at 'offset' with the data object referenced by item
 * 'i' of entry 'o': the data object is mapped and the entry is added to its
 * entry list (inline entry_offset plus entry_array_offset chain). */
1172 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1179 p = le64toh(o->entry.items[i].object_offset);
1183 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1187 return link_entry_into_array_plus_one(f,
1188 &o->data.entry_offset,
1189 &o->data.entry_array_offset,
/* Makes a freshly written entry object visible.
 *
 * The object must be of type OBJECT_ENTRY. After a full memory barrier the
 * entry is added to the global entry array (which also advances the header's
 * n_entries), the header's head/tail realtime and tail monotonic timestamps
 * are updated, and the entry is linked into the entry list of every data
 * object it references. */
1194 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1202 if (o->object.type != OBJECT_ENTRY)
1205 __sync_synchronize();
1207 /* Link up the entry itself */
1208 r = link_entry_into_array(f,
1209 &f->header->entry_array_offset,
1210 &f->header->n_entries,
1215 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1217 if (f->header->head_entry_realtime == 0)
1218 f->header->head_entry_realtime = o->entry.realtime;
1220 f->header->tail_entry_realtime = o->entry.realtime;
1221 f->header->tail_entry_monotonic = o->entry.monotonic;
1223 f->tail_entry_monotonic_valid = true;
1225 /* Link up the items */
1226 n = journal_file_entry_n_items(o);
1227 for (i = 0; i < n; i++) {
1228 r = journal_file_link_entry_item(f, o, offset, i);
/* Writes an entry object for already-appended data items and links it in.
 *
 * An OBJECT_ENTRY object sized for n_items EntryItems is appended and filled
 * with the next sequence number, the items, both timestamps, the xor hash and
 * the header's boot ID. It is then fed into the HMAC and linked via
 * journal_file_link_entry().
 *
 * NOTE(review): memcpy() is called with 'items' even when n_items == 0, in
 * which case 'items' may be NULL per the assertion below. Confirm this is
 * acceptable. */
1236 static int journal_file_append_entry_internal(
1238 const dual_timestamp *ts,
1240 const EntryItem items[], unsigned n_items,
1242 Object **ret, uint64_t *offset) {
1249 assert(items || n_items == 0);
1252 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1254 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1258 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1259 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1260 o->entry.realtime = htole64(ts->realtime);
1261 o->entry.monotonic = htole64(ts->monotonic);
1262 o->entry.xor_hash = htole64(xor_hash);
1263 o->entry.boot_id = f->header->boot_id;
1266 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1271 r = journal_file_link_entry(f, o, np);
/* Notifies inotify watchers that the file changed.
 *
 * After a memory barrier the file is truncated to the size recorded in
 * f->last_stat, which leaves its size unchanged but generates the IN_MODIFY
 * event that writes through the memory mapping do not. A failing ftruncate()
 * is only logged. */
1284 void journal_file_post_change(JournalFile *f) {
1287 /* inotify() does not receive IN_MODIFY events from file
1288 * accesses done via mmap(). After each access we hence
1289 * trigger IN_MODIFY by truncating the journal file to its
1290 * current size which triggers IN_MODIFY. */
1292 __sync_synchronize();
1294 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1295 log_error("Failed to truncate file to its own size: %m");
/* qsort()-style comparator for EntryItems: orders them by their
 * object_offset, compared in host byte order. */
1298 static int entry_item_cmp(const void *_a, const void *_b) {
1299 const EntryItem *a = _a, *b = _b;
1301 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1303 if (le64toh(a->object_offset) > le64toh(b->object_offset))
/* Appends one log entry made up of n_iovec data fields.
 *
 * A local timestamp '_ts' is obtained with dual_timestamp_get() (presumably
 * only when the caller passes no 'ts'; the condition is not visible here).
 * The monotonic timestamp is checked against the header's tail entry when
 * that value is known to be valid. A tag is appended if one is due. Each
 * iovec is then appended (or found) as a data object and its offset and hash
 * are collected. The items are sorted by on-disk offset, the entry object is
 * written, and watchers are notified via journal_file_post_change().
 *
 * NOTE(review): the items array is alloca()'d with a size proportional to the
 * caller-supplied n_iovec. Confirm callers bound n_iovec so this cannot
 * exhaust the stack. */
1308 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1312 uint64_t xor_hash = 0;
1313 struct dual_timestamp _ts;
1316 assert(iovec || n_iovec == 0);
1319 dual_timestamp_get(&_ts);
1323 if (f->tail_entry_monotonic_valid &&
1324 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1328 r = journal_file_maybe_append_tag(f, ts->realtime);
1333 /* alloca() can't take 0, hence let's allocate at least one */
1334 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1336 for (i = 0; i < n_iovec; i++) {
1340 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1344 xor_hash ^= le64toh(o->data.hash);
1345 items[i].object_offset = htole64(p);
1346 items[i].hash = o->data.hash;
1349 /* Order by the position on disk, in order to improve seek
1350 * times for rotating media. */
1351 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1353 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1355 journal_file_post_change(f);
/* One entry of the per-file chain cache. It remembers how far into an entry
 * array chain a previous lookup got, so that later lookups on the same chain
 * can resume there. Items are keyed by 'first'. */
1360 typedef struct ChainCacheItem {
1361 uint64_t first; /* the array at the beginning of the chain */
1362 uint64_t array; /* the cached array */
1363 uint64_t begin; /* the first item in the cached array */
1364 uint64_t total; /* the total number of items in all arrays before this one in the chain */
1365 uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
/* Records the position reached in an entry array chain in the chain cache
 * 'h', keyed by the offset of the chain's first array.
 *
 * Positions in the first array of a chain are not cached. When the cache
 * already holds CHAIN_CACHE_MAX items, an existing item is recycled via
 * hashmap_steal_first(); a new ChainCacheItem is allocated in the other
 * branch. An item that is already in the cache must belong to the same chain
 * (asserted). */
1368 static void chain_cache_put(
1375 uint64_t last_index) {
1378 /* If the chain item to cache for this chain is the
1379 * first one it's not worth caching anything */
1383 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1384 ci = hashmap_steal_first(h);
1386 ci = new(ChainCacheItem, 1);
1393 if (hashmap_put(h, &ci->first, ci) < 0) {
1398 assert(ci->first == first);
1403 ci->last_index = last_index;
/* Fetches the entry at logical index 'i' of the entry array chain that starts
 * at 'first'.
 *
 * The chain cache is consulted first and used when the wanted index lies
 * beyond the items preceding the cached array. The chain is then walked array
 * by array until the one containing index 'i' is found. The position is
 * cached for the next call, and the entry object at the stored offset is
 * mapped. */
1406 static int generic_array_get(
1410 Object **ret, uint64_t *offset) {
1413 uint64_t p = 0, a, t = 0;
1421 /* Try the chain cache first */
1422 ci = hashmap_get(f->chain_cache, &first);
1423 if (ci && i > ci->total) {
1432 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1436 k = journal_file_entry_array_n_items(o);
1438 p = le64toh(o->entry_array.items[i]);
1444 a = le64toh(o->entry_array.next_entry_array_offset);
1450 /* Let's cache this item for the next invocation */
1451 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1453 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Like generic_array_get(), for lists that keep one item inline ('extra') in
 * front of the array chain: one branch maps the entry at 'extra' directly,
 * the other looks up index i-1 in the chain.
 * NOTE(review): presumably the inline item corresponds to i == 0; the
 * condition is not visible here, confirm. */
1466 static int generic_array_get_plus_one(
1471 Object **ret, uint64_t *offset) {
1480 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1493 return generic_array_get(f, first, i-1, ret, offset);
/* Bisects the entry array chain starting at 'first' (holding 'n' items in
 * total) for 'needle', using test_object() to classify each probed entry
 * offset as TEST_FOUND, TEST_LEFT or TEST_RIGHT.
 *
 * A TEST_FOUND result is treated as TEST_RIGHT when searching with
 * DIRECTION_DOWN and as TEST_LEFT otherwise, so the search keeps narrowing
 * towards the boundary match in the requested direction.
 *
 * The chain cache may let the search skip ahead to a previously visited
 * array, but only if the needle lies to the right of that array's first
 * item. Within an array, the last index looked at in a previous call (if
 * cached) is probed together with its immediate neighbours before ordinary
 * bisection takes over. When searching with DIRECTION_UP the result may be
 * the item just before the position found ('subtract_one').
 *
 * The final position is cached, the matching entry object is mapped, and the
 * logical index t + i (minus one if 'subtract_one') is stored in *idx. */
1502 static int generic_array_bisect(
1507 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1508 direction_t direction,
1513 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1514 bool subtract_one = false;
1515 Object *o, *array = NULL;
1520 assert(test_object);
1522 /* Start with the first array in the chain */
1525 ci = hashmap_get(f->chain_cache, &first);
1526 if (ci && n > ci->total) {
1527 /* Ah, we have iterated this bisection array chain
1528 * previously! Let's see if we can skip ahead in the
1529 * chain, as far as the last time. But we can't jump
1530 * backwards in the chain, so let's check that
1533 r = test_object(f, ci->begin, needle);
1537 if (r == TEST_LEFT) {
1538 /* OK, what we are looking for is right of the
1539 * begin of this EntryArray, so let's jump
1540 * straight to previously cached array in the
1546 last_index = ci->last_index;
1551 uint64_t left, right, k, lp;
1553 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1557 k = journal_file_entry_array_n_items(array);
1563 lp = p = le64toh(array->entry_array.items[i]);
1567 r = test_object(f, p, needle);
1571 if (r == TEST_FOUND)
1572 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1574 if (r == TEST_RIGHT) {
1578 if (last_index != (uint64_t) -1) {
1579 assert(last_index <= right);
1581 /* If we cached the last index we
1582 * looked at, let's try to not to jump
1583 * too wildly around and see if we can
1584 * limit the range to look at early to
1585 * the immediate neighbors of the last
1586 * index we looked at. */
1588 if (last_index > 0) {
1589 uint64_t x = last_index - 1;
1591 p = le64toh(array->entry_array.items[x]);
1595 r = test_object(f, p, needle);
1599 if (r == TEST_FOUND)
1600 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1602 if (r == TEST_RIGHT)
1608 if (last_index < right) {
1609 uint64_t y = last_index + 1;
1611 p = le64toh(array->entry_array.items[y]);
1615 r = test_object(f, p, needle);
1619 if (r == TEST_FOUND)
1620 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1622 if (r == TEST_RIGHT)
1630 if (left == right) {
1631 if (direction == DIRECTION_UP)
1632 subtract_one = true;
1638 assert(left < right);
1639 i = (left + right) / 2;
1641 p = le64toh(array->entry_array.items[i]);
1645 r = test_object(f, p, needle);
1649 if (r == TEST_FOUND)
1650 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1652 if (r == TEST_RIGHT)
1660 if (direction == DIRECTION_UP) {
1662 subtract_one = true;
1673 last_index = (uint64_t) -1;
1674 a = le64toh(array->entry_array.next_entry_array_offset);
1680 if (subtract_one && t == 0 && i == 0)
1683 /* Let's cache this item for the next invocation */
1684 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1686 if (subtract_one && i == 0)
1688 else if (subtract_one)
1689 p = le64toh(array->entry_array.items[i-1]);
1691 p = le64toh(array->entry_array.items[i]);
1693 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1704 *idx = t + i + (subtract_one ? -1 : 0);
/* Like generic_array_bisect(), for lists that keep one item inline ('extra')
 * in front of the array chain.
 *
 * The inline item is tested first, with TEST_FOUND mapped to TEST_RIGHT or
 * TEST_LEFT by direction exactly as in generic_array_bisect(). The chain is
 * then bisected over the remaining n-1 items. When searching with
 * DIRECTION_UP, the inline item is remembered ('step_back') and used as the
 * result if the chain yields no match. */
1710 static int generic_array_bisect_plus_one(
1716 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1717 direction_t direction,
1723 bool step_back = false;
1727 assert(test_object);
1732 /* This bisects the array in object 'first', but first checks
1734 r = test_object(f, extra, needle);
1738 if (r == TEST_FOUND)
1739 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1741 /* if we are looking with DIRECTION_UP then we need to first
1742 see if in the actual array there is a matching entry, and
1743 return the last one of that. But if there isn't any we need
1744 to return this one. Hence remember this, and return it
1747 step_back = direction == DIRECTION_UP;
1749 if (r == TEST_RIGHT) {
1750 if (direction == DIRECTION_DOWN)
1756 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1758 if (r == 0 && step_back)
1767 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
/* Bisection predicate that classifies entry offset 'p' relative to 'needle'
 * by comparing the two offsets directly; no object is mapped. */
1783 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1789 else if (p < needle)
/* Seeks to an entry by file offset: bisects the global entry array (the
 * header's entry_array_offset, n_entries items) in the given direction. */
1795 int journal_file_move_to_entry_by_offset(
1798 direction_t direction,
1802 return generic_array_bisect(f,
1803 le64toh(f->header->entry_array_offset),
1804 le64toh(f->header->n_entries),
/* Bisection predicate that maps the entry at offset 'p' and classifies its
 * sequence number (in host byte order) relative to 'needle'. */
1812 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1819 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1823 if (le64toh(o->entry.seqnum) == needle)
1825 else if (le64toh(o->entry.seqnum) < needle)
/* Seeks to an entry by sequence number: bisects the global entry array (the
 * header's entry_array_offset, n_entries items) in the given direction. */
1831 int journal_file_move_to_entry_by_seqnum(
1834 direction_t direction,
1838 return generic_array_bisect(f,
1839 le64toh(f->header->entry_array_offset),
1840 le64toh(f->header->n_entries),
/* Bisection predicate that maps the entry at offset 'p' and classifies its
 * realtime timestamp (in host byte order) relative to 'needle'. */
1847 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1854 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1858 if (le64toh(o->entry.realtime) == needle)
1860 else if (le64toh(o->entry.realtime) < needle)
/* Seeks to an entry by realtime timestamp: bisects the global entry array
 * (the header's entry_array_offset, n_entries items) with
 * test_object_realtime() in the given direction. */
1866 int journal_file_move_to_entry_by_realtime(
1869 direction_t direction,
1873 return generic_array_bisect(f,
1874 le64toh(f->header->entry_array_offset),
1875 le64toh(f->header->n_entries),
1877 test_object_realtime,
/* Bisection predicate that maps the entry at offset 'p' and classifies its
 * monotonic timestamp (in host byte order) relative to 'needle'. */
1882 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1889 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1893 if (le64toh(o->entry.monotonic) == needle)
1895 else if (le64toh(o->entry.monotonic) < needle)
/* Looks up the data object for "_BOOT_ID=<boot_id>".
 *
 * The buffer holds the 9-character "_BOOT_ID=" prefix, 32 characters for the
 * formatted ID written by sd_id128_to_string() and a terminating NUL; the NUL
 * is excluded from the lookup size. */
1901 static inline int find_data_object_by_boot_id(
1906 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1908 sd_id128_to_string(boot_id, t + 9);
1909 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
/* Seeks to an entry by monotonic timestamp within one boot.
 *
 * Monotonic timestamps are only comparable within a boot, so the data object
 * for the given boot ID is located first and the bisection (with
 * test_object_monotonic()) then runs over that data object's own entry list
 * rather than over the global entry array. */
1912 int journal_file_move_to_entry_by_monotonic(
1916 direction_t direction,
1925 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1931 return generic_array_bisect_plus_one(f,
1932 le64toh(o->data.entry_offset),
1933 le64toh(o->data.entry_array_offset),
1934 le64toh(o->data.n_entries),
1936 test_object_monotonic,
/* Step from the entry o (at offset p) to its neighbour in the file-wide entry
 * array, in the given direction. The visible code also computes a start index
 * at the first or last array slot; presumably that path is taken when no
 * current entry is supplied -- TODO confirm. */
1941 int journal_file_next_entry(
1943 Object *o, uint64_t p,
1944 direction_t direction,
1945 Object **ret, uint64_t *offset) {
/* If an object is passed, its offset must be non-zero */
1951 assert(p > 0 || !o);
1953 n = le64toh(f->header->n_entries);
/* First slot when walking down, last slot when walking up */
1958 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1960 if (o->object.type != OBJECT_ENTRY)
/* Bisect the file-wide entry array; presumably this yields the index of
 * the current entry -- the remaining arguments are outside this excerpt */
1963 r = generic_array_bisect(f,
1964 le64toh(f->header->entry_array_offset),
1965 le64toh(f->header->n_entries),
1974 if (direction == DIRECTION_DOWN) {
1987 /* And jump to it */
1988 r = generic_array_get(f,
1989 le64toh(f->header->entry_array_offset),
/* The entry found must lie strictly beyond p in the direction of travel;
 * anything else is logged as entry array corruption */
1996 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
1997 log_debug("%s: entry array corrupted at entry %"PRIu64,
/* Move a signed number of entries ('skip') away from the entry o at offset p
 * within the file-wide entry array and fetch the entry at the resulting
 * index via generic_array_get(). */
2008 int journal_file_skip_entry(
2010 Object *o, uint64_t p,
2012 Object **ret, uint64_t *offset) {
2021 if (o->object.type != OBJECT_ENTRY)
/* Bisect the file-wide entry array; presumably this yields the index i of
 * the current entry -- the remaining arguments are outside this excerpt */
2024 r = generic_array_bisect(f,
2025 le64toh(f->header->entry_array_offset),
2026 le64toh(f->header->n_entries),
2035 /* Calculate new index */
/* Backward skips are negated before widening to uint64_t, so the
 * comparison and subtraction below operate on the magnitude */
2037 if ((uint64_t) -skip >= i)
2040 i = i - (uint64_t) -skip;
2042 i += (uint64_t) skip;
2044 n = le64toh(f->header->n_entries);
2051 return generic_array_get(f,
2052 le64toh(f->header->entry_array_offset),
/* Like journal_file_next_entry(), but restricted to the entries linked from
 * the data object at data_offset: indices refer to that object's entry list
 * (inline first entry plus entry array) rather than the file-wide array. */
2057 int journal_file_next_entry_for_data(
2059 Object *o, uint64_t p,
2060 uint64_t data_offset,
2061 direction_t direction,
2062 Object **ret, uint64_t *offset) {
/* If an object is passed, its offset must be non-zero */
2069 assert(p > 0 || !o);
2071 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2075 n = le64toh(d->data.n_entries);
/* First slot when walking down, last slot when walking up */
2080 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2082 if (o->object.type != OBJECT_ENTRY)
2085 r = generic_array_bisect_plus_one(f,
2086 le64toh(d->data.entry_offset),
2087 le64toh(d->data.entry_array_offset),
2088 le64toh(d->data.n_entries),
2098 if (direction == DIRECTION_DOWN) {
2112 return generic_array_get_plus_one(f,
2113 le64toh(d->data.entry_offset),
2114 le64toh(d->data.entry_array_offset),
/* Seek by file offset among the entries linked from the data object at
 * data_offset: the data object is loaded and its entry list is bisected. */
2119 int journal_file_move_to_entry_by_offset_for_data(
2121 uint64_t data_offset,
2123 direction_t direction,
2124 Object **ret, uint64_t *offset) {
2131 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2135 return generic_array_bisect_plus_one(f,
2136 le64toh(d->data.entry_offset),
2137 le64toh(d->data.entry_array_offset),
2138 le64toh(d->data.n_entries),
/* Seek by (boot ID, monotonic timestamp) among the entries linked from the
 * data object at data_offset. Two entry lists are involved: the one of the
 * boot ID's data object (searched by time) and the one of the requested data
 * object; the function looks for an entry present in both. */
2145 int journal_file_move_to_entry_by_monotonic_for_data(
2147 uint64_t data_offset,
2150 direction_t direction,
2151 Object **ret, uint64_t *offset) {
2159 /* First, seek by time */
/* b receives the offset of the boot ID data object so that it can be
 * loaded again further down */
2160 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2166 r = generic_array_bisect_plus_one(f,
2167 le64toh(o->data.entry_offset),
2168 le64toh(o->data.entry_array_offset),
2169 le64toh(o->data.n_entries),
2171 test_object_monotonic,
2177 /* And now, continue seeking until we find an entry that
2178 * exists in both bisection arrays */
/* Search the entry list of the requested data object ... */
2184 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2188 r = generic_array_bisect_plus_one(f,
2189 le64toh(d->data.entry_offset),
2190 le64toh(d->data.entry_array_offset),
2191 le64toh(d->data.n_entries),
/* ... then load the boot ID data object again via b and search its list */
2199 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2203 r = generic_array_bisect_plus_one(f,
2204 le64toh(o->data.entry_offset),
2205 le64toh(o->data.entry_array_offset),
2206 le64toh(o->data.n_entries),
/* Seek by sequence number among the entries linked from the data object at
 * data_offset: the data object is loaded and its entry list is bisected. */
2228 int journal_file_move_to_entry_by_seqnum_for_data(
2230 uint64_t data_offset,
2232 direction_t direction,
2233 Object **ret, uint64_t *offset) {
2240 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2244 return generic_array_bisect_plus_one(f,
2245 le64toh(d->data.entry_offset),
2246 le64toh(d->data.entry_array_offset),
2247 le64toh(d->data.n_entries),
/* Seek by wallclock timestamp among the entries linked from the data object
 * at data_offset: the data object is loaded and its entry list is bisected
 * with test_object_realtime() as the comparator. */
2254 int journal_file_move_to_entry_by_realtime_for_data(
2256 uint64_t data_offset,
2258 direction_t direction,
2259 Object **ret, uint64_t *offset) {
2266 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2270 return generic_array_bisect_plus_one(f,
2271 le64toh(d->data.entry_offset),
2272 le64toh(d->data.entry_array_offset),
2273 le64toh(d->data.n_entries),
2275 test_object_realtime,
/* Debugging aid: print the file header, then visit objects starting at the
 * first offset behind the header and print a short description of each one
 * to stdout. */
2280 void journal_file_dump(JournalFile *f) {
2287 journal_file_print_header(f);
/* The first object starts where the header ends */
2289 p = le64toh(f->header->header_size);
/* A type of -1 is passed here; presumably it means "accept any object
 * type" -- TODO confirm against journal_file_move_to_object() */
2291 r = journal_file_move_to_object(f, -1, p, &o);
2295 switch (o->object.type) {
2298 printf("Type: OBJECT_UNUSED\n");
2302 printf("Type: OBJECT_DATA\n");
2306 printf("Type: OBJECT_FIELD\n");
2310 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2311 le64toh(o->entry.seqnum),
2312 le64toh(o->entry.monotonic),
2313 le64toh(o->entry.realtime));
2316 case OBJECT_FIELD_HASH_TABLE:
2317 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2320 case OBJECT_DATA_HASH_TABLE:
2321 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2324 case OBJECT_ENTRY_ARRAY:
2325 printf("Type: OBJECT_ENTRY_ARRAY\n");
2329 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2330 le64toh(o->tag.seqnum),
2331 le64toh(o->tag.epoch));
2335 printf("Type: unknown (%u)\n", o->object.type);
/* Only objects with a compression flag set get a "Flags:" line */
2339 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2340 printf("Flags: %s\n",
2341 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
/* The header records the offset of the last object */
2343 if (p == le64toh(f->header->tail_object_offset))
/* The next object follows at the 64-bit aligned end of this one */
2346 p = p + ALIGN64(le64toh(o->object.size));
2351 log_error("File corrupt");
/* Wrapper around format_timestamp() writing into buf (of size l). Judging by
 * the name it substitutes a fallback string when formatting fails; that part
 * is outside this excerpt -- TODO confirm. */
2354 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2357 x = format_timestamp(buf, l, t);
/* Print a human-readable summary of the journal file header to stdout.
 * Multi-byte header fields are converted from little-endian before printing;
 * counters that only exist in newer header versions are printed only if the
 * header is large enough to contain them. */
2363 void journal_file_print_header(JournalFile *f) {
/* 33 bytes each for the four 128-bit IDs formatted by sd_id128_to_string() */
2364 char a[33], b[33], c[33], d[33];
2365 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2367 char bytes[FORMAT_BYTES_MAX];
/* NOTE(review): z is sized FORMAT_TIMESTAMP_MAX but is filled by
 * format_timespan() below -- confirm that this size is adequate for
 * timespans, or whether a dedicated timespan size constant should be used. */
2371 printf("File Path: %s\n"
2375 "Sequential Number ID: %s\n"
2377 "Compatible Flags:%s%s\n"
2378 "Incompatible Flags:%s%s%s\n"
2379 "Header size: %"PRIu64"\n"
2380 "Arena size: %"PRIu64"\n"
2381 "Data Hash Table Size: %"PRIu64"\n"
2382 "Field Hash Table Size: %"PRIu64"\n"
2383 "Rotate Suggested: %s\n"
2384 "Head Sequential Number: %"PRIu64"\n"
2385 "Tail Sequential Number: %"PRIu64"\n"
2386 "Head Realtime Timestamp: %s\n"
2387 "Tail Realtime Timestamp: %s\n"
2388 "Tail Monotonic Timestamp: %s\n"
2389 "Objects: %"PRIu64"\n"
2390 "Entry Objects: %"PRIu64"\n",
2392 sd_id128_to_string(f->header->file_id, a),
2393 sd_id128_to_string(f->header->machine_id, b),
2394 sd_id128_to_string(f->header->boot_id, c),
2395 sd_id128_to_string(f->header->seqnum_id, d),
2396 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2397 f->header->state == STATE_ONLINE ? "ONLINE" :
2398 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2399 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2400 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2401 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2402 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2403 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
2404 le64toh(f->header->header_size),
2405 le64toh(f->header->arena_size),
2406 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2407 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2408 yes_no(journal_file_rotate_suggested(f, 0)),
2409 le64toh(f->header->head_entry_seqnum),
2410 le64toh(f->header->tail_entry_seqnum),
2411 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2412 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2413 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2414 le64toh(f->header->n_objects),
2415 le64toh(f->header->n_entries));
/* Fill level is the object count relative to the number of hash table
 * items (table size in bytes divided by sizeof(HashItem)) */
2417 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2418 printf("Data Objects: %"PRIu64"\n"
2419 "Data Hash Table Fill: %.1f%%\n",
2420 le64toh(f->header->n_data),
2421 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2423 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2424 printf("Field Objects: %"PRIu64"\n"
2425 "Field Hash Table Fill: %.1f%%\n",
2426 le64toh(f->header->n_fields),
2427 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2429 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2430 printf("Tag Objects: %"PRIu64"\n",
2431 le64toh(f->header->n_tags));
2432 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2433 printf("Entry Array Objects: %"PRIu64"\n",
2434 le64toh(f->header->n_entry_arrays));
/* Disk usage is printed only if fstat() succeeds; the block count is
 * scaled by 512 to obtain bytes */
2436 if (fstat(f->fd, &st) >= 0)
2437 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
/* Open a journal file. The steps visible in this excerpt are: check the
 * access mode and the file name suffix, allocate the JournalFile, open and
 * stat the file, initialise the header if the file is empty and writable,
 * map the header and (for pre-existing files) verify it, settle the metrics,
 * and finally set up (new file) or map the hash tables.
 * journal_file_close() is called on f near the end of the function,
 * presumably on the failure path only -- TODO confirm. */
2440 int journal_file_open(
2446 JournalMetrics *metrics,
2447 MMapCache *mmap_cache,
2448 JournalFile *template,
2449 JournalFile **ret) {
2453 bool newly_created = false;
/* Detect access modes other than O_RDONLY and O_RDWR */
2458 if ((flags & O_ACCMODE) != O_RDONLY &&
2459 (flags & O_ACCMODE) != O_RDWR)
/* Detect names that end in neither ".journal" nor ".journal~" */
2462 if (!endswith(fname, ".journal") &&
2463 !endswith(fname, ".journal~"))
2466 f = new0(JournalFile, 1);
2474 f->prot = prot_from_flags(flags);
2475 f->writable = (flags & O_ACCMODE) != O_RDONLY;
/* 'compress' selects at most one algorithm: LZ4 if compiled in, otherwise
 * XZ if that is compiled in */
2476 #if defined(HAVE_LZ4)
2477 f->compress_lz4 = compress;
2478 #elif defined(HAVE_XZ)
2479 f->compress_xz = compress;
/* Either take a reference on the caller's mmap cache or create a new one;
 * presumably the latter happens when none was passed in */
2486 f->mmap = mmap_cache_ref(mmap_cache);
2488 f->mmap = mmap_cache_new();
2495 f->path = strdup(fname);
2501 f->chain_cache = hashmap_new(&uint64_hash_ops);
2502 if (!f->chain_cache) {
2507 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2513 if (fstat(f->fd, &f->last_stat) < 0) {
/* An empty file opened for writing is treated as a new journal */
2518 if (f->last_stat.st_size == 0 && f->writable) {
2521 /* Let's attach the creation time to the journal file,
2522 * so that the vacuuming code knows the age of this
2523 * file even if the file might end up corrupted one
2524 * day... Ideally we'd just use the creation time many
2525 * file systems maintain for each file, but there is
2526 * currently no usable API to query this, hence let's
2527 * emulate this via extended attributes. If extended
2528 * attributes are not supported we'll just skip this,
2529 * and rely solely on mtime/atime/ctime of the file.*/
/* The attribute value is stored little-endian; the return value of
 * fsetxattr() is deliberately not checked (see above) */
2531 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2532 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2535 /* Try to load the FSPRG state, and if we can't, then
2536 * just don't do sealing */
2538 r = journal_file_fss_load(f);
2544 r = journal_file_init_header(f, template);
/* Stat again now that the header has been initialised */
2548 if (fstat(f->fd, &f->last_stat) < 0) {
2553 newly_created = true;
/* Detect files too short to hold even the minimal header */
2556 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
/* Map the page-aligned header from the start of the file as a shared
 * mapping */
2561 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2562 if (f->header == MAP_FAILED) {
2568 if (!newly_created) {
2569 r = journal_file_verify_header(f);
2575 if (!newly_created && f->writable) {
2576 r = journal_file_fss_load(f);
/* Metrics: defaults are filled into the caller's struct and then copied,
 * otherwise the template's metrics are taken over */
2584 journal_default_metrics(metrics, f->fd);
2585 f->metrics = *metrics;
2586 } else if (template)
2587 f->metrics = template->metrics;
2589 r = journal_file_refresh_header(f);
2595 r = journal_file_hmac_setup(f);
/* New files get their hash tables created (followed by a first tag); the
 * tables are then mapped */
2600 if (newly_created) {
2601 r = journal_file_setup_field_hash_table(f);
2605 r = journal_file_setup_data_hash_table(f);
2610 r = journal_file_append_first_tag(f);
2616 r = journal_file_map_field_hash_table(f);
2620 r = journal_file_map_data_hash_table(f);
2628 journal_file_close(f);
/* Rotate a journal file: the current file is renamed to an archive name of
 * the form "<base>@<seqnum_id>-<head seqnum>-<head realtime>.journal", marked
 * STATE_ARCHIVED, and a new file is opened under the old path using the old
 * file as template and sharing its mmap cache; the old file is then closed. */
2633 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2634 _cleanup_free_ char *p = NULL;
2636 JournalFile *old_file, *new_file = NULL;
2644 if (!old_file->writable)
2647 if (!endswith(old_file->path, ".journal"))
2650 l = strlen(old_file->path);
/* "l - 8" cuts the 8-character ".journal" suffix off the old path.
 * NOTE(review): old_file and *f are used interchangeably below;
 * presumably old_file was set to *f -- the assignment is outside this
 * excerpt. */
2651 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2652 (int) l - 8, old_file->path,
2653 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2654 le64toh((*f)->header->head_entry_seqnum),
2655 le64toh((*f)->header->head_entry_realtime));
2659 r = rename(old_file->path, p);
2663 old_file->header->state = STATE_ARCHIVED;
/* The old object's path, flags and mode are reused for the new file */
2665 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2666 journal_file_close(old_file);
/* Like journal_file_open(), but with one recovery attempt: if the first open
 * yields one of the error codes listed below, the file is renamed to a
 * "<base>@<realtime>-<64-bit value>.journal~" name and the open is retried
 * once with the same arguments. */
2672 int journal_file_open_reliably(
2678 JournalMetrics *metrics,
2679 MMapCache *mmap_cache,
2680 JournalFile *template,
2681 JournalFile **ret) {
2685 _cleanup_free_ char *p = NULL;
2687 r = journal_file_open(fname, flags, mode, compress, seal,
2688 metrics, mmap_cache, template, ret);
/* Any result other than these six is presumably returned to the caller
 * as-is -- the statement following the condition is outside this excerpt */
2689 if (r != -EBADMSG && /* corrupted */
2690 r != -ENODATA && /* truncated */
2691 r != -EHOSTDOWN && /* other machine */
2692 r != -EPROTONOSUPPORT && /* incompatible feature */
2693 r != -EBUSY && /* unclean shutdown */
2694 r != -ESHUTDOWN /* already archived */)
/* Three further conditions are tested before the rename: read-only
 * access, missing O_CREAT, and a name not ending in ".journal" */
2697 if ((flags & O_ACCMODE) == O_RDONLY)
2700 if (!(flags & O_CREAT))
2703 if (!endswith(fname, ".journal"))
2706 /* The file is corrupted. Rotate it away and try it again (but only once) */
2709 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2711 (unsigned long long) now(CLOCK_REALTIME),
2715 r = rename(fname, p);
2719 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2721 return journal_file_open(fname, flags, mode, compress, seal,
2722 metrics, mmap_cache, template, ret);
/* Copy the entry object o (located at offset p in 'from') into 'to': every
 * data object the entry references is appended to the destination
 * (decompressed first if it carries a compression flag), and a new entry
 * referencing the appended data is then written with the original
 * timestamps. */
2725 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2727 uint64_t q, xor_hash = 0;
2740 ts.monotonic = le64toh(o->entry.monotonic);
2741 ts.realtime = le64toh(o->entry.realtime);
2743 n = journal_file_entry_n_items(o);
2744 /* alloca() can't take 0, hence let's allocate at least one */
2745 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2747 for (i = 0; i < n; i++) {
2754 q = le64toh(o->entry.items[i].object_offset);
2755 le_hash = o->entry.items[i].hash;
/* From here on o points at the data object, no longer at the entry; the
 * entry is looked up again at the end of the iteration */
2757 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
/* Both hashes are still in on-disk byte order, so they can be compared
 * without conversion */
2761 if (le_hash != o->data.hash)
/* Payload length: object size minus the offset of the payload field */
2764 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2767 /* We hit the limit on 32bit machines */
2768 if ((uint64_t) t != l)
2771 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
2772 #if defined(HAVE_XZ) || defined(HAVE_LZ4)
/* Decompress into the source file's reusable compression buffer */
2775 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2776 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2780 data = from->compress_buffer;
/* Presumably the branch taken when no decompressor is compiled in */
2783 return -EPROTONOSUPPORT;
2786 data = o->data.payload;
2788 r = journal_file_append_data(to, data, l, &u, &h);
/* Accumulate the XOR of the hashes of the destination data objects and
 * record offset and hash of each for the new entry */
2792 xor_hash ^= le64toh(u->data.hash);
2793 items[i].object_offset = htole64(h);
2794 items[i].hash = u->data.hash;
/* Point o back at the source entry */
2796 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2801 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
/* Fill in defaults for the fields of *m that are still unset (an unset field
 * holds (uint64_t) -1) and clamp the values against the limits defined at the
 * top of this file. Defaults for max_use and keep_free are derived from the
 * size of the file system behind fd when fstatvfs() succeeds. The resulting
 * values are logged at debug level. */
2804 void journal_default_metrics(JournalMetrics *m, int fd) {
/* Stays 0 if the file system size cannot be determined */
2805 uint64_t fs_size = 0;
2807 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
/* NOTE(review): the product is computed in the operand types of struct
 * statvfs before being stored in a uint64_t -- confirm that it cannot
 * wrap on platforms where these types are 32 bits wide. */
2812 if (fstatvfs(fd, &ss) >= 0)
2813 fs_size = ss.f_frsize * ss.f_blocks;
/* max_use: derived value is clamped to [DEFAULT_MAX_USE_LOWER,
 * DEFAULT_MAX_USE_UPPER]; in every case it ends up page-aligned and is at
 * least large enough for two minimal journal files */
2815 if (m->max_use == (uint64_t) -1) {
2818 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2820 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2821 m->max_use = DEFAULT_MAX_USE_UPPER;
2823 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2824 m->max_use = DEFAULT_MAX_USE_LOWER;
2826 m->max_use = DEFAULT_MAX_USE_LOWER;
2828 m->max_use = PAGE_ALIGN(m->max_use);
2830 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2831 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
/* max_size: defaults to an eighth of max_use, capped at
 * DEFAULT_MAX_SIZE_UPPER, and is never below JOURNAL_FILE_SIZE_MIN */
2834 if (m->max_size == (uint64_t) -1) {
2835 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2837 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2838 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2840 m->max_size = PAGE_ALIGN(m->max_size);
2842 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2843 m->max_size = JOURNAL_FILE_SIZE_MIN;
/* max_use is raised if needed so that two files of max_size fit */
2845 if (m->max_size*2 > m->max_use)
2846 m->max_use = m->max_size*2;
/* min_size: defaults to and is never below JOURNAL_FILE_SIZE_MIN;
 * max_size is raised if it would otherwise be smaller than min_size */
2848 if (m->min_size == (uint64_t) -1)
2849 m->min_size = JOURNAL_FILE_SIZE_MIN;
2851 m->min_size = PAGE_ALIGN(m->min_size);
2853 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2854 m->min_size = JOURNAL_FILE_SIZE_MIN;
2856 if (m->min_size > m->max_size)
2857 m->max_size = m->min_size;
/* keep_free: derived value is capped at DEFAULT_KEEP_FREE_UPPER;
 * DEFAULT_KEEP_FREE is the fallback */
2860 if (m->keep_free == (uint64_t) -1) {
2863 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2865 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2866 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2869 m->keep_free = DEFAULT_KEEP_FREE;
2872 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2873 format_bytes(a, sizeof(a), m->max_use),
2874 format_bytes(b, sizeof(b), m->max_size),
2875 format_bytes(c, sizeof(c), m->min_size),
2876 format_bytes(d, sizeof(d), m->keep_free));
/* Report the wallclock timestamps of the first (*from) and last (*to) entry
 * as recorded in the file header. A header value of zero is tested for
 * before each assignment; its handling is outside this excerpt. */
2879 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
/* Zero is the same in any byte order, so no le64toh() is needed for the
 * comparison */
2884 if (f->header->head_entry_realtime == 0)
2887 *from = le64toh(f->header->head_entry_realtime);
2891 if (f->header->tail_entry_realtime == 0)
2894 *to = le64toh(f->header->tail_entry_realtime);
/* Report the monotonic timestamps of the first (*from) and last (*to) entry
 * linked from the data object of the given boot ID. */
2900 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
/* p is later used as the offset from which the data object is loaded
 * again */
2908 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
2912 if (le64toh(o->data.n_entries) <= 0)
/* The data object's entry_offset refers to its first entry; note that o
 * points at that entry afterwards, not at the data object */
2916 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2920 *from = le64toh(o->entry.monotonic);
/* Load the data object again, as o was overwritten above */
2924 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* Index n_entries-1 selects the last entry of the list */
2928 r = generic_array_get_plus_one(f,
2929 le64toh(o->data.entry_offset),
2930 le64toh(o->data.entry_array_offset),
2931 le64toh(o->data.n_entries)-1,
2936 *to = le64toh(o->entry.monotonic);
2942 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2945 /* If we gained new header fields we gained new features,
2946 * hence suggest a rotation */
2947 if (le64toh(f->header->header_size) < sizeof(Header)) {
2948 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2952 /* Let's check if the hash tables grew over a certain fill
2953 * level (75%, borrowing this value from Java's hash table
2954 * implementation), and if so suggest a rotation. To calculate
2955 * the fill level we need the n_data field, which only exists
2956 * in newer versions. */
2958 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2959 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2960 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2962 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2963 le64toh(f->header->n_data),
2964 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2965 (unsigned long long) f->last_stat.st_size,
2966 f->last_stat.st_size / le64toh(f->header->n_data));
2970 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2971 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2972 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2974 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2975 le64toh(f->header->n_fields),
2976 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2980 /* Are the data objects properly indexed by field objects? */
2981 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2982 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2983 le64toh(f->header->n_data) > 0 &&
2984 le64toh(f->header->n_fields) == 0)
2987 if (max_file_usec > 0) {
2990 h = le64toh(f->header->head_entry_realtime);
2991 t = now(CLOCK_REALTIME);
2993 if (h > 0 && t > h + max_file_usec)