1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
31 #include <attr/xattr.h>
34 #include "journal-def.h"
35 #include "journal-file.h"
36 #include "journal-authenticate.h"
/* Default byte sizes of the data and the field hash table, i.e. room for
 * 2047 and 333 HashItem slots respectively. The data value is used as a
 * lower bound when the data hash table is sized, the field value is used
 * as is. */
41 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
42 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
/* Data payloads of at least this many bytes are candidates for compression */
44 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
46 /* This is the minimum journal file size */
47 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
49 /* These are the lower and upper bounds if we deduce the max_use value
50 * from the file system size */
51 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
52 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
54 /* This is the upper bound if we deduce max_size from max_use */
55 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
57 /* This is the upper bound if we deduce the keep_free value from the file system size */
59 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
61 /* This is the keep_free value when we can't determine the file system size */
63 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
/* HEADER_SIZE_MIN is the smallest header_size accepted when a header is
 * verified: the 64-bit aligned offset of the n_data member. */
65 /* n_data was the first entry we added after the initial file format design */
66 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
/* Once the chain cache holds this many items, chain_cache_put() recycles
 * an existing item instead of allocating a new one. */
68 /* How many entries to keep in the entry array chain cache at max */
69 #define CHAIN_CACHE_MAX 20
/* journal_file_allocate() rounds the new file size up to a multiple of
 * this value. */
71 /* How much to increase the journal file size at once each time we allocate something new. */
72 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
/* Moves the header state of a journal file to STATE_ONLINE.
 * The file needs a valid fd (f->fd >= 0) and a header pointer; what happens
 * next depends on the current value of f->header->state.
 * NOTE(review): the individual switch cases and the return values are not
 * part of this listing - confirm against the complete source. */
74 static int journal_file_set_online(JournalFile *f) {
80 if (!(f->fd >= 0 && f->header))
83 switch(f->header->state) {
88 f->header->state = STATE_ONLINE;
/* Moves the header state of a journal file to STATE_OFFLINE.
 * The file needs a valid fd (f->fd >= 0) and a header pointer, and the
 * current state is tested against STATE_ONLINE before STATE_OFFLINE is
 * stored.
 * NOTE(review): the return values and any syncing around the state change
 * are not part of this listing - confirm against the complete source. */
97 int journal_file_set_offline(JournalFile *f) {
103 if (!(f->fd >= 0 && f->header))
106 if (f->header->state != STATE_ONLINE)
111 f->header->state = STATE_OFFLINE;
/* Tears down a JournalFile and the resources attached to it.
 * In order, the statements below: append a final tag when sealing is on and
 * the file is writable, drop the fd from the mmap cache, mark the header
 * offline, unmap the header, close the fd, drop the mmap cache reference,
 * free the chain cache and the compression buffer, release the sealing
 * state (munmap() for fss_file, otherwise free() for fsprg_state) and call
 * gcry_md_close() on f->hmac.
 * NOTE(review): the guards around several of these calls and the final
 * release of f itself are not part of this listing - confirm which steps
 * are conditional. */
118 void journal_file_close(JournalFile *f) {
122 /* Write the final tag */
123 if (f->seal && f->writable)
124 journal_file_append_tag(f);
127 /* Sync everything to disk, before we mark the file offline */
128 if (f->mmap && f->fd >= 0)
129 mmap_cache_close_fd(f->mmap, f->fd);
131 journal_file_set_offline(f);
134 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
137 close_nointr_nofail(f->fd);
142 mmap_cache_unref(f->mmap);
144 hashmap_free_free(f->chain_cache);
147 free(f->compress_buffer);
152 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
153 else if (f->fsprg_state)
154 free(f->fsprg_state);
159 gcry_md_close(f->hmac);
/* Builds a fresh Header in a local variable and writes it to offset 0 of
 * the file with pwrite().
 * The header receives the format signature, a 64-bit aligned header_size,
 * HEADER_INCOMPATIBLE_COMPRESSED when f->compress is set,
 * HEADER_COMPATIBLE_SEALED when f->seal is set and a randomized file_id.
 * seqnum_id and tail_entry_seqnum are copied from template->header in one
 * branch; in the other branch seqnum_id is set to the new file_id.
 * NOTE(review): the condition selecting between the two branches and the
 * handling of a failed or short pwrite() are not part of this listing. */
165 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
173 memcpy(h.signature, HEADER_SIGNATURE, 8);
174 h.header_size = htole64(ALIGN64(sizeof(h)));
176 h.incompatible_flags =
177 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
180 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
182 r = sd_id128_randomize(&h.file_id);
187 h.seqnum_id = template->header->seqnum_id;
188 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
190 h.seqnum_id = h.file_id;
192 k = pwrite(f->fd, &h, sizeof(h), 0);
/* Refreshes the machine and boot related fields of the header.
 * Stores the machine ID in the header, fetches the current boot ID and
 * compares it with the one already recorded (a match marks the tail
 * monotonic timestamp as valid), stores the current boot ID and then
 * switches the file online.
 * NOTE(review): the error handling for the sd_id128 calls and the sync step
 * announced by the last comment are not part of this listing. */
202 static int journal_file_refresh_header(JournalFile *f) {
208 r = sd_id128_get_machine(&f->header->machine_id);
212 r = sd_id128_get_boot(&boot_id);
216 if (sd_id128_equal(boot_id, f->header->boot_id))
217 f->tail_entry_monotonic_valid = true;
219 f->header->boot_id = boot_id;
221 journal_file_set_online(f);
223 /* Sync the online state to disk */
/* Sanity-checks the header of a journal file.
 * The checks below cover, in order: the signature; incompatible flags
 * (-EPROTONOSUPPORT); compatible flags, per the comment only when open for
 * writing (-EPROTONOSUPPORT); the state being below _STATE_MAX; header_size
 * being at least HEADER_SIZE_MIN; a sealed header containing
 * n_entry_arrays; header_size plus arena_size fitting into the size
 * recorded in f->last_stat; tail_object_offset lying within header plus
 * arena; the hash table, tail object and entry array offsets passing
 * VALID64 and not lying below header_size; the machine ID matching the
 * local one; and the recorded state (online is logged as an unclean
 * closing, unknown states are logged as well). Finally f->compress and
 * f->seal are taken from the header flags.
 * NOTE(review): each flag check appears twice, which looks like two
 * alternatives selected by preprocessor conditionals; those conditionals,
 * the scope of the machine ID check and most failure return codes are not
 * part of this listing - confirm against the complete source. */
229 static int journal_file_verify_header(JournalFile *f) {
232 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
235 /* In both read and write mode we refuse to open files with
236 * incompatible flags we don't know */
238 if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
239 return -EPROTONOSUPPORT;
241 if (f->header->incompatible_flags != 0)
242 return -EPROTONOSUPPORT;
245 /* When open for writing we refuse to open files with
246 * compatible flags, too */
249 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
250 return -EPROTONOSUPPORT;
252 if (f->header->compatible_flags != 0)
253 return -EPROTONOSUPPORT;
257 if (f->header->state >= _STATE_MAX)
260 /* The first addition was n_data, so check that we are at least this large */
261 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
264 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
267 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
270 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
273 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
274 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
275 !VALID64(le64toh(f->header->tail_object_offset)) ||
276 !VALID64(le64toh(f->header->entry_array_offset)))
279 if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
280 le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
281 le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
282 le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
287 sd_id128_t machine_id;
290 r = sd_id128_get_machine(&machine_id);
294 if (!sd_id128_equal(machine_id, f->header->machine_id))
297 state = f->header->state;
299 if (state == STATE_ONLINE) {
300 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
302 } else if (state == STATE_ARCHIVED)
304 else if (state != STATE_OFFLINE) {
305 log_debug("Journal file %s has unknown state %u.", f->path, state);
310 f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
312 f->seal = JOURNAL_HEADER_SEALED(f->header);
/* Grows the file so that size bytes starting at offset are backed by
 * allocated space.
 * The current size is taken as header_size plus arena_size. The wanted size
 * is offset plus size rounded up to a page, but at least header_size.
 * Nothing needs to grow when the wanted size does not exceed the current
 * one. Otherwise the wanted size is checked against metrics.max_size and,
 * above metrics.min_size, against the free space reported by fstatvfs()
 * minus metrics.keep_free. The size is then rounded up to a multiple of
 * FILE_SIZE_INCREASE (capped at max_size), the new range is allocated with
 * posix_fallocate(), f->last_stat is refreshed and arena_size is updated.
 * NOTE(review): available space is computed from f_bfree, which per POSIX
 * counts all free blocks; f_bavail counts those available to unprivileged
 * processes - confirm which one is intended.
 * NOTE(review): offset + size is not visibly checked for wrap-around.
 * NOTE(review): the error codes returned on the limit checks are not part
 * of this listing. */
317 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
318 uint64_t old_size, new_size;
323 /* We assume that this file is not sparse, and we know that
324 * for sure, since we always call posix_fallocate()
328 le64toh(f->header->header_size) +
329 le64toh(f->header->arena_size);
331 new_size = PAGE_ALIGN(offset + size);
332 if (new_size < le64toh(f->header->header_size))
333 new_size = le64toh(f->header->header_size);
335 if (new_size <= old_size)
338 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
341 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
344 if (fstatvfs(f->fd, &svfs) >= 0) {
347 available = svfs.f_bfree * svfs.f_bsize;
349 if (available >= f->metrics.keep_free)
350 available -= f->metrics.keep_free;
354 if (new_size - old_size > available)
359 /* Increase by larger blocks at once */
360 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
361 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
362 new_size = f->metrics.max_size;
364 /* Note that the glibc fallocate() fallback is very
365 inefficient, hence we try to minimize the allocation area
367 r = posix_fallocate(f->fd, old_size, new_size - old_size);
371 if (fstat(f->fd, &f->last_stat) < 0)
374 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
/* Asks the mmap cache for a mapping of size bytes at offset of the file and
 * hands the result back through ret.
 * Before that, the range is compared with the file size cached in
 * f->last_stat. When it appears to lie beyond the end, the stat data is
 * refreshed once with fstat(); if that fails or the range is still out of
 * bounds, -EADDRNOTAVAIL is returned.
 * context and keep_always are passed through to mmap_cache_get() unchanged.
 * NOTE(review): offset + size is not visibly checked for wrap-around. */
379 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
386 /* Avoid SIGBUS on invalid accesses */
387 if (offset + size > (uint64_t) f->last_stat.st_size) {
388 /* Hmm, out of range? Let's refresh the fstat() data
389 * first, before we trust that check. */
391 if (fstat(f->fd, &f->last_stat) < 0 ||
392 offset + size > (uint64_t) f->last_stat.st_size)
393 return -EADDRNOTAVAIL;
396 return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
/* Returns the minimum size an object of the type recorded in o must have.
 * The value comes from a table indexed by object type that holds the size
 * of the matching object structure. Types beyond the end of the table, or
 * with no table entry, fall back to sizeof(ObjectHeader). */
399 static uint64_t minimum_header_size(Object *o) {
401 static const uint64_t table[] = {
402 [OBJECT_DATA] = sizeof(DataObject),
403 [OBJECT_FIELD] = sizeof(FieldObject),
404 [OBJECT_ENTRY] = sizeof(EntryObject),
405 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
406 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
407 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
408 [OBJECT_TAG] = sizeof(TagObject),
411 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
412 return sizeof(ObjectHeader);
414 return table[o->object.type];
/* Maps the object located at offset and validates its header.
 * The offset has to pass VALID64. The mmap context is the requested type
 * when that lies strictly between 0 and _OBJECT_TYPE_MAX, otherwise the
 * catch-all context 0. First only the ObjectHeader is mapped; the recorded
 * size must be at least sizeof(ObjectHeader) and at least
 * minimum_header_size() for the recorded type, the recorded type must be
 * above OBJECT_UNUSED, and when type > 0 it must equal the recorded type.
 * Objects larger than the bare header are then mapped again in full, using
 * the recorded type as context.
 * NOTE(review): the error codes of the failed checks and the assignment to
 * ret are not part of this listing. */
417 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
427 /* Objects may only be located at multiple of 64 bit */
428 if (!VALID64(offset))
431 /* One context for each type, plus one catch-all for the rest */
432 context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
434 r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
439 s = le64toh(o->object.size);
441 if (s < sizeof(ObjectHeader))
444 if (o->object.type <= OBJECT_UNUSED)
447 if (s < minimum_header_size(o))
450 if (type > 0 && o->object.type != type)
453 if (s > sizeof(ObjectHeader)) {
454 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
/* Picks the sequence number for the next entry and records it in the
 * header.
 * The candidate is the tail entry seqnum of the header plus one. Per the
 * comment below, an external counter passed in seqnum is reconciled with
 * it so that both end up at the maximum. The result is stored as
 * tail_entry_seqnum and, while head_entry_seqnum is still 0, also as
 * head_entry_seqnum.
 * NOTE(review): the statements that reconcile with *seqnum and the return
 * statement are not part of this listing. */
465 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
470 r = le64toh(f->header->tail_entry_seqnum) + 1;
473 /* If an external seqnum counter was passed, we update
474 * both the local and the external one, and set it to
475 * the maximum of both */
483 f->header->tail_entry_seqnum = htole64(r);
485 if (f->header->head_entry_seqnum == 0)
486 f->header->head_entry_seqnum = htole64(r);
/* Appends a new object of the given type and size behind the current tail
 * object.
 * type must lie strictly between 0 and _OBJECT_TYPE_MAX and size must be at
 * least sizeof(ObjectHeader). The file is switched online first. The
 * position of the new object is either header_size, or the offset of the
 * current tail object plus its 64-bit aligned size. Space is reserved with
 * journal_file_allocate(), the range is mapped, type and size are written
 * into the object header, and tail_object_offset and n_objects in the file
 * header are updated.
 * NOTE(review): the condition selecting the header_size branch, any
 * zeroing of the new object and the assignments to ret and offset are not
 * part of this listing. */
491 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
498 assert(type > 0 && type < _OBJECT_TYPE_MAX);
499 assert(size >= sizeof(ObjectHeader));
503 r = journal_file_set_online(f);
507 p = le64toh(f->header->tail_object_offset);
509 p = le64toh(f->header->header_size);
511 r = journal_file_move_to_object(f, -1, p, &tail);
515 p += ALIGN64(le64toh(tail->object.size));
518 r = journal_file_allocate(f, p, size);
522 r = journal_file_move_to(f, type, false, p, size, &t);
529 o->object.type = type;
530 o->object.size = htole64(size);
532 f->header->tail_object_offset = htole64(p);
533 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
/* Creates the data hash table object of a new journal file.
 * The table size in bytes is derived from metrics.max_size (number of
 * items = max_size * 4 / 768 / 3) and is never smaller than
 * DEFAULT_DATA_HASH_TABLE_SIZE. The object is appended, its items are
 * zeroed, and the header records the offset of the items array and the
 * table size in bytes. */
541 static int journal_file_setup_data_hash_table(JournalFile *f) {
548 /* We estimate that we need 1 hash table entry per 768 of
549 journal file and we want to make sure we never get beyond
550 75% fill level. Calculate the hash table size for the
551 maximum file size based on these metrics. */
553 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
554 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
555 s = DEFAULT_DATA_HASH_TABLE_SIZE;
557 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
559 r = journal_file_append_object(f,
560 OBJECT_DATA_HASH_TABLE,
561 offsetof(Object, hash_table.items) + s,
566 memset(o->hash_table.items, 0, s);
568 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
569 f->header->data_hash_table_size = htole64(s);
/* Creates the field hash table object of a new journal file.
 * The table always has DEFAULT_FIELD_HASH_TABLE_SIZE bytes. The object is
 * appended, its items are zeroed, and the header records the offset of the
 * items array and the table size in bytes. */
574 static int journal_file_setup_field_hash_table(JournalFile *f) {
581 /* We use a fixed size hash table for the fields as this
582 * number should grow very slowly only */
584 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
585 r = journal_file_append_object(f,
586 OBJECT_FIELD_HASH_TABLE,
587 offsetof(Object, hash_table.items) + s,
592 memset(o->hash_table.items, 0, s);
594 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
595 f->header->field_hash_table_size = htole64(s);
/* Maps the data hash table whose offset and size are recorded in the header
 * and keeps the resulting pointer in f->data_hash_table.
 * NOTE(review): the remaining arguments of the journal_file_move_to() call
 * are not part of this listing. */
600 static int journal_file_map_data_hash_table(JournalFile *f) {
607 p = le64toh(f->header->data_hash_table_offset);
608 s = le64toh(f->header->data_hash_table_size);
610 r = journal_file_move_to(f,
611 OBJECT_DATA_HASH_TABLE,
618 f->data_hash_table = t;
/* Maps the field hash table whose offset and size are recorded in the
 * header and keeps the resulting pointer in f->field_hash_table.
 * NOTE(review): the remaining arguments of the journal_file_move_to() call
 * are not part of this listing. */
622 static int journal_file_map_field_hash_table(JournalFile *f) {
629 p = le64toh(f->header->field_hash_table_offset);
630 s = le64toh(f->header->field_hash_table_size);
632 r = journal_file_move_to(f,
633 OBJECT_FIELD_HASH_TABLE,
640 f->field_hash_table = t;
/* Hooks a field object into the field hash table.
 * The object must be of type OBJECT_FIELD. Its next_hash_offset and
 * head_data_offset are reset to 0. The bucket is hash modulo the number of
 * HashItem slots. The offset is stored either as head of the bucket or as
 * next_hash_offset of the object at the current tail of the bucket, and in
 * both cases becomes the new tail. n_fields in the header is incremented
 * when the header is large enough to contain it.
 * NOTE(review): the parameter list and the condition selecting between the
 * head and the append case are not part of this listing.
 * NOTE(review): no check for a field_hash_table_size of 0 is visible before
 * the modulo operation - confirm that callers rule this out. */
644 static int journal_file_link_field(
657 if (o->object.type != OBJECT_FIELD)
660 /* This might alter the window we are looking at */
662 o->field.next_hash_offset = o->field.head_data_offset = 0;
664 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
665 p = le64toh(f->field_hash_table[h].tail_hash_offset);
667 f->field_hash_table[h].head_hash_offset = htole64(offset);
669 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
673 o->field.next_hash_offset = htole64(offset);
676 f->field_hash_table[h].tail_hash_offset = htole64(offset);
678 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
679 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
/* Hooks a data object into the data hash table.
 * The object must be of type OBJECT_DATA. Its chain pointers, entry offsets
 * and n_entries are reset to 0. The bucket is hash modulo the number of
 * HashItem slots. The offset is stored either as head of the bucket or as
 * next_hash_offset of the object at the current tail of the bucket, and in
 * both cases becomes the new tail. n_data in the header is incremented
 * when the header is large enough to contain it.
 * NOTE(review): the parameter list and the condition selecting between the
 * head and the append case are not part of this listing.
 * NOTE(review): no check for a data_hash_table_size of 0 is visible before
 * the modulo operation - confirm that callers rule this out. */
684 static int journal_file_link_data(
697 if (o->object.type != OBJECT_DATA)
700 /* This might alter the window we are looking at */
702 o->data.next_hash_offset = o->data.next_field_offset = 0;
703 o->data.entry_offset = o->data.entry_array_offset = 0;
704 o->data.n_entries = 0;
706 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
707 p = le64toh(f->data_hash_table[h].tail_hash_offset);
709 /* Only entry in the hash table is easy */
710 f->data_hash_table[h].head_hash_offset = htole64(offset);
712 /* Move back to the previous data object, to patch in
715 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
719 o->data.next_hash_offset = htole64(offset);
722 f->data_hash_table[h].tail_hash_offset = htole64(offset);
724 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
725 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
/* Looks up the field object whose payload equals the size bytes at field,
 * given the precomputed hash of that payload.
 * field must be non-NULL and size non-zero. A field hash table size of 0 in
 * the header is handled before the bucket is computed. The bucket chain is
 * walked from head_hash_offset along next_hash_offset; an object matches
 * when its stored hash, its object size and its payload bytes all agree.
 * NOTE(review): the return values for hit and miss and the assignments to
 * ret and offset are not part of this listing. */
730 int journal_file_find_field_object_with_hash(
732 const void *field, uint64_t size, uint64_t hash,
733 Object **ret, uint64_t *offset) {
735 uint64_t p, osize, h;
739 assert(field && size > 0);
741 osize = offsetof(Object, field.payload) + size;
743 if (f->header->field_hash_table_size == 0)
746 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
747 p = le64toh(f->field_hash_table[h].head_hash_offset);
752 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
756 if (le64toh(o->field.hash) == hash &&
757 le64toh(o->object.size) == osize &&
758 memcmp(o->field.payload, field, size) == 0) {
768 p = le64toh(o->field.next_hash_offset);
/* Convenience wrapper: hashes the size bytes at field with hash64() and
 * delegates to journal_file_find_field_object_with_hash().
 * field must be non-NULL and size non-zero. */
774 int journal_file_find_field_object(
776 const void *field, uint64_t size,
777 Object **ret, uint64_t *offset) {
782 assert(field && size > 0);
784 hash = hash64(field, size);
786 return journal_file_find_field_object_with_hash(f,
/* Looks up the data object whose payload equals the size bytes at data,
 * given the precomputed hash of that payload.
 * data may only be NULL when size is 0. A data hash table size of 0 in the
 * header is handled before the bucket is computed. The bucket chain is
 * walked from head_hash_offset along next_hash_offset and objects whose
 * stored hash differs are passed over. Objects flagged OBJECT_COMPRESSED
 * are decompressed into f->compress_buffer before the comparison; other
 * objects match when object size and payload bytes agree.
 * NOTE(review): the -EPROTONOSUPPORT return presumably belongs to a build
 * without compression support; the preprocessor conditionals, the error
 * codes and the assignments to ret and offset are not part of this
 * listing. */
791 int journal_file_find_data_object_with_hash(
793 const void *data, uint64_t size, uint64_t hash,
794 Object **ret, uint64_t *offset) {
796 uint64_t p, osize, h;
800 assert(data || size == 0);
802 osize = offsetof(Object, data.payload) + size;
804 if (f->header->data_hash_table_size == 0)
807 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
808 p = le64toh(f->data_hash_table[h].head_hash_offset);
813 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
817 if (le64toh(o->data.hash) != hash)
820 if (o->object.flags & OBJECT_COMPRESSED) {
824 l = le64toh(o->object.size);
825 if (l <= offsetof(Object, data.payload))
828 l -= offsetof(Object, data.payload);
830 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0))
834 memcmp(f->compress_buffer, data, size) == 0) {
845 return -EPROTONOSUPPORT;
848 } else if (le64toh(o->object.size) == osize &&
849 memcmp(o->data.payload, data, size) == 0) {
861 p = le64toh(o->data.next_hash_offset);
/* Convenience wrapper: hashes the size bytes at data with hash64() and
 * delegates to journal_file_find_data_object_with_hash().
 * data may only be NULL when size is 0. */
867 int journal_file_find_data_object(
869 const void *data, uint64_t size,
870 Object **ret, uint64_t *offset) {
875 assert(data || size == 0);
877 hash = hash64(data, size);
879 return journal_file_find_data_object_with_hash(f,
/* Makes sure a field object for the size bytes at field exists.
 * The payload is hashed and looked up first. Otherwise a new OBJECT_FIELD
 * object is appended, filled with hash and payload, linked into the field
 * hash table, mapped again (linking may have moved the mapping window) and
 * fed to journal_file_hmac_put_object().
 * NOTE(review): the handling of a successful lookup, the error paths and
 * the assignments to ret and offset are not part of this listing. */
884 static int journal_file_append_field(
886 const void *field, uint64_t size,
887 Object **ret, uint64_t *offset) {
895 assert(field && size > 0);
897 hash = hash64(field, size);
899 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
913 osize = offsetof(Object, field.payload) + size;
914 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
918 o->field.hash = htole64(hash);
919 memcpy(o->field.payload, field, size);
921 r = journal_file_link_field(f, o, p, hash);
925 /* The linking might have altered the window, so let's
926 * refresh our pointer */
927 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
932 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
/* Makes sure a data object for the size bytes at data exists.
 * The payload is hashed and looked up first. Otherwise a new OBJECT_DATA
 * object is appended and its hash stored. Payloads of at least
 * COMPRESSION_SIZE_THRESHOLD bytes may be stored compressed, in which case
 * the object size is reduced to the compressed size and OBJECT_COMPRESSED
 * is set; otherwise the payload is copied verbatim. The object is linked
 * into the data hash table and mapped again. If the payload contains a
 * = that is not its first byte, the part before it is used as field name:
 * the field object is created or found and the data object is pushed onto
 * the front of its data list. Finally the object is fed to
 * journal_file_hmac_put_object().
 * NOTE(review): head_data_offset is assigned le64toh(p) although a
 * host-to-little-endian conversion is meant; both conversions perform the
 * same byte swap, so the stored bytes are presumably unaffected - confirm
 * and consider htole64() for consistency with the other stores.
 * NOTE(review): the first part of the compression condition, the handling
 * of a successful lookup and the assignments to ret and offset are not
 * part of this listing. */
946 static int journal_file_append_data(
948 const void *data, uint64_t size,
949 Object **ret, uint64_t *offset) {
955 bool compressed = false;
959 assert(data || size == 0);
961 hash = hash64(data, size);
963 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
977 osize = offsetof(Object, data.payload) + size;
978 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
982 o->data.hash = htole64(hash);
986 size >= COMPRESSION_SIZE_THRESHOLD) {
989 compressed = compress_blob(data, size, o->data.payload, &rsize);
992 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
993 o->object.flags |= OBJECT_COMPRESSED;
995 log_debug("Compressed data object %"PRIu64" -> %"PRIu64, size, rsize);
1000 if (!compressed && size > 0)
1001 memcpy(o->data.payload, data, size);
1003 r = journal_file_link_data(f, o, p, hash);
1007 /* The linking might have altered the window, so let's
1008 * refresh our pointer */
1009 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1016 eq = memchr(data, '=', size);
1017 if (eq && eq > data) {
1021 /* Create field object ... */
1022 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1026 /* ... and link it in. */
1027 o->data.next_field_offset = fo->field.head_data_offset;
1028 fo->field.head_data_offset = le64toh(p);
1032 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
/* Returns the number of EntryItem slots of an entry object, computed from
 * the object size minus the offset of the items array.
 * NOTE(review): the value returned for objects of another type is not part
 * of this listing; an object size below the items offset would wrap
 * around - confirm callers validate the size first. */
1046 uint64_t journal_file_entry_n_items(Object *o) {
1049 if (o->object.type != OBJECT_ENTRY)
1052 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
/* Returns the number of 64-bit slots of an entry array object, computed
 * from the object size minus the offset of the items array.
 * NOTE(review): the value returned for objects of another type is not part
 * of this listing. */
1055 uint64_t journal_file_entry_array_n_items(Object *o) {
1058 if (o->object.type != OBJECT_ENTRY_ARRAY)
1061 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
/* Returns the number of HashItem slots of a data or field hash table
 * object, computed from the object size minus the offset of the items
 * array.
 * NOTE(review): the value returned for objects of another type is not part
 * of this listing. */
1064 uint64_t journal_file_hash_table_n_items(Object *o) {
1067 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1068 o->object.type != OBJECT_FIELD_HASH_TABLE)
1071 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
/* Stores the entry offset p at position *idx of the chain of entry arrays
 * that starts at *first.
 * The chain is walked array by array. When the position falls into an
 * existing array, p is written there and *idx is incremented. Otherwise a
 * new OBJECT_ENTRY_ARRAY object is appended, fed to
 * journal_file_hmac_put_object(), given p, and linked in either as *first
 * or as next_entry_array_offset of the array at ap. n_entry_arrays in the
 * header is incremented when the header contains it, and *idx is
 * incremented. *first and *idx are kept in little-endian form.
 * NOTE(review): the parameter list, the loop structure and the sizing of
 * the new array are not part of this listing. */
1074 static int link_entry_into_array(JournalFile *f,
1079 uint64_t n = 0, ap = 0, q, i, a, hidx;
1087 a = le64toh(*first);
1088 i = hidx = le64toh(*idx);
1091 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1095 n = journal_file_entry_array_n_items(o);
1097 o->entry_array.items[i] = htole64(p);
1098 *idx = htole64(hidx + 1);
1104 a = le64toh(o->entry_array.next_entry_array_offset);
1115 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1116 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1122 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1127 o->entry_array.items[i] = htole64(p);
1130 *first = htole64(q);
1132 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1136 o->entry_array.next_entry_array_offset = htole64(q);
1139 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1140 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1142 *idx = htole64(hidx + 1);
/* Variant of link_entry_into_array() for lists that keep one entry offset
 * inline in *extra in addition to the array chain.
 * One path stores p in *extra; the other links p into the array chain at
 * index *idx - 1. *idx is then incremented by one.
 * NOTE(review): the parameter list and the condition selecting between the
 * two paths are not part of this listing. */
1147 static int link_entry_into_array_plus_one(JournalFile *f,
1162 *extra = htole64(p);
1166 i = htole64(le64toh(*idx) - 1);
1167 r = link_entry_into_array(f, first, &i, p);
1172 *idx = htole64(le64toh(*idx) + 1);
/* Links an entry into the entry list of the data object referenced by item
 * i of entry object o.
 * The data object offset is read from o->entry.items[i], the data object is
 * mapped, and link_entry_into_array_plus_one() is called on its
 * entry_offset and entry_array_offset members.
 * NOTE(review): the remaining arguments of that call are not part of this
 * listing. */
1176 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1183 p = le64toh(o->entry.items[i].object_offset);
1187 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1191 return link_entry_into_array_plus_one(f,
1192 &o->data.entry_offset,
1193 &o->data.entry_array_offset,
/* Links a freshly written entry object into the file.
 * The object must be of type OBJECT_ENTRY. After a full memory barrier the
 * entry is added to the global entry array chain of the header
 * (entry_array_offset and n_entries). head_entry_realtime is set while
 * still 0, the tail realtime and monotonic timestamps are updated and
 * marked valid, and finally every item of the entry is linked through
 * journal_file_link_entry_item().
 * NOTE(review): error handling inside the item loop is not part of this
 * listing. */
1198 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1206 if (o->object.type != OBJECT_ENTRY)
1209 __sync_synchronize();
1211 /* Link up the entry itself */
1212 r = link_entry_into_array(f,
1213 &f->header->entry_array_offset,
1214 &f->header->n_entries,
1219 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1221 if (f->header->head_entry_realtime == 0)
1222 f->header->head_entry_realtime = o->entry.realtime;
1224 f->header->tail_entry_realtime = o->entry.realtime;
1225 f->header->tail_entry_monotonic = o->entry.monotonic;
1227 f->tail_entry_monotonic_valid = true;
1229 /* Link up the items */
1230 n = journal_file_entry_n_items(o);
1231 for (i = 0; i < n; i++) {
1232 r = journal_file_link_entry_item(f, o, offset, i);
/* Writes one entry object made of n_items prepared EntryItem records.
 * items may only be NULL when n_items is 0. The object is appended with
 * room for all items and filled with the next sequence number, the items,
 * the realtime and monotonic timestamps from ts, xor_hash and the boot ID
 * of the header. It is then fed to journal_file_hmac_put_object() and
 * linked with journal_file_link_entry().
 * NOTE(review): parts of the parameter list, the error paths and the
 * assignments to ret and offset are not part of this listing. */
1240 static int journal_file_append_entry_internal(
1242 const dual_timestamp *ts,
1244 const EntryItem items[], unsigned n_items,
1246 Object **ret, uint64_t *offset) {
1253 assert(items || n_items == 0);
1256 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1258 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1262 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1263 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1264 o->entry.realtime = htole64(ts->realtime);
1265 o->entry.monotonic = htole64(ts->monotonic);
1266 o->entry.xor_hash = htole64(xor_hash);
1267 o->entry.boot_id = f->header->boot_id;
1270 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1275 r = journal_file_link_entry(f, o, np);
/* Notifies inotify watchers that the file changed.
 * After a full memory barrier the file is truncated to the size recorded in
 * f->last_stat, i.e. to its current size, purely for the IN_MODIFY side
 * effect explained below. A failing ftruncate() is logged and otherwise
 * ignored. */
1288 void journal_file_post_change(JournalFile *f) {
1291 /* inotify() does not receive IN_MODIFY events from file
1292 * accesses done via mmap(). After each access we hence
1293 * trigger IN_MODIFY by truncating the journal file to its
1294 * current size which triggers IN_MODIFY. */
1296 __sync_synchronize();
1298 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1299 log_error("Failed to truncate file to its own size: %m");
/* Comparison callback for sorting EntryItem records by their object_offset,
 * compared in host byte order.
 * NOTE(review): the three return values are not part of this listing. */
1302 static int entry_item_cmp(const void *_a, const void *_b) {
1303 const EntryItem *a = _a, *b = _b;
1305 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1307 if (le64toh(a->object_offset) > le64toh(b->object_offset))
/* Appends one log entry consisting of n_iovec payloads.
 * iovec may only be NULL when n_iovec is 0. A local timestamp is obtained
 * with dual_timestamp_get() in one path. The monotonic timestamp is
 * compared with the tail entry of the header when that one is valid. A tag
 * may be appended first via journal_file_maybe_append_tag(). Each payload
 * is stored or found as data object, its hash is folded into xor_hash and
 * its offset and hash are recorded in an item. The items are sorted by
 * file offset and written through journal_file_append_entry_internal(),
 * after which journal_file_post_change() is called.
 * NOTE(review): the items array is allocated with alloca() sized by
 * n_iovec - confirm callers bound n_iovec.
 * NOTE(review): the condition under which the local timestamp is used and
 * the return values are not part of this listing. */
1312 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1316 uint64_t xor_hash = 0;
1317 struct dual_timestamp _ts;
1320 assert(iovec || n_iovec == 0);
1323 dual_timestamp_get(&_ts);
1327 if (f->tail_entry_monotonic_valid &&
1328 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1332 r = journal_file_maybe_append_tag(f, ts->realtime);
1337 /* alloca() can't take 0, hence let's allocate at least one */
1338 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1340 for (i = 0; i < n_iovec; i++) {
1344 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1348 xor_hash ^= le64toh(o->data.hash);
1349 items[i].object_offset = htole64(p);
1350 items[i].hash = o->data.hash;
1353 /* Order by the position on disk, in order to improve seek
1354 * times for rotating media. */
1355 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1357 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1359 journal_file_post_change(f);
/* One cached position inside a chain of entry arrays. Items are stored in
 * f->chain_cache keyed by the first member. */
1364 typedef struct ChainCacheItem {
1365 uint64_t first; /* the array at the begin of the chain */
1366 uint64_t array; /* the cached array */
1367 uint64_t begin; /* the first item in the cached array */
1368 uint64_t total; /* the total number of items in all arrays before this one in the chain */
1369 uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
/* Records a position in an entry array chain in the chain cache h.
 * Per the comment below, nothing is cached when the array to remember is
 * the first one of its chain. Once the cache holds CHAIN_CACHE_MAX items an
 * existing item is taken out with hashmap_steal_first() and reused,
 * otherwise a new item is allocated; the item is inserted keyed by its
 * first member and last_index is stored in it.
 * NOTE(review): most of the parameter list, the handling of an already
 * cached item and the remaining member assignments are not part of this
 * listing. */
1372 static void chain_cache_put(
1379 uint64_t last_index) {
1382 /* If the chain item to cache for this chain is the
1383 * first one it's not worth caching anything */
1387 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1388 ci = hashmap_steal_first(h);
1390 ci = new(ChainCacheItem, 1);
1397 if (hashmap_put(h, &ci->first, ci) < 0) {
1402 assert(ci->first == first);
1407 ci->last_index = last_index;
/* Fetches the entry stored at index i of the chain of entry arrays that
 * starts at first.
 * The chain cache is consulted first and used when i lies beyond the number
 * of items preceding the cached array. The chain is then walked array by
 * array until the one holding the index is reached, the position is put
 * into the chain cache, and the entry object at the stored offset is
 * mapped.
 * NOTE(review): parts of the parameter list, the loop structure, the return
 * values and the assignments to ret and offset are not part of this
 * listing. */
1410 static int generic_array_get(
1414 Object **ret, uint64_t *offset) {
1417 uint64_t p = 0, a, t = 0;
1425 /* Try the chain cache first */
1426 ci = hashmap_get(f->chain_cache, &first);
1427 if (ci && i > ci->total) {
1436 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1440 k = journal_file_entry_array_n_items(o);
1442 p = le64toh(o->entry_array.items[i]);
1448 a = le64toh(o->entry_array.next_entry_array_offset);
1454 /* Let's cache this item for the next invocation */
1455 chain_cache_put(f->chain_cache, ci, first, a, o->entry_array.items[0], t, i);
1457 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Variant of generic_array_get() for lists that keep one entry offset
 * inline (extra) in addition to the array chain.
 * One path maps the entry object at extra directly; the other delegates to
 * generic_array_get() with index i-1.
 * NOTE(review): parts of the parameter list and the condition selecting
 * between the two paths are not part of this listing. */
1470 static int generic_array_get_plus_one(
1475 Object **ret, uint64_t *offset) {
1484 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1497 return generic_array_get(f, first, i-1, ret, offset);
/* Bisects the chain of entry arrays starting at first for needle.
 * test_object() classifies a candidate entry offset relative to needle; a
 * TEST_FOUND result is folded into TEST_RIGHT for DIRECTION_DOWN and into
 * TEST_LEFT otherwise. The chain cache is consulted first so that the
 * search can resume at a previously visited array and near the last index
 * looked at there. Within an array the last item is tested before the
 * range is narrowed, the neighbours of the cached index are probed, and
 * the remaining range is halved until left equals right. Arrays that lie
 * entirely before the needle are passed over via next_entry_array_offset.
 * The final position is put into the chain cache, the entry object at the
 * chosen offset is mapped, and the resulting index is t + i, minus one
 * when subtract_one is set.
 * NOTE(review): parts of the parameter list, the updates of left and right,
 * the return values and the assignments to ret and offset are not part of
 * this listing - confirm against the complete source. */
1506 static int generic_array_bisect(
1511 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1512 direction_t direction,
1517 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1518 bool subtract_one = false;
1519 Object *o, *array = NULL;
1524 assert(test_object);
1526 /* Start with the first array in the chain */
1529 ci = hashmap_get(f->chain_cache, &first);
1530 if (ci && n > ci->total) {
1531 /* Ah, we have iterated this bisection array chain
1532 * previously! Let's see if we can skip ahead in the
1533 * chain, as far as the last time. But we can't jump
1534 * backwards in the chain, so let's check that
1537 r = test_object(f, ci->begin, needle);
1541 if (r == TEST_LEFT) {
1542 /* OK, what we are looking for is right of the
1543 * begin of this EntryArray, so let's jump
1544 * straight to previously cached array in the
1550 last_index = ci->last_index;
1555 uint64_t left, right, k, lp;
1557 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1561 k = journal_file_entry_array_n_items(array);
1567 lp = p = le64toh(array->entry_array.items[i]);
1571 r = test_object(f, p, needle);
1575 if (r == TEST_FOUND)
1576 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1578 if (r == TEST_RIGHT) {
1582 if (last_index != (uint64_t) -1) {
1583 assert(last_index <= right);
1585 /* If we cached the last index we
1586 * looked at, let's try to not to jump
1587 * too wildly around and see if we can
1588 * limit the range to look at early to
1589 * the immediate neighbors of the last
1590 * index we looked at. */
1592 if (last_index > 0) {
1593 uint64_t x = last_index - 1;
1595 p = le64toh(array->entry_array.items[x]);
1599 r = test_object(f, p, needle);
1603 if (r == TEST_FOUND)
1604 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1606 if (r == TEST_RIGHT)
1612 if (last_index < right) {
1613 uint64_t y = last_index + 1;
1615 p = le64toh(array->entry_array.items[y]);
1619 r = test_object(f, p, needle);
1623 if (r == TEST_FOUND)
1624 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1626 if (r == TEST_RIGHT)
1632 last_index = (uint64_t) -1;
1636 if (left == right) {
1637 if (direction == DIRECTION_UP)
1638 subtract_one = true;
1644 assert(left < right);
1645 i = (left + right) / 2;
1647 p = le64toh(array->entry_array.items[i]);
1651 r = test_object(f, p, needle);
1655 if (r == TEST_FOUND)
1656 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1658 if (r == TEST_RIGHT)
1666 if (direction == DIRECTION_UP) {
1668 subtract_one = true;
1679 last_index = (uint64_t) -1;
1680 a = le64toh(array->entry_array.next_entry_array_offset);
1686 if (subtract_one && t == 0 && i == 0)
1689 /* Let's cache this item for the next invocation */
1690 chain_cache_put(f->chain_cache, ci, first, a, array->entry_array.items[0], t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1692 if (subtract_one && i == 0)
1694 else if (subtract_one)
1695 p = le64toh(array->entry_array.items[i-1]);
1697 p = le64toh(array->entry_array.items[i]);
1699 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1710 *idx = t + i + (subtract_one ? -1 : 0);
/* Variant of generic_array_bisect() for lists that keep one entry offset
 * inline (extra) in addition to the array chain.
 * The inline entry is tested first, with TEST_FOUND folded into TEST_RIGHT
 * for DIRECTION_DOWN and into TEST_LEFT otherwise. For DIRECTION_UP the
 * inline entry is remembered as fallback (step_back) in case the array
 * chain yields nothing. The chain itself is bisected with n-1 items. One
 * path finally maps the entry object at extra.
 * NOTE(review): parts of the parameter list, the branch bodies, the return
 * values and the index adjustment for the inline entry are not part of this
 * listing. */
1716 static int generic_array_bisect_plus_one(
1722 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1723 direction_t direction,
1729 bool step_back = false;
1733 assert(test_object);
1738 /* This bisects the array in object 'first', but first checks
1740 r = test_object(f, extra, needle);
1744 if (r == TEST_FOUND)
1745 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1747 /* if we are looking with DIRECTION_UP then we need to first
1748 see if in the actual array there is a matching entry, and
1749 return the last one of that. But if there isn't any we need
1750 to return this one. Hence remember this, and return it
1753 step_back = direction == DIRECTION_UP;
1755 if (r == TEST_RIGHT) {
1756 if (direction == DIRECTION_DOWN)
1762 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1764 if (r == 0 && step_back)
1773 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
/* Bisection test callback that compares the entry offset p itself with
 * needle; it does not touch the file.
 * NOTE(review): the returned TEST_ values are not part of this listing. */
1789 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1795 else if (p < needle)
/* Seeks to an entry by file offset: bisects the global entry array chain of
 * the header (entry_array_offset, n_entries).
 * NOTE(review): parts of the parameter list and the remaining arguments of
 * the generic_array_bisect() call are not part of this listing. */
1801 int journal_file_move_to_entry_by_offset(
1804 direction_t direction,
1808 return generic_array_bisect(f,
1809 le64toh(f->header->entry_array_offset),
1810 le64toh(f->header->n_entries),
/* Bisection test callback that maps the entry object at p and compares its
 * sequence number with needle.
 * NOTE(review): the returned TEST_ values and the error path are not part
 * of this listing. */
1818 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1825 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1829 if (le64toh(o->entry.seqnum) == needle)
1831 else if (le64toh(o->entry.seqnum) < needle)
/* Seeks to an entry by sequence number: bisects the global entry array
 * chain of the header (entry_array_offset, n_entries).
 * NOTE(review): parts of the parameter list and the remaining arguments of
 * the generic_array_bisect() call are not part of this listing. */
1837 int journal_file_move_to_entry_by_seqnum(
1840 direction_t direction,
1844 return generic_array_bisect(f,
1845 le64toh(f->header->entry_array_offset),
1846 le64toh(f->header->n_entries),
/* Bisection test callback that maps the entry object at p and compares its
 * realtime timestamp with needle.
 * NOTE(review): the returned TEST_ values and the error path are not part
 * of this listing. */
1853 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1860 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1864 if (le64toh(o->entry.realtime) == needle)
1866 else if (le64toh(o->entry.realtime) < needle)
/* Seeks to an entry by realtime timestamp: bisects the global entry array
 * chain of the header (entry_array_offset, n_entries) with
 * test_object_realtime() as test callback.
 * NOTE(review): parts of the parameter list and the remaining arguments of
 * the generic_array_bisect() call are not part of this listing. */
1872 int journal_file_move_to_entry_by_realtime(
1875 direction_t direction,
1879 return generic_array_bisect(f,
1880 le64toh(f->header->entry_array_offset),
1881 le64toh(f->header->n_entries),
1883 test_object_realtime,
/* Bisection test callback that maps the entry object at p and compares its
 * monotonic timestamp with needle.
 * NOTE(review): the returned TEST_ values and the error path are not part
 * of this listing. */
1888 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1895 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1899 if (le64toh(o->entry.monotonic) == needle)
1901 else if (le64toh(o->entry.monotonic) < needle)
/* Looks up the data object of the _BOOT_ID= field for the given boot ID.
 * The lookup key is built in a stack buffer: the 9 byte prefix _BOOT_ID=
 * followed by the ID formatted by sd_id128_to_string(); the buffer reserves
 * 32 bytes plus a terminator for it. The key is passed without the
 * terminator to journal_file_find_data_object().
 * NOTE(review): the parameter list is not part of this listing. */
1907 static inline int find_data_object_by_boot_id(
1912 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1914 sd_id128_to_string(boot_id, t + 9);
1915 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
/* Seeks to an entry by monotonic timestamp within one boot.
 * The data object of the boot ID is looked up first; its entry list
 * (entry_offset, entry_array_offset, n_entries) is then bisected with
 * test_object_monotonic() as test callback.
 * NOTE(review): parts of the parameter list, the handling of an unknown
 * boot ID and the remaining arguments of the bisect call are not part of
 * this listing. */
1918 int journal_file_move_to_entry_by_monotonic(
1922 direction_t direction,
1931 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1937 return generic_array_bisect_plus_one(f,
1938 le64toh(o->data.entry_offset),
1939 le64toh(o->data.entry_array_offset),
1940 le64toh(o->data.n_entries),
1942 test_object_monotonic,
/* Moves to the entry following or preceding the entry o at offset p in the
 * global entry array chain.
 * p must be non-zero whenever o is given. One path picks index 0 for
 * DIRECTION_DOWN and n - 1 otherwise, n being the number of entries in the
 * header. The other path requires o to be of type OBJECT_ENTRY, locates it
 * by bisection and then adjusts the index according to direction. The entry
 * at the resulting index is fetched with generic_array_get().
 * NOTE(review): the condition selecting between the two paths, the index
 * adjustment and the boundary handling are not part of this listing. */
1947 int journal_file_next_entry(
1949 Object *o, uint64_t p,
1950 direction_t direction,
1951 Object **ret, uint64_t *offset) {
1957 assert(p > 0 || !o);
1959 n = le64toh(f->header->n_entries);
1964 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1966 if (o->object.type != OBJECT_ENTRY)
1969 r = generic_array_bisect(f,
1970 le64toh(f->header->entry_array_offset),
1971 le64toh(f->header->n_entries),
1980 if (direction == DIRECTION_DOWN) {
1993 /* And jump to it */
1994 return generic_array_get(f,
1995 le64toh(f->header->entry_array_offset),
/* Moves a signed number of entries (skip) away from the entry o (at file
 * offset p) within the file-wide entry array, and looks up the resulting
 * index through generic_array_get(). ret and offset are presumably the
 * output parameters for the object found and its file offset -- confirm. */
2000 int journal_file_skip_entry(
2002 Object *o, uint64_t p,
2004 Object **ret, uint64_t *offset) {
/* The current position must refer to an entry object */
2013 if (o->object.type != OBJECT_ENTRY)
/* Determine the index of the current entry within the global entry array */
2016 r = generic_array_bisect(f,
2017 le64toh(f->header->entry_array_offset),
2018 le64toh(f->header->n_entries),
2027 /* Calculate new index */
/* Backwards skip: compare the magnitude of skip with i first, so that the
 * unsigned subtraction below cannot wrap around */
2029 if ((uint64_t) -skip >= i)
2032 i = i - (uint64_t) -skip;
/* Forwards skip */
2034 i += (uint64_t) skip;
/* Number of entries in the file, read after the new index was computed */
2036 n = le64toh(f->header->n_entries);
2043 return generic_array_get(f,
2044 le64toh(f->header->entry_array_offset),
/* Like journal_file_next_entry(), but walks only the entries referenced by
 * the data object at data_offset instead of the file-wide entry array. The
 * target is looked up by index through generic_array_get_plus_one(). */
2049 int journal_file_next_entry_for_data(
2051 Object *o, uint64_t p,
2052 uint64_t data_offset,
2053 direction_t direction,
2054 Object **ret, uint64_t *offset) {
/* A current object, if given, must come with a non-zero offset */
2061 assert(p > 0 || !o);
/* Load the data object whose entry list is to be walked */
2063 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2067 n = le64toh(d->data.n_entries);
/* Start index when there is no current position (presumably the !o case):
 * the first entry when going down, the last one otherwise */
2072 i = direction == DIRECTION_DOWN ? 0 : n - 1;
/* The current position must refer to an entry object */
2074 if (o->object.type != OBJECT_ENTRY)
/* Determine the index of the current entry within the data object's entries */
2077 r = generic_array_bisect_plus_one(f,
2078 le64toh(d->data.entry_offset),
2079 le64toh(d->data.entry_array_offset),
2080 le64toh(d->data.n_entries),
/* Step the index according to the direction (the stepping code follows) */
2090 if (direction == DIRECTION_DOWN) {
/* Jump to the entry at the computed index */
2104 return generic_array_get_plus_one(f,
2105 le64toh(d->data.entry_offset),
2106 le64toh(d->data.entry_array_offset),
/* Seeks among the entries referenced by the data object at data_offset.
 * Going by the function name the search key is a file offset -- the key
 * argument itself is passed to generic_array_bisect_plus_one(), whose result
 * is returned unchanged. */
2111 int journal_file_move_to_entry_by_offset_for_data(
2113 uint64_t data_offset,
2115 direction_t direction,
2116 Object **ret, uint64_t *offset) {
/* Load the data object whose entry list is to be searched */
2123 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2127 return generic_array_bisect_plus_one(f,
2128 le64toh(d->data.entry_offset),
2129 le64toh(d->data.entry_array_offset),
2130 le64toh(d->data.n_entries),
/* Seeks by monotonic timestamp to an entry that is referenced both by the
 * "_BOOT_ID=" data object of boot_id and by the data object at data_offset.
 * Two entry lists are involved: the first bisection runs by time over the
 * boot ID's list, then the two lists are bisected in turn until both agree
 * on an entry. */
2137 int journal_file_move_to_entry_by_monotonic_for_data(
2139 uint64_t data_offset,
2142 direction_t direction,
2143 Object **ret, uint64_t *offset) {
2151 /* First, seek by time */
/* b receives the file offset of the boot ID data object, so that it can be
 * loaded again further down */
2152 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2158 r = generic_array_bisect_plus_one(f,
2159 le64toh(o->data.entry_offset),
2160 le64toh(o->data.entry_array_offset),
2161 le64toh(o->data.n_entries),
2163 test_object_monotonic,
2169 /* And now, continue seeking until we find an entry that
2170 * exists in both bisection arrays */
/* Search the entry list of the data object at data_offset */
2176 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2180 r = generic_array_bisect_plus_one(f,
2181 le64toh(d->data.entry_offset),
2182 le64toh(d->data.entry_array_offset),
2183 le64toh(d->data.n_entries),
/* Load the boot ID data object again by its saved offset b (presumably the
 * earlier pointer may no longer be valid after other lookups -- confirm),
 * then search its entry list */
2191 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2195 r = generic_array_bisect_plus_one(f,
2196 le64toh(o->data.entry_offset),
2197 le64toh(o->data.entry_array_offset),
2198 le64toh(o->data.n_entries),
/* Seeks among the entries referenced by the data object at data_offset.
 * Going by the function name the search key is a sequence number. The
 * result of generic_array_bisect_plus_one() is returned unchanged. */
2222 int journal_file_move_to_entry_by_seqnum_for_data(
2224 uint64_t data_offset,
2226 direction_t direction,
2227 Object **ret, uint64_t *offset) {
/* Load the data object whose entry list is to be searched */
2234 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2238 return generic_array_bisect_plus_one(f,
2239 le64toh(d->data.entry_offset),
2240 le64toh(d->data.entry_array_offset),
2241 le64toh(d->data.n_entries),
/* Seeks by realtime timestamp among the entries referenced by the data
 * object at data_offset, using test_object_realtime() as comparison
 * callback. The result of generic_array_bisect_plus_one() is returned
 * unchanged. */
2248 int journal_file_move_to_entry_by_realtime_for_data(
2250 uint64_t data_offset,
2252 direction_t direction,
2253 Object **ret, uint64_t *offset) {
/* Load the data object whose entry list is to be searched */
2260 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2264 return generic_array_bisect_plus_one(f,
2265 le64toh(d->data.entry_offset),
2266 le64toh(d->data.entry_array_offset),
2267 le64toh(d->data.n_entries),
2269 test_object_realtime,
/* Debugging aid: prints the file header followed by one line per object to
 * stdout. Objects are visited in file order, starting right behind the
 * header and stopping at the object located at header->tail_object_offset.
 * "File corrupt" is logged as an error (presumably when an object cannot be
 * loaded -- confirm). */
2274 void journal_file_dump(JournalFile *f) {
2281 journal_file_print_header(f);
/* The first object starts where the header ends */
2283 p = le64toh(f->header->header_size);
/* -1 as the type argument: presumably "accept any object type" -- confirm */
2285 r = journal_file_move_to_object(f, -1, p, &o);
2289 switch (o->object.type) {
2292 printf("Type: OBJECT_UNUSED\n");
2296 printf("Type: OBJECT_DATA\n");
2300 printf("Type: OBJECT_FIELD\n");
2304 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2305 le64toh(o->entry.seqnum),
2306 le64toh(o->entry.monotonic),
2307 le64toh(o->entry.realtime));
2310 case OBJECT_FIELD_HASH_TABLE:
2311 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2314 case OBJECT_DATA_HASH_TABLE:
2315 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2318 case OBJECT_ENTRY_ARRAY:
2319 printf("Type: OBJECT_ENTRY_ARRAY\n");
2323 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2324 le64toh(o->tag.seqnum),
2325 le64toh(o->tag.epoch));
/* Any other type value is reported numerically */
2329 printf("Type: unknown (%u)\n", o->object.type);
2333 if (o->object.flags & OBJECT_COMPRESSED)
2334 printf("Flags: COMPRESSED\n");
/* The tail object is the last one in the file */
2336 if (p == le64toh(f->header->tail_object_offset))
/* Advance to the next object: object sizes are rounded up with ALIGN64() */
2339 p = p + ALIGN64(le64toh(o->object.size));
2344 log_error("File corrupt");
/* Formats the timestamp t into buf (of size l) by means of
 * format_timestamp(). Presumably substitutes a placeholder string when
 * format_timestamp() fails, so that the result can always be handed to
 * printf("%s") -- confirm against the full function body. */
2347 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2350 x = format_timestamp(buf, l, t);
/* Prints a human-readable summary of the journal file header to stdout:
 * IDs, state, flags, sizes, sequence number and timestamp ranges and object
 * counters, followed by the disk usage of the file. Counters that were added
 * to the header format later on are printed only if the header of this file
 * actually contains them. */
2356 void journal_file_print_header(JournalFile *f) {
/* Buffers for the four 128-bit IDs in string form */
2357 char a[33], b[33], c[33], d[33];
/* NOTE(review): z is sized with FORMAT_TIMESTAMP_MAX but is filled by
 * format_timespan() below -- confirm that this size is adequate */
2358 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2360 char bytes[FORMAT_BYTES_MAX];
2364 printf("File Path: %s\n"
2368 "Sequential Number ID: %s\n"
2370 "Compatible Flags:%s%s\n"
2371 "Incompatible Flags:%s%s\n"
2372 "Header size: %"PRIu64"\n"
2373 "Arena size: %"PRIu64"\n"
2374 "Data Hash Table Size: %"PRIu64"\n"
2375 "Field Hash Table Size: %"PRIu64"\n"
2376 "Rotate Suggested: %s\n"
2377 "Head Sequential Number: %"PRIu64"\n"
2378 "Tail Sequential Number: %"PRIu64"\n"
2379 "Head Realtime Timestamp: %s\n"
2380 "Tail Realtime Timestamp: %s\n"
2381 "Tail Monotonic Timestamp: %s\n"
2382 "Objects: %"PRIu64"\n"
2383 "Entry Objects: %"PRIu64"\n",
2385 sd_id128_to_string(f->header->file_id, a),
2386 sd_id128_to_string(f->header->machine_id, b),
2387 sd_id128_to_string(f->header->boot_id, c),
2388 sd_id128_to_string(f->header->seqnum_id, d),
2389 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2390 f->header->state == STATE_ONLINE ? "ONLINE" :
2391 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2392 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
/* Any flag bit other than the known one is shown as " ???" */
2393 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2394 JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2395 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2396 le64toh(f->header->header_size),
2397 le64toh(f->header->arena_size),
/* The header stores the hash table sizes in bytes; print them as item counts */
2398 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2399 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2400 yes_no(journal_file_rotate_suggested(f, 0)),
2401 le64toh(f->header->head_entry_seqnum),
2402 le64toh(f->header->tail_entry_seqnum),
2403 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2404 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2405 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2406 le64toh(f->header->n_objects),
2407 le64toh(f->header->n_entries));
/* Fill level: number of data objects relative to the number of hash table items */
2409 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2410 printf("Data Objects: %"PRIu64"\n"
2411 "Data Hash Table Fill: %.1f%%\n",
2412 le64toh(f->header->n_data),
2413 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
/* Same computation for the field hash table */
2415 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2416 printf("Field Objects: %"PRIu64"\n"
2417 "Field Hash Table Fill: %.1f%%\n",
2418 le64toh(f->header->n_fields),
2419 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2421 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2422 printf("Tag Objects: %"PRIu64"\n",
2423 le64toh(f->header->n_tags));
2424 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2425 printf("Entry Array Objects: %"PRIu64"\n",
2426 le64toh(f->header->n_entry_arrays));
/* st_blocks counts 512-byte units, hence the multiplication. The line is
 * simply left out if fstat() fails. */
2428 if (fstat(f->fd, &st) >= 0)
2429 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
/* Opens the journal file fname, creating and initializing it if it turns out
 * to be empty and was opened writable.
 *
 * Only read-only and read-write access modes are accepted, and fname has to
 * end in ".journal" or ".journal~". metrics, mmap_cache and template are
 * optional inputs: a supplied mmap cache is referenced, otherwise a new one
 * is created; template is handed to journal_file_init_header() and may
 * provide the metrics. ret is presumably where the new JournalFile object is
 * stored on success -- confirm. Once the object exists, error paths release
 * it through journal_file_close(). */
2432 int journal_file_open(
2438 JournalMetrics *metrics,
2439 MMapCache *mmap_cache,
2440 JournalFile *template,
2441 JournalFile **ret) {
2445 bool newly_created = false;
/* Write-only access is not supported */
2450 if ((flags & O_ACCMODE) != O_RDONLY &&
2451 (flags & O_ACCMODE) != O_RDWR)
/* Only files with a journal suffix are accepted */
2454 if (!endswith(fname, ".journal") &&
2455 !endswith(fname, ".journal~"))
2458 f = new0(JournalFile, 1);
/* Memory protection for mappings follows from the open flags */
2466 f->prot = prot_from_flags(flags);
2467 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2469 f->compress = compress;
/* Share the caller's mmap cache if one was passed in ... */
2476 f->mmap = mmap_cache_ref(mmap_cache);
/* ... and create a private one otherwise */
2478 f->mmap = mmap_cache_new();
2485 f->path = strdup(fname);
/* Cache keyed by 64-bit values */
2491 f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2492 if (!f->chain_cache) {
/* The descriptor is always opened close-on-exec */
2497 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2503 if (fstat(f->fd, &f->last_stat) < 0) {
/* An empty file opened for writing is treated as a new journal file that
 * still has to be initialized */
2508 if (f->last_stat.st_size == 0 && f->writable) {
2512 /* Let's attach the creation time to the journal file,
2513 * so that the vacuuming code knows the age of this
2514 * file even if the file might end up corrupted one
2515 * day... Ideally we'd just use the creation time many
2516 * file systems maintain for each file, but there is
2517 * currently no usable API to query this, hence let's
2518 * emulate this via extended attributes. If extended
2519 * attributes are not supported we'll just skip this,
2520 * and rely solely on mtime/atime/ctime of the file.*/
/* The timestamp is stored little-endian. The result of fsetxattr() is
 * deliberately not checked, see above. XATTR_CREATE makes the call fail
 * rather than replace an attribute that already exists. */
2522 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2523 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2527 /* Try to load the FSPRG state, and if we can't, then
2528 * just don't do sealing */
2530 r = journal_file_fss_load(f);
/* Write the initial header, possibly based on the template file */
2536 r = journal_file_init_header(f, template);
/* Refresh the cached stat data now that the file has content */
2540 if (fstat(f->fd, &f->last_stat) < 0) {
2545 newly_created = true;
/* Too small to contain even the minimal header */
2548 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
/* Map the header as a shared mapping at file offset 0 */
2553 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2554 if (f->header == MAP_FAILED) {
/* Headers of pre-existing files are validated before use */
2560 if (!newly_created) {
2561 r = journal_file_verify_header(f);
/* For pre-existing writable files the FSPRG state is loaded at this point */
2567 if (!newly_created && f->writable) {
2568 r = journal_file_fss_load(f);
/* Metrics: caller-supplied ones are completed by journal_default_metrics()
 * and copied; failing that, the template's metrics are used */
2576 journal_default_metrics(metrics, f->fd);
2577 f->metrics = *metrics;
2578 } else if (template)
2579 f->metrics = template->metrics;
2581 r = journal_file_refresh_header(f);
2587 r = journal_file_hmac_setup(f);
/* A new file additionally needs its hash tables and first tag */
2592 if (newly_created) {
2593 r = journal_file_setup_field_hash_table(f);
2597 r = journal_file_setup_data_hash_table(f);
2602 r = journal_file_append_first_tag(f);
/* Map both hash tables */
2608 r = journal_file_map_field_hash_table(f);
2612 r = journal_file_map_data_hash_table(f);
/* Error path: release everything acquired so far */
2620 journal_file_close(f);
/* Rotates the journal file *f: the current file is renamed to an archive
 * name, marked as archived, and a new file is opened at the original path
 * with the old file serving as template. The old file object is closed
 * afterwards; *f is presumably updated to the new file -- confirm. */
2625 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
/* p holds the archive file name and is freed automatically on return */
2626 _cleanup_free_ char *p = NULL;
2628 JournalFile *old_file, *new_file = NULL;
/* Only writable files can be rotated */
2636 if (!old_file->writable)
/* Only files carrying the plain ".journal" suffix can be rotated */
2639 if (!endswith(old_file->path, ".journal"))
/* Archive name: the original path with ".journal" (8 characters) cut off,
 * followed by "@<seqnum id>-<head seqnum>-<head realtime>.journal" */
2642 l = strlen(old_file->path);
2643 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2644 (int) l - 8, old_file->path,
2645 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2646 le64toh((*f)->header->head_entry_seqnum),
2647 le64toh((*f)->header->head_entry_realtime));
2651 r = rename(old_file->path, p);
/* Written through the header mapping of the (now renamed) old file */
2655 old_file->header->state = STATE_ARCHIVED;
/* old_file->path still holds the original name, which is where the new file
 * is created. The old file's flags, mode and mmap cache are reused; no
 * explicit metrics are passed (NULL), the old file is passed as template. */
2657 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
/* The old file is closed regardless of the result of the open above */
2658 journal_file_close(old_file);
/* Like journal_file_open(), but if the open fails with one of the error
 * codes listed below, and the caller asked for a writable, creatable
 * ".journal" file, the existing file is renamed out of the way (to a
 * ".journal~" name) and the open is attempted a second time. */
2664 int journal_file_open_reliably(
2670 JournalMetrics *metrics,
2671 MMapCache *mmap_cache,
2672 JournalFile *template,
2673 JournalFile **ret) {
/* p holds the rename target and is freed automatically on return */
2677 _cleanup_free_ char *p = NULL;
2679 r = journal_file_open(fname, flags, mode, compress, seal,
2680 metrics, mmap_cache, template, ret);
/* Any result other than these error codes (success included) is not
 * handled by the rename-and-retry logic below */
2681 if (r != -EBADMSG && /* corrupted */
2682 r != -ENODATA && /* truncated */
2683 r != -EHOSTDOWN && /* other machine */
2684 r != -EPROTONOSUPPORT && /* incompatible feature */
2685 r != -EBUSY && /* unclean shutdown */
2686 r != -ESHUTDOWN /* already archived */)
/* Read-only opens are excluded from the replacement logic */
2689 if ((flags & O_ACCMODE) == O_RDONLY)
/* So are opens that are not allowed to create the file */
2692 if (!(flags & O_CREAT))
/* And so are files without the plain ".journal" suffix */
2695 if (!endswith(fname, ".journal"))
2698 /* The file is corrupted. Rotate it away and try it again (but only once) */
/* The new name ends in ".journal~" and contains the current realtime clock
 * value in hex as its first component */
2701 if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2703 (unsigned long long) now(CLOCK_REALTIME),
2707 r = rename(fname, p);
2711 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
/* Second and final attempt; its result is returned unchanged */
2713 return journal_file_open(fname, flags, mode, compress, seal,
2714 metrics, mmap_cache, template, ret);
/* Copies the entry object o, located at offset p in the file "from", into
 * the file "to": every data object the entry refers to is appended to "to"
 * (uncompressed first, if necessary), and finally a new entry is appended
 * that carries the original timestamps and points to the copied data
 * objects. seqnum, ret and offset are passed through to
 * journal_file_append_entry_internal(), whose result is returned. */
2717 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
/* xor_hash accumulates the hashes of all copied data objects */
2719 uint64_t q, xor_hash = 0;
/* Keep the timestamps of the original entry */
2732 ts.monotonic = le64toh(o->entry.monotonic);
2733 ts.realtime = le64toh(o->entry.realtime);
/* NOTE(review): the size of the stack allocation below is derived from the
 * item count of the entry object, i.e. from file content */
2735 n = journal_file_entry_n_items(o);
2736 /* alloca() can't take 0, hence let's allocate at least one */
2737 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2739 for (i = 0; i < n; i++) {
/* Remember offset and hash of the item first: o is pointed at the data
 * object right below */
2746 q = le64toh(o->entry.items[i].object_offset);
2747 le_hash = o->entry.items[i].hash;
2749 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
/* The hash stored in the entry item must match the one in the data object
 * (both are compared in their stored little-endian form) */
2753 if (le_hash != o->data.hash)
/* Payload length: object size minus everything in front of the payload */
2756 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2759 /* We hit the limit on 32bit machines */
2760 if ((uint64_t) t != l)
2763 if (o->object.flags & OBJECT_COMPRESSED) {
/* Decompress into the scratch buffer kept in the source file object */
2767 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0))
2770 data = from->compress_buffer;
/* Presumably the branch taken when compression support is not compiled in
 * -- confirm */
2773 return -EPROTONOSUPPORT;
/* Uncompressed payloads are used in place */
2776 data = o->data.payload;
/* u receives the data object in "to", h its offset there */
2778 r = journal_file_append_data(to, data, l, &u, &h);
2782 xor_hash ^= le64toh(u->data.hash);
2783 items[i].object_offset = htole64(h);
2784 items[i].hash = u->data.hash;
/* o was repurposed for the data object above; point it back at the source
 * entry before the next iteration reads o->entry.items[] */
2786 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2791 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
/* Completes and sanitizes the size limits in *m. Fields set to
 * (uint64_t) -1 count as "unset" and get a default, derived from the size of
 * the file system that fd lives on where that is known. Explicitly
 * configured values are page-aligned and raised to the minimum values. The
 * resulting limits are logged at debug level. */
2794 void journal_default_metrics(JournalMetrics *m, int fd) {
/* Stays 0 if the file system size cannot be determined */
2795 uint64_t fs_size = 0;
2797 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
/* Total file system size in bytes: fragment size times number of blocks */
2802 if (fstatvfs(fd, &ss) >= 0)
2803 fs_size = ss.f_frsize * ss.f_blocks;
/* max_use unset: derive it from the file system size, clamped to the range
 * DEFAULT_MAX_USE_LOWER..DEFAULT_MAX_USE_UPPER */
2805 if (m->max_use == (uint64_t) -1) {
2808 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2810 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2811 m->max_use = DEFAULT_MAX_USE_UPPER;
2813 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2814 m->max_use = DEFAULT_MAX_USE_LOWER;
/* Fallback (presumably for an unknown file system size -- confirm) */
2816 m->max_use = DEFAULT_MAX_USE_LOWER;
/* max_use configured explicitly: align it and make sure at least two files
 * of minimum size fit */
2818 m->max_use = PAGE_ALIGN(m->max_use);
2820 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2821 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
/* max_size unset: an eighth of max_use, capped at DEFAULT_MAX_SIZE_UPPER */
2824 if (m->max_size == (uint64_t) -1) {
2825 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2827 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2828 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2830 m->max_size = PAGE_ALIGN(m->max_size);
2832 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2833 m->max_size = JOURNAL_FILE_SIZE_MIN;
/* max_use is raised so that at least two files of max_size fit */
2835 if (m->max_size*2 > m->max_use)
2836 m->max_use = m->max_size*2;
/* min_size unset: use the minimum journal file size */
2838 if (m->min_size == (uint64_t) -1)
2839 m->min_size = JOURNAL_FILE_SIZE_MIN;
2841 m->min_size = PAGE_ALIGN(m->min_size);
2843 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2844 m->min_size = JOURNAL_FILE_SIZE_MIN;
/* Keep the pair consistent: max_size is raised to min_size if necessary */
2846 if (m->min_size > m->max_size)
2847 m->max_size = m->min_size;
/* keep_free unset: derive it from the file system size, capped at
 * DEFAULT_KEEP_FREE_UPPER */
2850 if (m->keep_free == (uint64_t) -1) {
2853 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2855 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2856 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
/* Fallback (presumably for an unknown file system size -- confirm) */
2859 m->keep_free = DEFAULT_KEEP_FREE;
2862 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2863 format_bytes(a, sizeof(a), m->max_use),
2864 format_bytes(b, sizeof(b), m->max_size),
2865 format_bytes(c, sizeof(c), m->min_size),
2866 format_bytes(d, sizeof(d), m->keep_free));
/* Reports the realtime timestamp range covered by the file, as recorded in
 * the header: *from is taken from head_entry_realtime, *to from
 * tail_entry_realtime. A stored value of 0 is handled separately in both
 * cases (presumably meaning "no entries yet" -- confirm). */
2869 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
/* No byte order conversion is needed for a comparison against 0 */
2874 if (f->header->head_entry_realtime == 0)
2877 *from = le64toh(f->header->head_entry_realtime);
2881 if (f->header->tail_entry_realtime == 0)
2884 *to = le64toh(f->header->tail_entry_realtime);
/* Reports the monotonic timestamp range the file covers for the boot
 * boot_id. The range is read from the entries referenced by the boot's
 * "_BOOT_ID=" data object: *from from the entry at entry_offset, *to from
 * the entry at the last index. */
2890 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
/* p receives the file offset of the data object, needed again below */
2898 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
/* Data object without any entries */
2902 if (le64toh(o->data.n_entries) <= 0)
/* Load the entry at entry_offset; from here on o refers to that entry */
2906 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2910 *from = le64toh(o->entry.monotonic);
/* o was repurposed above, hence load the data object again by its offset */
2914 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* Fetch the entry at the last index, n_entries - 1 */
2918 r = generic_array_get_plus_one(f,
2919 le64toh(o->data.entry_offset),
2920 le64toh(o->data.entry_array_offset),
2921 le64toh(o->data.n_entries)-1,
2926 *to = le64toh(o->entry.monotonic);
2932 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2935 /* If we gained new header fields we gained new features,
2936 * hence suggest a rotation */
2937 if (le64toh(f->header->header_size) < sizeof(Header)) {
2938 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2942 /* Let's check if the hash tables grew over a certain fill
2943 * level (75%, borrowing this value from Java's hash table
2944 * implementation), and if so suggest a rotation. To calculate
2945 * the fill level we need the n_data field, which only exists
2946 * in newer versions. */
2948 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2949 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2950 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2952 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2953 le64toh(f->header->n_data),
2954 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2955 (unsigned long long) f->last_stat.st_size,
2956 f->last_stat.st_size / le64toh(f->header->n_data));
2960 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2961 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2962 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2964 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2965 le64toh(f->header->n_fields),
2966 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2970 /* Are the data objects properly indexed by field objects? */
2971 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2972 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2973 le64toh(f->header->n_data) > 0 &&
2974 le64toh(f->header->n_fields) == 0)
2977 if (max_file_usec > 0) {
2980 h = le64toh(f->header->head_entry_realtime);
2981 t = now(CLOCK_REALTIME);
2983 if (h > 0 && t > h + max_file_usec)