1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
31 #include <attr/xattr.h>
34 #include "journal-def.h"
35 #include "journal-file.h"
36 #include "journal-authenticate.h"
/* Smallest data hash table we ever set up, in bytes: room for 2047 HashItem slots */
41 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
/* Size of the field hash table, in bytes: room for 333 HashItem slots */
42 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
/* Payload size in bytes from which journal_file_append_data() tries to compress */
44 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
46 /* This is the minimum journal file size */
47 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
49 /* These are the lower and upper bounds if we deduce the max_use value
50 * from the file system size */
51 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
52 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
54 /* This is the upper bound if we deduce max_size from max_use */
55 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
57 /* This is the upper bound if we deduce the keep_free value from the file system size */
59 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
61 /* This is the keep_free value when we can't determine the file system size */
63 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MiB */
65 /* n_data was the first field added to the header after the initial file format design, so every header must reach at least up to it */
66 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
68 /* How many entries to keep in the entry array chain cache at max */
69 #define CHAIN_CACHE_MAX 20
71 /* How much to increase the journal file size at once each time we allocate something new. */
72 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8 MiB */
/* Switches the state recorded in the journal header to STATE_ONLINE. */
74 static int journal_file_set_online(JournalFile *f) {
/* Needs both an open file descriptor and a header pointer */
80 if (!(f->fd >= 0 && f->header))
/* What happens next depends on the state currently stored in the header */
83 switch(f->header->state) {
88 f->header->state = STATE_ONLINE;
/* Switches the state recorded in the journal header to STATE_OFFLINE. */
97 int journal_file_set_offline(JournalFile *f) {
/* Needs both an open file descriptor and a header pointer */
103 if (!(f->fd >= 0 && f->header))
/* Checks whether the header is currently marked online before changing it */
106 if (f->header->state != STATE_ONLINE)
111 f->header->state = STATE_OFFLINE;
/* Shuts down a JournalFile: appends a final tag when sealing a writable
 * file, marks the header offline, and then releases the header mapping,
 * the file descriptor, the mmap cache reference, the chain cache, the
 * compression buffer and the sealing/HMAC state. */
118 void journal_file_close(JournalFile *f) {
122 /* Write the final tag */
123 if (f->seal && f->writable)
124 journal_file_append_tag(f);
127 /* Sync everything to disk, before we mark the file offline */
128 if (f->mmap && f->fd >= 0)
129 mmap_cache_close_fd(f->mmap, f->fd);
131 journal_file_set_offline(f);
/* The header mapping covers sizeof(Header) rounded up to whole pages */
134 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
137 close_nointr_nofail(f->fd);
142 mmap_cache_unref(f->mmap);
/* Frees the chain cache hashmap via hashmap_free_free() */
144 hashmap_free_free(f->chain_cache);
147 free(f->compress_buffer);
/* Sealing state is either unmapped (fss_file) or freed (fsprg_state) */
152 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
153 else if (f->fsprg_state)
154 free(f->fsprg_state);
159 gcry_md_close(f->hmac);
/* Fills in a fresh Header (signature, size, feature flags, random file
 * ID, sequence number ID) and writes it to offset 0 of the file. */
165 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
173 memcpy(h.signature, HEADER_SIGNATURE, 8);
174 h.header_size = htole64(ALIGN64(sizeof(h)));
/* Compression is advertised as an incompatible feature flag */
176 h.incompatible_flags =
177 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
/* Sealing is advertised through HEADER_COMPATIBLE_SEALED */
180 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
182 r = sd_id128_randomize(&h.file_id);
/* Either continue the sequence numbering of the template's header ... */
187 h.seqnum_id = template->header->seqnum_id;
188 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
/* ... or start a new sequence identified by this file's own ID */
190 h.seqnum_id = h.file_id;
/* The header always sits at the very beginning of the file */
192 k = pwrite(f->fd, &h, sizeof(h), 0);
/* Updates the machine ID and boot ID stored in the header and marks the
 * file online. */
202 static int journal_file_refresh_header(JournalFile *f) {
208 r = sd_id128_get_machine(&f->header->machine_id);
212 r = sd_id128_get_boot(&boot_id);
/* If the header already carries the current boot ID, the tail monotonic
 * timestamp is flagged as valid */
216 if (sd_id128_equal(boot_id, f->header->boot_id))
217 f->tail_entry_monotonic_valid = true;
219 f->header->boot_id = boot_id;
221 journal_file_set_online(f);
223 /* Sync the online state to disk */
/* Sanity-checks the mapped header: signature, feature flags, state,
 * header/arena sizes against the file size, and object offsets. It also
 * compares the machine ID and inspects the state, then copies the
 * compression and sealing flags into the JournalFile.
 *
 * NOTE(review): each flag check below appears twice; these look like
 * build-time alternatives whose preprocessor lines are not visible
 * here. */
229 static int journal_file_verify_header(JournalFile *f) {
232 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
235 /* In both read and write mode we refuse to open files with
236 * incompatible flags we don't know */
238 if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
239 return -EPROTONOSUPPORT;
241 if (f->header->incompatible_flags != 0)
242 return -EPROTONOSUPPORT;
245 /* When open for writing we refuse to open files with
246 * compatible flags we don't know, too */
249 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
250 return -EPROTONOSUPPORT;
252 if (f->header->compatible_flags != 0)
253 return -EPROTONOSUPPORT;
257 if (f->header->state >= _STATE_MAX)
260 /* The first addition was n_data, so check that we are at least this large */
261 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
/* A sealed file must have a header that includes n_entry_arrays */
264 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
/* Header plus arena must fit inside the file as last stat'ed */
267 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
/* The tail object must lie within header plus arena */
270 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
/* Every offset stored in the header must pass VALID64() ... */
273 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
274 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
275 !VALID64(le64toh(f->header->tail_object_offset)) ||
276 !VALID64(le64toh(f->header->entry_array_offset)))
/* ... and is compared against the header size */
279 if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
280 le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
281 le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
282 le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
287 sd_id128_t machine_id;
/* Compare the local machine ID with the one recorded in the header */
290 r = sd_id128_get_machine(&machine_id);
294 if (!sd_id128_equal(machine_id, f->header->machine_id))
297 state = f->header->state;
299 if (state == STATE_ONLINE) {
300 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
302 } else if (state == STATE_ARCHIVED)
304 else if (state != STATE_OFFLINE) {
305 log_debug("Journal file %s has unknown state %u.", f->path, state);
/* Remember which optional features this file uses */
310 f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
312 f->seal = JOURNAL_HEADER_SEALED(f->header);
/* Makes sure the file is large enough to hold 'size' bytes at 'offset'.
 * The required size is page-aligned and never smaller than the header.
 * It is checked against metrics.max_size and, via fstatvfs(), against
 * the free space left after honouring metrics.keep_free. It is then
 * rounded up to a multiple of FILE_SIZE_INCREASE (capped at max_size)
 * before posix_fallocate() extends the file. Afterwards the cached
 * stat data and the header's arena_size are refreshed.
 *
 * NOTE(review): free space is computed as f_bfree * f_bsize; POSIX
 * describes the block counts in units of f_frsize -- confirm this is
 * intended. */
317 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
318 uint64_t old_size, new_size;
323 /* We assume that this file is not sparse, and we know that
324 * for sure, since we always call posix_fallocate()
328 le64toh(f->header->header_size) +
329 le64toh(f->header->arena_size);
331 new_size = PAGE_ALIGN(offset + size);
332 if (new_size < le64toh(f->header->header_size))
333 new_size = le64toh(f->header->header_size);
335 if (new_size <= old_size)
338 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
341 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
344 if (fstatvfs(f->fd, &svfs) >= 0) {
347 available = svfs.f_bfree * svfs.f_bsize;
349 if (available >= f->metrics.keep_free)
350 available -= f->metrics.keep_free;
354 if (new_size - old_size > available)
359 /* Increase by larger blocks at once */
360 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
361 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
362 new_size = f->metrics.max_size;
364 /* Note that the glibc fallocate() fallback is very
365 inefficient, hence we try to minimize the allocation area
367 r = posix_fallocate(f->fd, old_size, new_size - old_size);
371 if (fstat(f->fd, &f->last_stat) < 0)
374 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
/* Returns, through 'ret', a pointer for the file range [offset,
 * offset+size) obtained from the mmap cache. Ranges that extend past
 * the end of the file are rejected with -EADDRNOTAVAIL.
 *
 * NOTE(review): offset + size is unsigned 64-bit arithmetic and would
 * wrap for very large inputs; no overflow guard is visible in this
 * excerpt. */
379 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
386 /* Avoid SIGBUS on invalid accesses */
387 if (offset + size > (uint64_t) f->last_stat.st_size) {
388 /* Hmm, out of range? Let's refresh the fstat() data
389 * first, before we trust that check. */
391 if (fstat(f->fd, &f->last_stat) < 0 ||
392 offset + size > (uint64_t) f->last_stat.st_size)
393 return -EADDRNOTAVAIL;
396 return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
/* Returns the smallest size an object of o's type may have: the size of
 * the type-specific object struct, or sizeof(ObjectHeader) for types
 * that have no entry in the table. */
399 static uint64_t minimum_header_size(Object *o) {
/* Indexed by object type; types not listed are zero-initialized */
401 static const uint64_t table[] = {
402 [OBJECT_DATA] = sizeof(DataObject),
403 [OBJECT_FIELD] = sizeof(FieldObject),
404 [OBJECT_ENTRY] = sizeof(EntryObject),
405 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
406 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
407 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
408 [OBJECT_TAG] = sizeof(TagObject),
/* Out-of-range or unlisted type: fall back to the generic header size.
 * The table is unsigned, so "<= 0" is effectively "== 0". */
411 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
412 return sizeof(ObjectHeader);
414 return table[o->object.type];
/* Maps the object located at 'offset' and validates its header. Only
 * the ObjectHeader is mapped first; once the recorded size and type
 * have been checked, the mapping is redone for the full object. A
 * 'type' greater than zero additionally requires the object to be of
 * exactly that type. */
417 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
427 /* Objects may only be located at multiple of 64 bit */
428 if (!VALID64(offset))
431 /* One context for each type, plus one catch-all for the rest */
432 context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
434 r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
439 s = le64toh(o->object.size);
/* The recorded size must cover at least the generic header ... */
441 if (s < sizeof(ObjectHeader))
444 if (o->object.type <= OBJECT_UNUSED)
/* ... and at least the type-specific minimum */
447 if (s < minimum_header_size(o))
450 if (type > 0 && o->object.type != type)
/* Map the whole object now that its size is known */
453 if (s > sizeof(ObjectHeader)) {
454 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
/* Computes the sequence number for the next entry (one past the tail
 * sequence number in the header) and records it in the header. */
465 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
470 r = le64toh(f->header->tail_entry_seqnum) + 1;
473 /* If an external seqnum counter was passed, we update
474 * both the local and the external one, and set it to
475 * the maximum of both */
483 f->header->tail_entry_seqnum = htole64(r);
/* A head sequence number of zero is taken to mean "not set yet" */
485 if (f->header->head_entry_seqnum == 0)
486 f->header->head_entry_seqnum = htole64(r);
/* Appends a new object of the given type and size after the current
 * tail object, growing the file as needed, and updates the header's
 * tail_object_offset and n_objects. */
491 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
498 assert(type > 0 && type < _OBJECT_TYPE_MAX);
499 assert(size >= sizeof(ObjectHeader));
503 r = journal_file_set_online(f);
/* Find where the new object goes: directly behind the header, or
 * behind the current tail object (64-bit aligned) */
507 p = le64toh(f->header->tail_object_offset);
509 p = le64toh(f->header->header_size);
511 r = journal_file_move_to_object(f, -1, p, &tail);
515 p += ALIGN64(le64toh(tail->object.size));
518 r = journal_file_allocate(f, p, size);
522 r = journal_file_move_to(f, type, false, p, size, &t);
529 o->object.type = type;
530 o->object.size = htole64(size);
532 f->header->tail_object_offset = htole64(p);
533 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
/* Appends a zeroed data hash table object sized from metrics.max_size
 * and records its item area (offset and size) in the header. */
541 static int journal_file_setup_data_hash_table(JournalFile *f) {
548 /* We estimate that we need 1 hash table entry per 768 bytes of
549 journal file and we want to make sure we never get beyond
550 75% fill level. Calculate the hash table size for the
551 maximum file size based on these metrics. */
553 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
/* Never go below the default table size */
554 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
555 s = DEFAULT_DATA_HASH_TABLE_SIZE;
557 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
559 r = journal_file_append_object(f,
560 OBJECT_DATA_HASH_TABLE,
561 offsetof(Object, hash_table.items) + s,
566 memset(o->hash_table.items, 0, s);
/* The header points at the item array itself, not at the object header */
568 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
569 f->header->data_hash_table_size = htole64(s);
/* Appends a zeroed field hash table object of fixed size and records
 * its item area (offset and size) in the header. */
574 static int journal_file_setup_field_hash_table(JournalFile *f) {
581 /* We use a fixed size hash table for the fields as this
582 * number should grow very slowly only */
584 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
585 r = journal_file_append_object(f,
586 OBJECT_FIELD_HASH_TABLE,
587 offsetof(Object, hash_table.items) + s,
592 memset(o->hash_table.items, 0, s);
/* The header points at the item array itself, not at the object header */
594 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
595 f->header->field_hash_table_size = htole64(s);
/* Maps the data hash table, whose location and size are taken from the
 * header, and stores the resulting pointer in f->data_hash_table. */
600 static int journal_file_map_data_hash_table(JournalFile *f) {
607 p = le64toh(f->header->data_hash_table_offset);
608 s = le64toh(f->header->data_hash_table_size);
610 r = journal_file_move_to(f,
611 OBJECT_DATA_HASH_TABLE,
618 f->data_hash_table = t;
/* Maps the field hash table, whose location and size are taken from the
 * header, and stores the resulting pointer in f->field_hash_table. */
622 static int journal_file_map_field_hash_table(JournalFile *f) {
629 p = le64toh(f->header->field_hash_table_offset);
630 s = le64toh(f->header->field_hash_table_size);
632 r = journal_file_move_to(f,
633 OBJECT_FIELD_HASH_TABLE,
640 f->field_hash_table = t;
644 static int journal_file_link_field(
657 if (o->object.type != OBJECT_FIELD)
660 /* This might alter the window we are looking at */
662 o->field.next_hash_offset = o->field.head_data_offset = 0;
664 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
665 p = le64toh(f->field_hash_table[h].tail_hash_offset);
667 f->field_hash_table[h].head_hash_offset = htole64(offset);
669 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
673 o->field.next_hash_offset = htole64(offset);
676 f->field_hash_table[h].tail_hash_offset = htole64(offset);
678 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
679 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
684 static int journal_file_link_data(
697 if (o->object.type != OBJECT_DATA)
700 /* This might alter the window we are looking at */
702 o->data.next_hash_offset = o->data.next_field_offset = 0;
703 o->data.entry_offset = o->data.entry_array_offset = 0;
704 o->data.n_entries = 0;
706 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
707 p = le64toh(f->data_hash_table[h].tail_hash_offset);
709 /* Only entry in the hash table is easy */
710 f->data_hash_table[h].head_hash_offset = htole64(offset);
712 /* Move back to the previous data object, to patch in
715 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
719 o->data.next_hash_offset = htole64(offset);
722 f->data_hash_table[h].tail_hash_offset = htole64(offset);
724 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
725 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
730 int journal_file_find_field_object_with_hash(
732 const void *field, uint64_t size, uint64_t hash,
733 Object **ret, uint64_t *offset) {
735 uint64_t p, osize, h;
739 assert(field && size > 0);
741 osize = offsetof(Object, field.payload) + size;
743 if (f->header->field_hash_table_size == 0)
746 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
747 p = le64toh(f->field_hash_table[h].head_hash_offset);
752 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
756 if (le64toh(o->field.hash) == hash &&
757 le64toh(o->object.size) == osize &&
758 memcmp(o->field.payload, field, size) == 0) {
768 p = le64toh(o->field.next_hash_offset);
774 int journal_file_find_field_object(
776 const void *field, uint64_t size,
777 Object **ret, uint64_t *offset) {
782 assert(field && size > 0);
784 hash = hash64(field, size);
786 return journal_file_find_field_object_with_hash(f,
791 int journal_file_find_data_object_with_hash(
793 const void *data, uint64_t size, uint64_t hash,
794 Object **ret, uint64_t *offset) {
796 uint64_t p, osize, h;
800 assert(data || size == 0);
802 osize = offsetof(Object, data.payload) + size;
804 if (f->header->data_hash_table_size == 0)
807 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
808 p = le64toh(f->data_hash_table[h].head_hash_offset);
813 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
817 if (le64toh(o->data.hash) != hash)
820 if (o->object.flags & OBJECT_COMPRESSED) {
824 l = le64toh(o->object.size);
825 if (l <= offsetof(Object, data.payload))
828 l -= offsetof(Object, data.payload);
830 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0))
834 memcmp(f->compress_buffer, data, size) == 0) {
845 return -EPROTONOSUPPORT;
848 } else if (le64toh(o->object.size) == osize &&
849 memcmp(o->data.payload, data, size) == 0) {
861 p = le64toh(o->data.next_hash_offset);
867 int journal_file_find_data_object(
869 const void *data, uint64_t size,
870 Object **ret, uint64_t *offset) {
875 assert(data || size == 0);
877 hash = hash64(data, size);
879 return journal_file_find_data_object_with_hash(f,
884 static int journal_file_append_field(
886 const void *field, uint64_t size,
887 Object **ret, uint64_t *offset) {
895 assert(field && size > 0);
897 hash = hash64(field, size);
899 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
913 osize = offsetof(Object, field.payload) + size;
914 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
918 o->field.hash = htole64(hash);
919 memcpy(o->field.payload, field, size);
921 r = journal_file_link_field(f, o, p, hash);
925 /* The linking might have altered the window, so let's
926 * refresh our pointer */
927 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
932 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
/* Looks up the data object for 'data' by hash and, as far as this
 * excerpt shows, otherwise appends a new OBJECT_DATA (compressed when
 * large enough), links it into the data hash table and attaches it to
 * the field object named by the part of 'data' before the first '='. */
946 static int journal_file_append_data(
948 const void *data, uint64_t size,
949 Object **ret, uint64_t *offset) {
955 bool compressed = false;
959 assert(data || size == 0);
961 hash = hash64(data, size);
/* Check for an existing object with the same payload first */
963 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
977 osize = offsetof(Object, data.payload) + size;
978 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
982 o->data.hash = htole64(hash);
986 size >= COMPRESSION_SIZE_THRESHOLD) {
989 compressed = compress_blob(data, size, o->data.payload, &rsize);
/* On success the recorded object size shrinks to the compressed length */
992 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
993 o->object.flags |= OBJECT_COMPRESSED;
995 log_debug("Compressed data object %"PRIu64" -> %"PRIu64, size, rsize);
/* Store the payload verbatim when it was not compressed */
1000 if (!compressed && size > 0)
1001 memcpy(o->data.payload, data, size);
1003 r = journal_file_link_data(f, o, p, hash);
1007 /* The linking might have altered the window, so let's
1008 * refresh our pointer */
1009 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* NOTE(review): the assertion above admits data == NULL when size == 0,
 * in which case memchr() receives a null pointer -- confirm this is
 * acceptable. */
1016 eq = memchr(data, '=', size);
/* Only a non-empty name in front of the '=' yields a field object */
1017 if (eq && eq > data) {
1021 /* Create field object ... */
1022 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1026 /* ... and link it in. */
1027 o->data.next_field_offset = fo->field.head_data_offset;
/* NOTE(review): p is a host-order offset stored into an on-disk field,
 * so htole64() would state the intent. le64toh() performs the same
 * byte swap, hence the stored value is the same. */
1028 fo->field.head_data_offset = le64toh(p);
1032 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
/* Number of EntryItem slots in an entry object, derived from the object
 * size recorded in its header. */
1046 uint64_t journal_file_entry_n_items(Object *o) {
1049 if (o->object.type != OBJECT_ENTRY)
/* Bytes after the fixed part of the object, divided by the item size */
1052 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
/* Number of 64-bit slots in an entry array object, derived from the
 * object size recorded in its header. */
1055 uint64_t journal_file_entry_array_n_items(Object *o) {
1058 if (o->object.type != OBJECT_ENTRY_ARRAY)
/* Bytes after the fixed part of the object, divided by the slot size */
1061 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
/* Number of HashItem slots in a data or field hash table object,
 * derived from the object size recorded in its header. */
1064 uint64_t journal_file_hash_table_n_items(Object *o) {
1067 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1068 o->object.type != OBJECT_FIELD_HASH_TABLE)
/* Bytes after the fixed part of the object, divided by the item size */
1071 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1074 static int link_entry_into_array(JournalFile *f,
1079 uint64_t n = 0, ap = 0, q, i, a, hidx;
1087 a = le64toh(*first);
1088 i = hidx = le64toh(*idx);
1091 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1095 n = journal_file_entry_array_n_items(o);
1097 o->entry_array.items[i] = htole64(p);
1098 *idx = htole64(hidx + 1);
1104 a = le64toh(o->entry_array.next_entry_array_offset);
1115 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1116 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1122 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1127 o->entry_array.items[i] = htole64(p);
1130 *first = htole64(q);
1132 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1136 o->entry_array.next_entry_array_offset = htole64(q);
1139 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1140 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1142 *idx = htole64(hidx + 1);
1147 static int link_entry_into_array_plus_one(JournalFile *f,
1162 *extra = htole64(p);
1166 i = htole64(le64toh(*idx) - 1);
1167 r = link_entry_into_array(f, first, &i, p);
1172 *idx = htole64(le64toh(*idx) + 1);
/* Links an entry into the entry list of the data object referenced by
 * the entry's i-th item. */
1176 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
/* Offset of the data object this item refers to */
1183 p = le64toh(o->entry.items[i].object_offset);
/* From here on 'o' is the data object, no longer the entry */
1187 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1191 return link_entry_into_array_plus_one(f,
1192 &o->data.entry_offset,
1193 &o->data.entry_array_offset,
/* Hooks a freshly written entry object into the file: first into the
 * global entry array rooted in the header, then into each data object
 * the entry references. The head/tail timestamps in the header are
 * updated along the way. */
1198 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1206 if (o->object.type != OBJECT_ENTRY)
/* Full memory barrier before the entry is linked in */
1209 __sync_synchronize();
1211 /* Link up the entry itself */
1212 r = link_entry_into_array(f,
1213 &f->header->entry_array_offset,
1214 &f->header->n_entries,
1219 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
/* A head realtime of zero is taken to mean "not set yet" */
1221 if (f->header->head_entry_realtime == 0)
1222 f->header->head_entry_realtime = o->entry.realtime;
1224 f->header->tail_entry_realtime = o->entry.realtime;
1225 f->header->tail_entry_monotonic = o->entry.monotonic;
1227 f->tail_entry_monotonic_valid = true;
1229 /* Link up the items */
1230 n = journal_file_entry_n_items(o);
1231 for (i = 0; i < n; i++) {
1232 r = journal_file_link_entry_item(f, o, offset, i);
1240 static int journal_file_append_entry_internal(
1242 const dual_timestamp *ts,
1244 const EntryItem items[], unsigned n_items,
1246 Object **ret, uint64_t *offset) {
1253 assert(items || n_items == 0);
1256 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1258 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1262 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1263 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1264 o->entry.realtime = htole64(ts->realtime);
1265 o->entry.monotonic = htole64(ts->monotonic);
1266 o->entry.xor_hash = htole64(xor_hash);
1267 o->entry.boot_id = f->header->boot_id;
1270 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1275 r = journal_file_link_entry(f, o, np);
/* Notifies inotify watchers that the journal file has changed. */
1288 void journal_file_post_change(JournalFile *f) {
1291 /* inotify does not generate IN_MODIFY events for file
1292 * modifications done via mmap(). After each change we hence
1293 * truncate the journal file to its last known size, which
1294 * triggers IN_MODIFY. */
1296 __sync_synchronize();
/* Failure is only logged; this function returns void */
1298 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1299 log_error("Failed to truncate file to its own size: %m");
/* Sort comparator for EntryItem: compares by object_offset, converted
 * from little endian to host order first. */
1302 static int entry_item_cmp(const void *_a, const void *_b) {
1303 const EntryItem *a = _a, *b = _b;
1305 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1307 if (le64toh(a->object_offset) > le64toh(b->object_offset))
/* Appends one journal entry made of n_iovec data fields. Each iovec
 * becomes (or re-uses) a data object, and the resulting items are
 * sorted by file offset and written as a single entry object. Finally
 * watchers are notified through journal_file_post_change().
 *
 * NOTE(review): the item array is alloca()'d with a size proportional
 * to n_iovec; no upper bound on n_iovec is visible in this excerpt. */
1312 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1316 uint64_t xor_hash = 0;
1317 struct dual_timestamp _ts;
1320 assert(iovec || n_iovec == 0);
1323 dual_timestamp_get(&_ts);
/* Compare the new monotonic timestamp against the tail entry's, but
 * only while that tail timestamp is known to be valid */
1327 if (f->tail_entry_monotonic_valid &&
1328 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1332 r = journal_file_maybe_append_tag(f, ts->realtime);
1337 /* alloca() can't take 0, hence let's allocate at least one */
1338 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1340 for (i = 0; i < n_iovec; i++) {
1344 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
/* The entry carries the XOR of all its data hashes */
1348 xor_hash ^= le64toh(o->data.hash);
1349 items[i].object_offset = htole64(p);
1350 items[i].hash = o->data.hash;
1353 /* Order by the position on disk, in order to improve seek
1354 * times for rotating media. */
1355 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1357 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1359 journal_file_post_change(f);
1364 typedef struct ChainCacheItem {
1365 uint64_t first; /* the array at the begin of the chain */
1366 uint64_t array; /* the cached array */
1367 uint64_t begin; /* the first item in the cached array */
1368 uint64_t total; /* the total number of items in all arrays before this one in the chain */
1369 uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
1372 static void chain_cache_put(
1379 uint64_t last_index) {
1382 /* If the chain item to cache for this chain is the
1383 * first one it's not worth caching anything */
1387 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1388 ci = hashmap_steal_first(h);
1390 ci = new(ChainCacheItem, 1);
1397 if (hashmap_put(h, &ci->first, ci) < 0) {
1402 assert(ci->first == first);
1407 ci->last_index = last_index;
1410 static int generic_array_get(
1414 Object **ret, uint64_t *offset) {
1417 uint64_t p = 0, a, t = 0;
1425 /* Try the chain cache first */
1426 ci = hashmap_get(f->chain_cache, &first);
1427 if (ci && i > ci->total) {
1436 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1440 k = journal_file_entry_array_n_items(o);
1442 p = le64toh(o->entry_array.items[i]);
1448 a = le64toh(o->entry_array.next_entry_array_offset);
1454 /* Let's cache this item for the next invocation */
1455 chain_cache_put(f->chain_cache, ci, first, a, o->entry_array.items[0], t, i);
1457 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1470 static int generic_array_get_plus_one(
1475 Object **ret, uint64_t *offset) {
1484 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1497 return generic_array_get(f, first, i-1, ret, offset);
1506 static int generic_array_bisect(
1511 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1512 direction_t direction,
1517 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1518 bool subtract_one = false;
1519 Object *o, *array = NULL;
1524 assert(test_object);
1526 /* Start with the first array in the chain */
1529 ci = hashmap_get(f->chain_cache, &first);
1530 if (ci && n > ci->total) {
1531 /* Ah, we have iterated this bisection array chain
1532 * previously! Let's see if we can skip ahead in the
1533 * chain, as far as the last time. But we can't jump
1534 * backwards in the chain, so let's check that
1537 r = test_object(f, ci->begin, needle);
1541 if (r == TEST_LEFT) {
1542 /* OK, what we are looking for is right of the
1543 * begin of this EntryArray, so let's jump
1544 * straight to previously cached array in the
1550 last_index = ci->last_index;
1555 uint64_t left, right, k, lp;
1557 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1561 k = journal_file_entry_array_n_items(array);
1567 lp = p = le64toh(array->entry_array.items[i]);
1571 r = test_object(f, p, needle);
1575 if (r == TEST_FOUND)
1576 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1578 if (r == TEST_RIGHT) {
1582 if (last_index != (uint64_t) -1) {
1583 assert(last_index <= right);
1585 /* If we cached the last index we
1586 * looked at, let's try to not to jump
1587 * too wildly around and see if we can
1588 * limit the range to look at early to
1589 * the immediate neighbors of the last
1590 * index we looked at. */
1592 if (last_index > 0) {
1593 uint64_t x = last_index - 1;
1595 p = le64toh(array->entry_array.items[x]);
1599 r = test_object(f, p, needle);
1603 if (r == TEST_FOUND)
1604 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1606 if (r == TEST_RIGHT)
1612 if (last_index < right) {
1613 uint64_t y = last_index + 1;
1615 p = le64toh(array->entry_array.items[y]);
1619 r = test_object(f, p, needle);
1623 if (r == TEST_FOUND)
1624 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1626 if (r == TEST_RIGHT)
1634 if (left == right) {
1635 if (direction == DIRECTION_UP)
1636 subtract_one = true;
1642 assert(left < right);
1643 i = (left + right) / 2;
1645 p = le64toh(array->entry_array.items[i]);
1649 r = test_object(f, p, needle);
1653 if (r == TEST_FOUND)
1654 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1656 if (r == TEST_RIGHT)
1664 if (direction == DIRECTION_UP) {
1666 subtract_one = true;
1677 last_index = (uint64_t) -1;
1678 a = le64toh(array->entry_array.next_entry_array_offset);
1684 if (subtract_one && t == 0 && i == 0)
1687 /* Let's cache this item for the next invocation */
1688 chain_cache_put(f->chain_cache, ci, first, a, array->entry_array.items[0], t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1690 if (subtract_one && i == 0)
1692 else if (subtract_one)
1693 p = le64toh(array->entry_array.items[i-1]);
1695 p = le64toh(array->entry_array.items[i]);
1697 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1708 *idx = t + i + (subtract_one ? -1 : 0);
1714 static int generic_array_bisect_plus_one(
1720 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1721 direction_t direction,
1727 bool step_back = false;
1731 assert(test_object);
1736 /* This bisects the array in object 'first', but first checks
1738 r = test_object(f, extra, needle);
1742 if (r == TEST_FOUND)
1743 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1745 /* if we are looking with DIRECTION_UP then we need to first
1746 see if in the actual array there is a matching entry, and
1747 return the last one of that. But if there isn't any we need
1748 to return this one. Hence remember this, and return it
1751 step_back = direction == DIRECTION_UP;
1753 if (r == TEST_RIGHT) {
1754 if (direction == DIRECTION_DOWN)
1760 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1762 if (r == 0 && step_back)
1771 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1787 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1793 else if (p < needle)
1799 int journal_file_move_to_entry_by_offset(
1802 direction_t direction,
1806 return generic_array_bisect(f,
1807 le64toh(f->header->entry_array_offset),
1808 le64toh(f->header->n_entries),
1816 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1823 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1827 if (le64toh(o->entry.seqnum) == needle)
1829 else if (le64toh(o->entry.seqnum) < needle)
1835 int journal_file_move_to_entry_by_seqnum(
1838 direction_t direction,
1842 return generic_array_bisect(f,
1843 le64toh(f->header->entry_array_offset),
1844 le64toh(f->header->n_entries),
1851 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1858 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1862 if (le64toh(o->entry.realtime) == needle)
1864 else if (le64toh(o->entry.realtime) < needle)
1870 int journal_file_move_to_entry_by_realtime(
1873 direction_t direction,
1877 return generic_array_bisect(f,
1878 le64toh(f->header->entry_array_offset),
1879 le64toh(f->header->n_entries),
1881 test_object_realtime,
1886 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1893 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1897 if (le64toh(o->entry.monotonic) == needle)
1899 else if (le64toh(o->entry.monotonic) < needle)
/* Looks up the data object whose payload is "_BOOT_ID=" followed by the
 * string form of the given boot ID. */
1905 static inline int find_data_object_by_boot_id(
/* 9 bytes of prefix, 32 bytes for the formatted ID, 1 for the NUL */
1910 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
/* Write the ID right behind the 9-character prefix */
1912 sd_id128_to_string(boot_id, t + 9);
/* The trailing NUL is not part of the payload searched for */
1913 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1916 int journal_file_move_to_entry_by_monotonic(
1920 direction_t direction,
1929 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1935 return generic_array_bisect_plus_one(f,
1936 le64toh(o->data.entry_offset),
1937 le64toh(o->data.entry_array_offset),
1938 le64toh(o->data.n_entries),
1940 test_object_monotonic,
/* Steps to a neighbouring entry in the file-wide entry array, relative to
 * the entry object o at offset p, in the given direction. The target is
 * fetched through generic_array_get() and reported via ret and offset. */
1945 int journal_file_next_entry(
1947 Object *o, uint64_t p,
1948 direction_t direction,
1949 Object **ret, uint64_t *offset) {
/* A non-NULL o must be accompanied by a non-zero offset p. */
1955 assert(p > 0 || !o);
1957 n = le64toh(f->header->n_entries);
/* Start index at one end of the array: first entry for DIRECTION_DOWN, last
 * entry otherwise. Presumably this is the path taken when no current entry
 * o is supplied -- confirm. */
1962 i = direction == DIRECTION_DOWN ? 0 : n - 1;
/* The object passed in is checked to be an entry object. */
1964 if (o->object.type != OBJECT_ENTRY)
/* Bisect the file-wide entry array; presumably this determines the index of
 * the current entry (offset p) -- confirm against the remaining arguments. */
1967 r = generic_array_bisect(f,
1968 le64toh(f->header->entry_array_offset),
1969 le64toh(f->header->n_entries),
/* The index adjustment differs by direction. */
1978 if (direction == DIRECTION_DOWN) {
1991 /* And jump to it */
1992 return generic_array_get(f,
1993 le64toh(f->header->entry_array_offset),
/* Moves a signed number of entries (skip) away from the entry object o at
 * offset p within the file-wide entry array, then fetches the entry at the
 * resulting index through generic_array_get(). */
1998 int journal_file_skip_entry(
2000 Object *o, uint64_t p,
2002 Object **ret, uint64_t *offset) {
/* The object passed in is checked to be an entry object. */
2011 if (o->object.type != OBJECT_ENTRY)
/* Bisect the file-wide entry array; presumably this yields the index i of
 * the current entry -- confirm against the remaining arguments. */
2014 r = generic_array_bisect(f,
2015 le64toh(f->header->entry_array_offset),
2016 le64toh(f->header->n_entries),
2025 /* Calculate new index */
/* Backward move: the magnitude of skip is compared with i before the
 * subtraction, so the unsigned index is not decremented past zero.
 * NOTE(review): -skip is evaluated in skip's own type before the cast; if
 * skip is a signed 64-bit value equal to its minimum, the negation would
 * overflow -- confirm the caller's range. */
2027 if ((uint64_t) -skip >= i)
2030 i = i - (uint64_t) -skip;
/* Forward move. */
2032 i += (uint64_t) skip;
/* n is the number of entries in the file; presumably used to bound i. */
2034 n = le64toh(f->header->n_entries);
2041 return generic_array_get(f,
2042 le64toh(f->header->entry_array_offset),
/* Like journal_file_next_entry(), but restricted to the entries linked from
 * one data object: steps to a neighbouring entry, relative to entry o at
 * offset p, within the entry list of the data object at data_offset. */
2047 int journal_file_next_entry_for_data(
2049 Object *o, uint64_t p,
2050 uint64_t data_offset,
2051 direction_t direction,
2052 Object **ret, uint64_t *offset) {
/* A non-NULL o must be accompanied by a non-zero offset p. */
2059 assert(p > 0 || !o);
/* Load the data object whose entry list is being walked. */
2061 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
/* Number of entries that reference this data object. */
2065 n = le64toh(d->data.n_entries);
/* Start index at one end of the list: first entry for DIRECTION_DOWN, last
 * entry otherwise. Presumably used when no current entry o is supplied. */
2070 i = direction == DIRECTION_DOWN ? 0 : n - 1;
/* The object passed in is checked to be an entry object. */
2072 if (o->object.type != OBJECT_ENTRY)
/* Bisect this data object's entry list (entry_offset plus the array at
 * entry_array_offset); presumably to find the index of the current entry. */
2075 r = generic_array_bisect_plus_one(f,
2076 le64toh(d->data.entry_offset),
2077 le64toh(d->data.entry_array_offset),
2078 le64toh(d->data.n_entries),
/* The index adjustment differs by direction. */
2088 if (direction == DIRECTION_DOWN) {
/* Fetch the entry at the computed index from the same list. */
2102 return generic_array_get_plus_one(f,
2103 le64toh(d->data.entry_offset),
2104 le64toh(d->data.entry_array_offset),
/* Seeks within the entries linked from the data object at data_offset by
 * bisecting that object's entry list. NOTE(review): going by the function
 * name the needle is an entry file offset -- confirm against the needle and
 * callback arguments. */
2109 int journal_file_move_to_entry_by_offset_for_data(
2111 uint64_t data_offset,
2113 direction_t direction,
2114 Object **ret, uint64_t *offset) {
/* Load the data object whose entry list is searched. */
2121 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2125 return generic_array_bisect_plus_one(f,
2126 le64toh(d->data.entry_offset),
2127 le64toh(d->data.entry_array_offset),
2128 le64toh(d->data.n_entries),
/* Seeks by monotonic timestamp within one boot, restricted to entries that
 * reference the data object at data_offset. Two entry lists are involved:
 * the list of the _BOOT_ID= data object for boot_id, and the list of the
 * data object at data_offset. The code bisects the first by time and then
 * bisects the two lists in turn, as the existing comments describe, until
 * an entry present in both is found. */
2135 int journal_file_move_to_entry_by_monotonic_for_data(
2137 uint64_t data_offset,
2140 direction_t direction,
2141 Object **ret, uint64_t *offset) {
2149 /* First, seek by time */
/* b receives the offset of the boot ID data object so that the object can
 * be loaded again later. */
2150 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2156 r = generic_array_bisect_plus_one(f,
2157 le64toh(o->data.entry_offset),
2158 le64toh(o->data.entry_array_offset),
2159 le64toh(o->data.n_entries),
2161 test_object_monotonic,
2167 /* And now, continue seeking until we find an entry that
2168 * exists in both bisection arrays */
/* Load the data object at data_offset and bisect its entry list. */
2174 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2178 r = generic_array_bisect_plus_one(f,
2179 le64toh(d->data.entry_offset),
2180 le64toh(d->data.entry_array_offset),
2181 le64toh(d->data.n_entries),
/* Load the boot ID data object again via its saved offset b and bisect its
 * entry list. NOTE(review): the reload is presumably needed because earlier
 * object pointers may not stay valid across journal_file_move_to_object()
 * calls -- confirm. */
2189 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2193 r = generic_array_bisect_plus_one(f,
2194 le64toh(o->data.entry_offset),
2195 le64toh(o->data.entry_array_offset),
2196 le64toh(o->data.n_entries),
/* Seeks within the entries linked from the data object at data_offset by
 * bisecting that object's entry list. NOTE(review): going by the function
 * name the needle is a sequence number and the callback is
 * test_object_seqnum() -- confirm. */
2220 int journal_file_move_to_entry_by_seqnum_for_data(
2222 uint64_t data_offset,
2224 direction_t direction,
2225 Object **ret, uint64_t *offset) {
/* Load the data object whose entry list is searched. */
2232 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2236 return generic_array_bisect_plus_one(f,
2237 le64toh(d->data.entry_offset),
2238 le64toh(d->data.entry_array_offset),
2239 le64toh(d->data.n_entries),
/* Seeks by realtime timestamp within the entries linked from the data
 * object at data_offset: bisects that object's entry list using
 * test_object_realtime() as the comparison callback. */
2246 int journal_file_move_to_entry_by_realtime_for_data(
2248 uint64_t data_offset,
2250 direction_t direction,
2251 Object **ret, uint64_t *offset) {
/* Load the data object whose entry list is searched. */
2258 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2262 return generic_array_bisect_plus_one(f,
2263 le64toh(d->data.entry_offset),
2264 le64toh(d->data.entry_array_offset),
2265 le64toh(d->data.n_entries),
2267 test_object_realtime,
/* Debugging aid: prints the file header via journal_file_print_header() and
 * then one "Type: ..." line per object to stdout. Objects are visited in
 * file order, starting right after the header and stopping at the object
 * located at header->tail_object_offset. */
2272 void journal_file_dump(JournalFile *f) {
2279 journal_file_print_header(f);
/* The first object starts at offset header_size. */
2281 p = le64toh(f->header->header_size);
/* -1 is passed as the expected type; presumably this means "accept any
 * object type" -- confirm in journal_file_move_to_object(). */
2283 r = journal_file_move_to_object(f, -1, p, &o);
2287 switch (o->object.type) {
2290 printf("Type: OBJECT_UNUSED\n");
2294 printf("Type: OBJECT_DATA\n");
2298 printf("Type: OBJECT_FIELD\n");
/* Entry objects additionally show their seqnum and both timestamps. */
2302 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2303 le64toh(o->entry.seqnum),
2304 le64toh(o->entry.monotonic),
2305 le64toh(o->entry.realtime));
2308 case OBJECT_FIELD_HASH_TABLE:
2309 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2312 case OBJECT_DATA_HASH_TABLE:
2313 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2316 case OBJECT_ENTRY_ARRAY:
2317 printf("Type: OBJECT_ENTRY_ARRAY\n");
/* Tag objects additionally show their seqnum and epoch. */
2321 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2322 le64toh(o->tag.seqnum),
2323 le64toh(o->tag.epoch));
/* Unrecognized type values are printed numerically. */
2327 printf("Type: unknown (%u)\n", o->object.type);
2331 if (o->object.flags & OBJECT_COMPRESSED)
2332 printf("Flags: COMPRESSED\n");
/* The tail object is the last one in the file. */
2334 if (p == le64toh(f->header->tail_object_offset))
/* Advance by the current object's size rounded up with ALIGN64(). */
2337 p = p + ALIGN64(le64toh(o->object.size));
/* Presumably the failure path taken when an object cannot be loaded. */
2342 log_error("File corrupt");
/* Wrapper around format_timestamp() that formats t into buf (size l).
 * NOTE(review): the name suggests it substitutes a fallback string when
 * format_timestamp() yields no result, so the return value is always safe
 * to pass to printf("%s") -- confirm against the return statements. */
2345 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2348 x = format_timestamp(buf, l, t);
/* Prints a human-readable summary of the journal file header to stdout:
 * IDs, state, feature flags, sizes, sequence numbers, timestamps and object
 * counts. Counters that may be missing from the header are only printed
 * when JOURNAL_HEADER_CONTAINS() reports them. Disk usage is printed last
 * and only if fstat() succeeds. */
2354 void journal_file_print_header(JournalFile *f) {
/* String buffers for the four 128-bit IDs (33 bytes each). */
2355 char a[33], b[33], c[33], d[33];
/* NOTE(review): z is sized with FORMAT_TIMESTAMP_MAX but is filled by
 * format_timespan() below; confirm that this size is sufficient. */
2356 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2358 char bytes[FORMAT_BYTES_MAX];
2362 printf("File Path: %s\n"
2366 "Sequential Number ID: %s\n"
2368 "Compatible Flags:%s%s\n"
2369 "Incompatible Flags:%s%s\n"
2370 "Header size: %"PRIu64"\n"
2371 "Arena size: %"PRIu64"\n"
2372 "Data Hash Table Size: %"PRIu64"\n"
2373 "Field Hash Table Size: %"PRIu64"\n"
2374 "Rotate Suggested: %s\n"
2375 "Head Sequential Number: %"PRIu64"\n"
2376 "Tail Sequential Number: %"PRIu64"\n"
2377 "Head Realtime Timestamp: %s\n"
2378 "Tail Realtime Timestamp: %s\n"
2379 "Tail Monotonic Timestamp: %s\n"
2380 "Objects: %"PRIu64"\n"
2381 "Entry Objects: %"PRIu64"\n",
2383 sd_id128_to_string(f->header->file_id, a),
2384 sd_id128_to_string(f->header->machine_id, b),
2385 sd_id128_to_string(f->header->boot_id, c),
2386 sd_id128_to_string(f->header->seqnum_id, d),
/* Any state value other than the three known ones prints as "UNKNOWN". */
2387 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2388 f->header->state == STATE_ONLINE ? "ONLINE" :
2389 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
/* For each flag word: the known flag by name, then " ???" if any other bit
 * is set. */
2390 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2391 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2392 JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2393 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2394 le64toh(f->header->header_size),
2395 le64toh(f->header->arena_size),
/* Hash table sizes are stored in bytes; dividing by sizeof(HashItem) gives
 * the number of items. */
2396 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2397 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
/* Rotation check with max_file_usec == 0. */
2398 yes_no(journal_file_rotate_suggested(f, 0)),
2399 le64toh(f->header->head_entry_seqnum),
2400 le64toh(f->header->tail_entry_seqnum),
2401 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2402 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2403 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2404 le64toh(f->header->n_objects),
2405 le64toh(f->header->n_entries));
/* Fill level in percent = object count / number of hash table items. */
2407 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2408 printf("Data Objects: %"PRIu64"\n"
2409 "Data Hash Table Fill: %.1f%%\n",
2410 le64toh(f->header->n_data),
2411 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2413 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2414 printf("Field Objects: %"PRIu64"\n"
2415 "Field Hash Table Fill: %.1f%%\n",
2416 le64toh(f->header->n_fields),
2417 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2419 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2420 printf("Tag Objects: %"PRIu64"\n",
2421 le64toh(f->header->n_tags));
2422 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2423 printf("Entry Array Objects: %"PRIu64"\n",
2424 le64toh(f->header->n_entry_arrays));
/* st_blocks is multiplied by 512 to obtain a byte count. */
2426 if (fstat(f->fd, &st) >= 0)
2427 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
/* Opens the journal file fname, creating and initializing it when it is
 * empty and opened writable. Visible steps, in order: validate the access
 * mode and file name suffix, allocate the JournalFile, take or create an
 * mmap cache, open the file, initialize a new file or verify an existing
 * header, load sealing state, apply metrics, refresh the header, set up
 * HMAC, and set up or map the hash tables. On the visible failure path the
 * partially set up object is released with journal_file_close(). */
2430 int journal_file_open(
2436 JournalMetrics *metrics,
2437 MMapCache *mmap_cache,
2438 JournalFile *template,
2439 JournalFile **ret) {
2443 bool newly_created = false;
/* Only the O_RDONLY and O_RDWR access modes pass this check. */
2448 if ((flags & O_ACCMODE) != O_RDONLY &&
2449 (flags & O_ACCMODE) != O_RDWR)
/* The name must end in ".journal" or ".journal~". */
2452 if (!endswith(fname, ".journal") &&
2453 !endswith(fname, ".journal~"))
2456 f = new0(JournalFile, 1);
/* Memory protection for mappings and the writable flag both derive from the
 * open flags. */
2464 f->prot = prot_from_flags(flags);
2465 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2467 f->compress = compress;
/* Either take a reference on an mmap cache or create a new one; presumably
 * the choice depends on whether mmap_cache was passed in. */
2474 f->mmap = mmap_cache_ref(mmap_cache);
2476 f->mmap = mmap_cache_new();
2483 f->path = strdup(fname);
/* Hashmap keyed by uint64_t values, kept in f->chain_cache. */
2489 f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2490 if (!f->chain_cache) {
/* O_CLOEXEC is always added to the flags. */
2495 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2501 if (fstat(f->fd, &f->last_stat) < 0) {
/* A zero-length file opened writable is treated as new and initialized. */
2506 if (f->last_stat.st_size == 0 && f->writable) {
2510 /* Let's attach the creation time to the journal file,
2511 * so that the vacuuming code knows the age of this
2512 * file even if the file might end up corrupted one
2513 * day... Ideally we'd just use the creation time many
2514 * file systems maintain for each file, but there is
2515 * currently no usable API to query this, hence let's
2516 * emulate this via extended attributes. If extended
2517 * attributes are not supported we'll just skip this,
2518 * and rely solely on mtime/atime/ctime of the file.*/
/* The timestamp is stored little-endian. The fsetxattr() result is not
 * checked, which matches the "just skip this" policy described above. */
2520 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2521 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2525 /* Try to load the FSPRG state, and if we can't, then
2526 * just don't do sealing */
2528 r = journal_file_fss_load(f);
/* Write the initial header, using template as input. */
2534 r = journal_file_init_header(f, template);
/* Refresh the cached stat data after writing the header. */
2538 if (fstat(f->fd, &f->last_stat) < 0) {
2543 newly_created = true;
/* The file must be at least HEADER_SIZE_MIN bytes to hold a header. */
2546 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
/* Map the header with MAP_SHARED from file offset 0; the mapping length is
 * sizeof(Header) rounded up with PAGE_ALIGN(). */
2551 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2552 if (f->header == MAP_FAILED) {
/* Only pre-existing files are verified. */
2558 if (!newly_created) {
2559 r = journal_file_verify_header(f);
/* For existing writable files the FSPRG state is loaded at this point; new
 * files already did so above. */
2565 if (!newly_created && f->writable) {
2566 r = journal_file_fss_load(f);
/* Metrics: complete the passed-in metrics with defaults for this file
 * system and copy them, or else inherit the template's metrics. */
2574 journal_default_metrics(metrics, f->fd);
2575 f->metrics = *metrics;
2576 } else if (template)
2577 f->metrics = template->metrics;
2579 r = journal_file_refresh_header(f);
2585 r = journal_file_hmac_setup(f);
/* New files get their hash tables and first tag created here. */
2590 if (newly_created) {
2591 r = journal_file_setup_field_hash_table(f);
2595 r = journal_file_setup_data_hash_table(f);
2600 r = journal_file_append_first_tag(f);
/* Map both hash tables for use. */
2606 r = journal_file_map_field_hash_table(f);
2610 r = journal_file_map_data_hash_table(f);
/* Presumably the common failure path: release everything acquired so far. */
2618 journal_file_close(f);
/* Rotates a writable journal file: renames the current file to an archive
 * name, marks its header STATE_ARCHIVED, opens a fresh file under the
 * original path and closes the old one. The archive name has the form
 * "<path without .journal>@<seqnum_id>-<head seqnum>-<head realtime>.journal"
 * with both numbers printed as 16 hex digits. */
2623 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
/* p holds the archive path and is freed automatically on scope exit. */
2624 _cleanup_free_ char *p = NULL;
2626 JournalFile *old_file, *new_file = NULL;
/* Rotation requires a writable file. */
2634 if (!old_file->writable)
/* The path must end in ".journal"; this also guarantees that the length
 * arithmetic below (l - 8) cannot go negative. */
2637 if (!endswith(old_file->path, ".journal"))
2640 l = strlen(old_file->path);
/* "%.*s" with precision l - 8 prints the path minus the 8-character
 * ".journal" suffix. NOTE(review): (*f)->header is used here next to
 * old_file->header; presumably old_file == *f -- confirm. */
2641 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2642 (int) l - 8, old_file->path,
2643 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2644 le64toh((*f)->header->head_entry_seqnum),
2645 le64toh((*f)->header->head_entry_realtime));
2649 r = rename(old_file->path, p);
/* The state is changed only after the rename call. */
2653 old_file->header->state = STATE_ARCHIVED;
/* The new file reuses the old path, flags, mode and mmap cache, passes NULL
 * for metrics and uses old_file as the template. */
2655 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
/* Called unconditionally here, before r is inspected. */
2656 journal_file_close(old_file);
/* Wrapper around journal_file_open() that recovers from an unusable file:
 * when the first open fails with one of the errors listed below, and the
 * request is a writable, creating open of a ".journal" file, the existing
 * file is renamed to a ".journal~" name and the open is retried once. */
2662 int journal_file_open_reliably(
2668 JournalMetrics *metrics,
2669 MMapCache *mmap_cache,
2670 JournalFile *template,
2671 JournalFile **ret) {
/* p holds the rename target and is freed automatically on scope exit. */
2675 _cleanup_free_ char *p = NULL;
2677 r = journal_file_open(fname, flags, mode, compress, seal,
2678 metrics, mmap_cache, template, ret);
/* Any result other than these six error codes takes the branch below;
 * presumably r (including success) is returned unchanged there. */
2679 if (r != -EBADMSG && /* corrupted */
2680 r != -ENODATA && /* truncated */
2681 r != -EHOSTDOWN && /* other machine */
2682 r != -EPROTONOSUPPORT && /* incompatible feature */
2683 r != -EBUSY && /* unclean shutdown */
2684 r != -ESHUTDOWN /* already archived */)
/* Three further checks on the recovery path: read-only opens, opens without
 * O_CREAT, and names not ending in ".journal". */
2687 if ((flags & O_ACCMODE) == O_RDONLY)
2690 if (!(flags & O_CREAT))
2693 if (!endswith(fname, ".journal"))
2696 /* The file is corrupted. Rotate it away and try it again (but only once) */
/* The new name is "<prefix>@<16 hex digits>-<16 hex digits>.journal~"; the
 * first number is the current realtime clock value. */
2699 if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2701 (unsigned long long) now(CLOCK_REALTIME),
2705 r = rename(fname, p);
2709 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
/* Second and final attempt, with the same arguments. */
2711 return journal_file_open(fname, flags, mode, compress, seal,
2712 metrics, mmap_cache, template, ret);
/* Copies the entry object o (at offset p in "from") into the file "to".
 * Each data object referenced by the entry is read from "from",
 * decompressed if necessary and appended to "to" with
 * journal_file_append_data(). A new entry referencing the appended data is
 * then written with journal_file_append_entry_internal(), keeping the
 * original monotonic and realtime timestamps. seqnum, ret and offset are
 * forwarded to that final call. */
2715 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
/* xor_hash accumulates the XOR of the hashes of all appended data objects. */
2717 uint64_t q, xor_hash = 0;
2730 ts.monotonic = le64toh(o->entry.monotonic);
2731 ts.realtime = le64toh(o->entry.realtime);
2733 n = journal_file_entry_n_items(o);
2734 /* alloca() can't take 0, hence let's allocate at least one */
2735 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2737 for (i = 0; i < n; i++) {
/* Read offset and hash of item i while o still points at the entry; o is
 * overwritten with the data object right below. */
2744 q = le64toh(o->entry.items[i].object_offset);
2745 le_hash = o->entry.items[i].hash;
2747 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
/* The hash stored in the entry item must match the data object's hash; both
 * are compared in their stored byte order. */
2751 if (le_hash != o->data.hash)
/* Payload length = object size minus the header that precedes the payload. */
2754 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2757 /* We hit the limit on 32bit machines */
2758 if ((uint64_t) t != l)
2761 if (o->object.flags & OBJECT_COMPRESSED) {
/* Decompress into the source file's reusable compress_buffer. */
2765 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0))
2768 data = from->compress_buffer;
/* Presumably returned for compressed objects when decompression support is
 * not compiled in -- confirm against the surrounding preprocessor guards. */
2771 return -EPROTONOSUPPORT;
/* Uncompressed payloads are used in place. */
2774 data = o->data.payload;
/* u receives the data object in "to", h its offset there. */
2776 r = journal_file_append_data(to, data, l, &u, &h);
2780 xor_hash ^= le64toh(u->data.hash);
2781 items[i].object_offset = htole64(h);
2782 items[i].hash = u->data.hash;
/* Point o back at the source entry, since it was overwritten above. */
2784 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2789 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
/* Completes and sanitizes the size limits in m. A field equal to
 * (uint64_t) -1 counts as unset and is derived, where possible, from the
 * size of the file system that fd lives on. All four values are then
 * page-aligned and clamped where the code below says so, and the final
 * values are logged at debug level. */
2792 void journal_default_metrics(JournalMetrics *m, int fd) {
/* Stays 0 if the file system size cannot be determined. */
2793 uint64_t fs_size = 0;
2795 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
/* NOTE(review): the product is computed in the types of the struct statvfs
 * members before being stored into the 64-bit fs_size; where those members
 * are 32 bits wide it could wrap for large file systems -- confirm. */
2800 if (fstatvfs(fd, &ss) >= 0)
2801 fs_size = ss.f_frsize * ss.f_blocks;
/* max_use unset: 10% of the file system, clamped to the range
 * [DEFAULT_MAX_USE_LOWER, DEFAULT_MAX_USE_UPPER]. */
2803 if (m->max_use == (uint64_t) -1) {
2806 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2808 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2809 m->max_use = DEFAULT_MAX_USE_UPPER;
2811 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2812 m->max_use = DEFAULT_MAX_USE_LOWER;
/* Presumably the fallback when fs_size is unknown (0). */
2814 m->max_use = DEFAULT_MAX_USE_LOWER;
/* max_use given explicitly: page-align it and enforce a floor of two
 * minimum-size files. */
2816 m->max_use = PAGE_ALIGN(m->max_use);
2818 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2819 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
/* max_size unset: one eighth of max_use, capped at DEFAULT_MAX_SIZE_UPPER. */
2822 if (m->max_size == (uint64_t) -1) {
2823 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2825 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2826 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2828 m->max_size = PAGE_ALIGN(m->max_size);
2830 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2831 m->max_size = JOURNAL_FILE_SIZE_MIN;
/* max_use is raised if needed so that it covers two files of max_size. */
2833 if (m->max_size*2 > m->max_use)
2834 m->max_use = m->max_size*2;
/* min_size unset: use the minimum journal file size. */
2836 if (m->min_size == (uint64_t) -1)
2837 m->min_size = JOURNAL_FILE_SIZE_MIN;
2839 m->min_size = PAGE_ALIGN(m->min_size);
2841 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2842 m->min_size = JOURNAL_FILE_SIZE_MIN;
/* max_size is raised to min_size if it would otherwise be smaller. */
2844 if (m->min_size > m->max_size)
2845 m->max_size = m->min_size;
/* keep_free unset: 15% of the file system, capped at
 * DEFAULT_KEEP_FREE_UPPER. */
2848 if (m->keep_free == (uint64_t) -1) {
2851 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2853 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2854 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
/* Presumably the fallback when fs_size is unknown (0). */
2857 m->keep_free = DEFAULT_KEEP_FREE;
2860 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2861 format_bytes(a, sizeof(a), m->max_use),
2862 format_bytes(b, sizeof(b), m->max_size),
2863 format_bytes(c, sizeof(c), m->min_size),
2864 format_bytes(d, sizeof(d), m->keep_free));
/* Reports the realtime timestamps of the first and last entry as recorded
 * in the file header: *from is set from head_entry_realtime and *to from
 * tail_entry_realtime. A header value of 0 is handled as a separate case
 * for each of the two; presumably it means "no such entry" -- confirm. */
2867 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
/* Comparing against 0 needs no byte swap: zero is zero in either order. */
2872 if (f->header->head_entry_realtime == 0)
2875 *from = le64toh(f->header->head_entry_realtime);
2879 if (f->header->tail_entry_realtime == 0)
2882 *to = le64toh(f->header->tail_entry_realtime);
/* Reports the monotonic timestamps of the first and last entry belonging to
 * boot_id. Both are read from the entry list of the _BOOT_ID= data object:
 * *from from the entry at entry_offset, *to from the entry at index
 * n_entries - 1. */
2888 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
/* p receives the data object's offset so it can be loaded again below. */
2896 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
/* n_entries is unsigned, so this condition is true only when it is zero. */
2900 if (le64toh(o->data.n_entries) <= 0)
/* o is reused: after this call it refers to the first entry, not the data
 * object. */
2904 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2908 *from = le64toh(o->entry.monotonic);
/* Load the data object again, since o was overwritten above. */
2912 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* Fetch the last entry of the list (index n_entries - 1). */
2916 r = generic_array_get_plus_one(f,
2917 le64toh(o->data.entry_offset),
2918 le64toh(o->data.entry_array_offset),
2919 le64toh(o->data.n_entries)-1,
2924 *to = le64toh(o->entry.monotonic);
2930 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2933 /* If we gained new header fields we gained new features,
2934 * hence suggest a rotation */
2935 if (le64toh(f->header->header_size) < sizeof(Header)) {
2936 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2940 /* Let's check if the hash tables grew over a certain fill
2941 * level (75%, borrowing this value from Java's hash table
2942 * implementation), and if so suggest a rotation. To calculate
2943 * the fill level we need the n_data field, which only exists
2944 * in newer versions. */
2946 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2947 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2948 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2950 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2951 le64toh(f->header->n_data),
2952 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2953 (unsigned long long) f->last_stat.st_size,
2954 f->last_stat.st_size / le64toh(f->header->n_data));
2958 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2959 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2960 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2962 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2963 le64toh(f->header->n_fields),
2964 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2968 /* Are the data objects properly indexed by field objects? */
2969 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2970 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2971 le64toh(f->header->n_data) > 0 &&
2972 le64toh(f->header->n_fields) == 0)
2975 if (max_file_usec > 0) {
2978 h = le64toh(f->header->head_entry_realtime);
2979 t = now(CLOCK_REALTIME);
2981 if (h > 0 && t > h + max_file_usec)