1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
31 #include <attr/xattr.h>
34 #include "journal-def.h"
35 #include "journal-file.h"
36 #include "journal-authenticate.h"
41 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
42 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
44 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
46 /* This is the minimum journal file size */
47 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
49 /* These are the lower and upper bounds if we deduce the max_use value
50 * from the file system size */
51 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
52 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
54 /* This is the upper bound if we deduce max_size from max_use */
55 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
57 /* This is the upper bound if we deduce the keep_free value from the file system size */
59 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
61 /* This is the keep_free value when we can't determine the file system size */
63 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
65 /* n_data was the first entry we added after the initial file format design */
66 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
68 /* How many entries to keep in the entry array chain cache at max */
69 #define CHAIN_CACHE_MAX 20
71 /* How much to increase the journal file size at once each time we allocate something new. */
72 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
74 static int journal_file_set_online(JournalFile *f) {
80 if (!(f->fd >= 0 && f->header))
83 switch(f->header->state) {
88 f->header->state = STATE_ONLINE;
97 int journal_file_set_offline(JournalFile *f) {
103 if (!(f->fd >= 0 && f->header))
106 if (f->header->state != STATE_ONLINE)
111 f->header->state = STATE_OFFLINE;
/* Shuts down a journal file object. As far as this extract shows, the steps
 * run in this order: append a final tag (only when sealing is enabled and the
 * file is writable), hand the fd to mmap_cache_close_fd(), mark the header
 * offline via journal_file_set_offline(), then release the header mapping,
 * the file descriptor, the mmap cache reference, the chain cache, the
 * compression buffer, either the f->fss_file mapping or f->fsprg_state, and
 * the HMAC handle.
 *
 * NOTE(review): lines are missing from this extract (among them the
 * conditions guarding several of the release calls), so the exact guards
 * cannot be confirmed from here. */
118 void journal_file_close(JournalFile *f) {
122 /* Write the final tag */
123 if (f->seal && f->writable)
124 journal_file_append_tag(f);
127 /* Sync everything to disk, before we mark the file offline */
128 if (f->mmap && f->fd >= 0)
129 mmap_cache_close_fd(f->mmap, f->fd);
131 journal_file_set_offline(f);
134 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
137 close_nointr_nofail(f->fd);
142 mmap_cache_unref(f->mmap);
144 hashmap_free_free(f->chain_cache);
147 free(f->compress_buffer);
152 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
153 else if (f->fsprg_state)
154 free(f->fsprg_state);
159 gcry_md_close(f->hmac);
165 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
173 memcpy(h.signature, HEADER_SIGNATURE, 8);
174 h.header_size = htole64(ALIGN64(sizeof(h)));
176 h.incompatible_flags =
177 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
180 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
182 r = sd_id128_randomize(&h.file_id);
187 h.seqnum_id = template->header->seqnum_id;
188 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
190 h.seqnum_id = h.file_id;
192 k = pwrite(f->fd, &h, sizeof(h), 0);
202 static int journal_file_refresh_header(JournalFile *f) {
208 r = sd_id128_get_machine(&f->header->machine_id);
212 r = sd_id128_get_boot(&boot_id);
216 if (sd_id128_equal(boot_id, f->header->boot_id))
217 f->tail_entry_monotonic_valid = true;
219 f->header->boot_id = boot_id;
221 journal_file_set_online(f);
223 /* Sync the online state to disk */
229 static int journal_file_verify_header(JournalFile *f) {
232 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
235 /* In both read and write mode we refuse to open files with
236 * incompatible flags we don't know */
238 if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
239 return -EPROTONOSUPPORT;
241 if (f->header->incompatible_flags != 0)
242 return -EPROTONOSUPPORT;
245 /* When open for writing we refuse to open files with
246 * compatible flags we don't know, too */
249 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
250 return -EPROTONOSUPPORT;
252 if (f->header->compatible_flags != 0)
253 return -EPROTONOSUPPORT;
257 if (f->header->state >= _STATE_MAX)
260 /* The first addition was n_data, so check that we are at least this large */
261 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
264 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
267 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
270 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
273 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
274 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
275 !VALID64(le64toh(f->header->tail_object_offset)) ||
276 !VALID64(le64toh(f->header->entry_array_offset)))
279 if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
280 le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
281 le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
282 le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
287 sd_id128_t machine_id;
290 r = sd_id128_get_machine(&machine_id);
294 if (!sd_id128_equal(machine_id, f->header->machine_id))
297 state = f->header->state;
299 if (state == STATE_ONLINE) {
300 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
302 } else if (state == STATE_ARCHIVED)
304 else if (state != STATE_OFFLINE) {
305 log_debug("Journal file %s has unknown state %u.", f->path, state);
310 f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
312 f->seal = JOURNAL_HEADER_SEALED(f->header);
317 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
318 uint64_t old_size, new_size, file_size;
323 /* We assume that this file is not sparse, and we know that
324 * for sure, since we always call posix_fallocate(). */
328 le64toh(f->header->header_size) +
329 le64toh(f->header->arena_size);
331 new_size = PAGE_ALIGN(offset + size);
332 if (new_size < le64toh(f->header->header_size))
333 new_size = le64toh(f->header->header_size);
335 if (new_size <= old_size)
338 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
341 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
344 if (fstatvfs(f->fd, &svfs) >= 0) {
347 available = svfs.f_bfree * svfs.f_bsize;
349 if (available >= f->metrics.keep_free)
350 available -= f->metrics.keep_free;
354 if (new_size - old_size > available)
359 /* Note that the glibc fallocate() fallback is very
360 inefficient, hence we try to minimize the allocation area. */
362 r = posix_fallocate(f->fd, old_size, new_size - old_size);
366 /* Increase the file size a bit further than this, so that
367 * we can create larger memory maps to cache */
368 file_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
369 if (file_size > (uint64_t) f->last_stat.st_size) {
370 if (file_size > new_size)
371 ftruncate(f->fd, file_size);
373 if (fstat(f->fd, &f->last_stat) < 0)
377 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
/* Obtains a mapping of the byte range [offset, offset + size) of the journal
 * file through mmap_cache_get() and returns that call's result; the mapped
 * address is delivered through 'ret' by mmap_cache_get().
 *
 * Before mapping, the range is checked against the cached file size in
 * f->last_stat. If it appears to be out of range the fstat() data is
 * refreshed once; if fstat() fails or the range still exceeds the file size,
 * -EADDRNOTAVAIL is returned.
 *
 * NOTE(review): 'offset + size' is evaluated in uint64_t without an overflow
 * check, so a wrapped sum would pass the range test. */
382 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
389 /* Avoid SIGBUS on invalid accesses */
390 if (offset + size > (uint64_t) f->last_stat.st_size) {
391 /* Hmm, out of range? Let's refresh the fstat() data
392 * first, before we trust that check. */
394 if (fstat(f->fd, &f->last_stat) < 0 ||
395 offset + size > (uint64_t) f->last_stat.st_size)
396 return -EADDRNOTAVAIL;
399 return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
/* Returns the smallest size an object of o's type may have: the size of the
 * type-specific object structure looked up in a table indexed by object
 * type. Types beyond the end of the table, and types without a table entry
 * (value 0), fall back to sizeof(ObjectHeader). */
402 static uint64_t minimum_header_size(Object *o) {
/* Indexed by object type; slots not listed here are zero-initialized */
404 static const uint64_t table[] = {
405 [OBJECT_DATA] = sizeof(DataObject),
406 [OBJECT_FIELD] = sizeof(FieldObject),
407 [OBJECT_ENTRY] = sizeof(EntryObject),
408 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
409 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
410 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
411 [OBJECT_TAG] = sizeof(TagObject),
414 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
415 return sizeof(ObjectHeader);
417 return table[o->object.type];
/* Maps the object located at 'offset' and validates its header.
 *
 * Only sizeof(ObjectHeader) bytes are mapped at first. The header is then
 * checked: the size must be at least sizeof(ObjectHeader) and at least
 * minimum_header_size() for its type, the type must be greater than
 * OBJECT_UNUSED, and, when 'type' > 0, the object's type must equal 'type'.
 * If the object is larger than its header, the full object is mapped in a
 * second step, using the object's own type as the mmap cache context.
 *
 * NOTE(review): the statements executed when a check fails, and the final
 * assignment to 'ret', are not visible in this extract. */
420 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
430 /* Objects may only be located at multiples of 64 bit */
431 if (!VALID64(offset))
434 /* One context for each type, plus one catch-all for the rest */
435 context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
437 r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
442 s = le64toh(o->object.size);
444 if (s < sizeof(ObjectHeader))
447 if (o->object.type <= OBJECT_UNUSED)
450 if (s < minimum_header_size(o))
453 if (type > 0 && o->object.type != type)
456 if (s > sizeof(ObjectHeader)) {
457 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
468 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
473 r = le64toh(f->header->tail_entry_seqnum) + 1;
476 /* If an external seqnum counter was passed, we update
477 * both the local and the external one, and set it to
478 * the maximum of both */
486 f->header->tail_entry_seqnum = htole64(r);
488 if (f->header->head_entry_seqnum == 0)
489 f->header->head_entry_seqnum = htole64(r);
/* Appends a new, empty object of the given type and size to the journal file.
 *
 * 'type' must be a valid object type and 'size' at least sizeof(ObjectHeader)
 * (both asserted). The file is switched online first. The new object's
 * position is derived from the header: either header_size, or the offset of
 * the current tail object advanced by that object's 64-bit-aligned size.
 * Space is then reserved with journal_file_allocate(), the range is mapped,
 * the object's type and size fields are filled in, and the header's
 * tail_object_offset and n_objects are updated.
 *
 * NOTE(review): the condition choosing between the two position
 * computations, the error handling after each call, and the writes to
 * 'ret'/'offset' are not visible in this extract. */
494 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
501 assert(type > 0 && type < _OBJECT_TYPE_MAX);
502 assert(size >= sizeof(ObjectHeader));
506 r = journal_file_set_online(f);
510 p = le64toh(f->header->tail_object_offset);
512 p = le64toh(f->header->header_size);
514 r = journal_file_move_to_object(f, -1, p, &tail);
518 p += ALIGN64(le64toh(tail->object.size));
521 r = journal_file_allocate(f, p, size);
525 r = journal_file_move_to(f, type, false, p, size, &t);
532 o->object.type = type;
533 o->object.size = htole64(size);
535 f->header->tail_object_offset = htole64(p);
536 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
/* Creates the data hash table object of a journal file.
 *
 * The table size in bytes is derived from f->metrics.max_size (see the
 * comment below) and is never smaller than DEFAULT_DATA_HASH_TABLE_SIZE. An
 * OBJECT_DATA_HASH_TABLE object with room for that many bytes of items is
 * appended, its items are zeroed, and the header records the offset of the
 * items array (not of the object header) together with the table size in
 * bytes.
 *
 * NOTE(review): the remaining arguments of the append call and its error
 * handling are not visible in this extract. */
544 static int journal_file_setup_data_hash_table(JournalFile *f) {
551 /* We estimate that we need 1 hash table entry per 768 bytes of
552 journal file and we want to make sure we never get beyond
553 75% fill level. Calculate the hash table size for the
554 maximum file size based on these metrics. */
556 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
557 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
558 s = DEFAULT_DATA_HASH_TABLE_SIZE;
560 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
562 r = journal_file_append_object(f,
563 OBJECT_DATA_HASH_TABLE,
564 offsetof(Object, hash_table.items) + s,
569 memset(o->hash_table.items, 0, s);
571 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
572 f->header->data_hash_table_size = htole64(s);
577 static int journal_file_setup_field_hash_table(JournalFile *f) {
584 /* We use a fixed size hash table for the fields, as their
585 * number should grow only very slowly */
587 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
588 r = journal_file_append_object(f,
589 OBJECT_FIELD_HASH_TABLE,
590 offsetof(Object, hash_table.items) + s,
595 memset(o->hash_table.items, 0, s);
597 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
598 f->header->field_hash_table_size = htole64(s);
603 static int journal_file_map_data_hash_table(JournalFile *f) {
610 p = le64toh(f->header->data_hash_table_offset);
611 s = le64toh(f->header->data_hash_table_size);
613 r = journal_file_move_to(f,
614 OBJECT_DATA_HASH_TABLE,
621 f->data_hash_table = t;
625 static int journal_file_map_field_hash_table(JournalFile *f) {
632 p = le64toh(f->header->field_hash_table_offset);
633 s = le64toh(f->header->field_hash_table_size);
635 r = journal_file_move_to(f,
636 OBJECT_FIELD_HASH_TABLE,
643 f->field_hash_table = t;
647 static int journal_file_link_field(
660 if (o->object.type != OBJECT_FIELD)
663 /* This might alter the window we are looking at */
665 o->field.next_hash_offset = o->field.head_data_offset = 0;
667 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
668 p = le64toh(f->field_hash_table[h].tail_hash_offset);
670 f->field_hash_table[h].head_hash_offset = htole64(offset);
672 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
676 o->field.next_hash_offset = htole64(offset);
679 f->field_hash_table[h].tail_hash_offset = htole64(offset);
681 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
682 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
687 static int journal_file_link_data(
700 if (o->object.type != OBJECT_DATA)
703 /* This might alter the window we are looking at */
705 o->data.next_hash_offset = o->data.next_field_offset = 0;
706 o->data.entry_offset = o->data.entry_array_offset = 0;
707 o->data.n_entries = 0;
709 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
710 p = le64toh(f->data_hash_table[h].tail_hash_offset);
712 /* Only entry in the hash table is easy */
713 f->data_hash_table[h].head_hash_offset = htole64(offset);
715 /* Move back to the previous data object, to patch in its next_hash_offset */
718 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
722 o->data.next_hash_offset = htole64(offset);
725 f->data_hash_table[h].tail_hash_offset = htole64(offset);
727 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
728 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
733 int journal_file_find_field_object_with_hash(
735 const void *field, uint64_t size, uint64_t hash,
736 Object **ret, uint64_t *offset) {
738 uint64_t p, osize, h;
742 assert(field && size > 0);
744 osize = offsetof(Object, field.payload) + size;
746 if (f->header->field_hash_table_size == 0)
749 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
750 p = le64toh(f->field_hash_table[h].head_hash_offset);
755 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
759 if (le64toh(o->field.hash) == hash &&
760 le64toh(o->object.size) == osize &&
761 memcmp(o->field.payload, field, size) == 0) {
771 p = le64toh(o->field.next_hash_offset);
777 int journal_file_find_field_object(
779 const void *field, uint64_t size,
780 Object **ret, uint64_t *offset) {
785 assert(field && size > 0);
787 hash = hash64(field, size);
789 return journal_file_find_field_object_with_hash(f,
794 int journal_file_find_data_object_with_hash(
796 const void *data, uint64_t size, uint64_t hash,
797 Object **ret, uint64_t *offset) {
799 uint64_t p, osize, h;
803 assert(data || size == 0);
805 osize = offsetof(Object, data.payload) + size;
807 if (f->header->data_hash_table_size == 0)
810 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
811 p = le64toh(f->data_hash_table[h].head_hash_offset);
816 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
820 if (le64toh(o->data.hash) != hash)
823 if (o->object.flags & OBJECT_COMPRESSED) {
827 l = le64toh(o->object.size);
828 if (l <= offsetof(Object, data.payload))
831 l -= offsetof(Object, data.payload);
833 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0))
837 memcmp(f->compress_buffer, data, size) == 0) {
848 return -EPROTONOSUPPORT;
851 } else if (le64toh(o->object.size) == osize &&
852 memcmp(o->data.payload, data, size) == 0) {
864 p = le64toh(o->data.next_hash_offset);
870 int journal_file_find_data_object(
872 const void *data, uint64_t size,
873 Object **ret, uint64_t *offset) {
878 assert(data || size == 0);
880 hash = hash64(data, size);
882 return journal_file_find_data_object_with_hash(f,
887 static int journal_file_append_field(
889 const void *field, uint64_t size,
890 Object **ret, uint64_t *offset) {
898 assert(field && size > 0);
900 hash = hash64(field, size);
902 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
916 osize = offsetof(Object, field.payload) + size;
917 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
921 o->field.hash = htole64(hash);
922 memcpy(o->field.payload, field, size);
924 r = journal_file_link_field(f, o, p, hash);
928 /* The linking might have altered the window, so let's
929 * refresh our pointer */
930 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
935 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
949 static int journal_file_append_data(
951 const void *data, uint64_t size,
952 Object **ret, uint64_t *offset) {
958 bool compressed = false;
962 assert(data || size == 0);
964 hash = hash64(data, size);
966 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
980 osize = offsetof(Object, data.payload) + size;
981 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
985 o->data.hash = htole64(hash);
989 size >= COMPRESSION_SIZE_THRESHOLD) {
992 compressed = compress_blob(data, size, o->data.payload, &rsize);
995 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
996 o->object.flags |= OBJECT_COMPRESSED;
998 log_debug("Compressed data object %"PRIu64" -> %"PRIu64, size, rsize);
1003 if (!compressed && size > 0)
1004 memcpy(o->data.payload, data, size);
1006 r = journal_file_link_data(f, o, p, hash);
1010 /* The linking might have altered the window, so let's
1011 * refresh our pointer */
1012 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1016 eq = memchr(data, '=', size);
1017 if (eq && eq > data) {
1021 /* Create field object ... */
1022 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1026 /* ... and link it in. */
1027 o->data.next_field_offset = fo->field.head_data_offset;
1028 fo->field.head_data_offset = le64toh(p);
1032 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
/* Returns the number of EntryItem slots in entry object 'o', computed from
 * the object's size minus the offset of entry.items, divided by
 * sizeof(EntryItem).
 * NOTE(review): what is returned when 'o' is not an OBJECT_ENTRY is not
 * visible in this extract. */
1046 uint64_t journal_file_entry_n_items(Object *o) {
1049 if (o->object.type != OBJECT_ENTRY)
1052 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
/* Returns the number of 64-bit slots in entry array object 'o', computed
 * from the object's size minus the offset of entry_array.items, divided by
 * sizeof(uint64_t).
 * NOTE(review): what is returned when 'o' is not an OBJECT_ENTRY_ARRAY is
 * not visible in this extract. */
1055 uint64_t journal_file_entry_array_n_items(Object *o) {
1058 if (o->object.type != OBJECT_ENTRY_ARRAY)
1061 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
/* Returns the number of HashItem slots in hash table object 'o', computed
 * from the object's size minus the offset of hash_table.items, divided by
 * sizeof(HashItem). Both data and field hash tables are accepted.
 * NOTE(review): what is returned for any other object type is not visible
 * in this extract. */
1064 uint64_t journal_file_hash_table_n_items(Object *o) {
1067 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1068 o->object.type != OBJECT_FIELD_HASH_TABLE)
1071 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1074 static int link_entry_into_array(JournalFile *f,
1079 uint64_t n = 0, ap = 0, q, i, a, hidx;
1087 a = le64toh(*first);
1088 i = hidx = le64toh(*idx);
1091 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1095 n = journal_file_entry_array_n_items(o);
1097 o->entry_array.items[i] = htole64(p);
1098 *idx = htole64(hidx + 1);
1104 a = le64toh(o->entry_array.next_entry_array_offset);
1115 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1116 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1122 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1127 o->entry_array.items[i] = htole64(p);
1130 *first = htole64(q);
1132 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1136 o->entry_array.next_entry_array_offset = htole64(q);
1139 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1140 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1142 *idx = htole64(hidx + 1);
1147 static int link_entry_into_array_plus_one(JournalFile *f,
1162 *extra = htole64(p);
1166 i = htole64(le64toh(*idx) - 1);
1167 r = link_entry_into_array(f, first, &i, p);
1172 *idx = htole64(le64toh(*idx) + 1);
1176 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1183 p = le64toh(o->entry.items[i].object_offset);
1187 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1191 return link_entry_into_array_plus_one(f,
1192 &o->data.entry_offset,
1193 &o->data.entry_array_offset,
1198 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1206 if (o->object.type != OBJECT_ENTRY)
1209 __sync_synchronize();
1211 /* Link up the entry itself */
1212 r = link_entry_into_array(f,
1213 &f->header->entry_array_offset,
1214 &f->header->n_entries,
1219 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1221 if (f->header->head_entry_realtime == 0)
1222 f->header->head_entry_realtime = o->entry.realtime;
1224 f->header->tail_entry_realtime = o->entry.realtime;
1225 f->header->tail_entry_monotonic = o->entry.monotonic;
1227 f->tail_entry_monotonic_valid = true;
1229 /* Link up the items */
1230 n = journal_file_entry_n_items(o);
1231 for (i = 0; i < n; i++) {
1232 r = journal_file_link_entry_item(f, o, offset, i);
1240 static int journal_file_append_entry_internal(
1242 const dual_timestamp *ts,
1244 const EntryItem items[], unsigned n_items,
1246 Object **ret, uint64_t *offset) {
1253 assert(items || n_items == 0);
1256 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1258 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1262 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1263 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1264 o->entry.realtime = htole64(ts->realtime);
1265 o->entry.monotonic = htole64(ts->monotonic);
1266 o->entry.xor_hash = htole64(xor_hash);
1267 o->entry.boot_id = f->header->boot_id;
1270 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1275 r = journal_file_link_entry(f, o, np);
/* Notifies inotify watchers that the journal file changed, by truncating the
 * file to the size recorded in f->last_stat. A failing ftruncate() is only
 * logged; nothing is reported to the caller. */
1288 void journal_file_post_change(JournalFile *f) {
1291 /* inotify does not deliver IN_MODIFY events for file
1292 * accesses done via mmap(). After each change we hence
1293 * truncate the journal file to its current size, which
1294 * triggers IN_MODIFY. */
/* Full memory barrier before the truncation */
1296 __sync_synchronize();
1298 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1299 log_error("Failed to truncate file to its own size: %m");
/* Comparison callback for sorting EntryItem elements (it is passed to
 * qsort_safe() in journal_file_append_entry()). Items are compared by their
 * object_offset, decoded from little-endian.
 * NOTE(review): the return statements are not visible in this extract. */
1302 static int entry_item_cmp(const void *_a, const void *_b) {
1303 const EntryItem *a = _a, *b = _b;
1305 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1307 if (le64toh(a->object_offset) > le64toh(b->object_offset))
/* Appends one journal entry built from 'n_iovec' data fields.
 *
 * Each iovec is stored (or found) as a data object through
 * journal_file_append_data(); its offset and hash are collected into an
 * EntryItem array while the hashes are XORed together. The items are sorted
 * by on-disk offset and handed, with the timestamp and the XOR hash, to
 * journal_file_append_entry_internal(). Finally journal_file_post_change()
 * is called.
 *
 * Before any data is written, the entry's monotonic timestamp is compared
 * with the header's tail_entry_monotonic (when that value is marked valid),
 * and journal_file_maybe_append_tag() is invoked with the realtime
 * timestamp.
 *
 * NOTE(review): the condition under which the local timestamp '_ts' is used,
 * the outcome of the monotonic check, and the error handling are not visible
 * in this extract.
 * NOTE(review): the item array lives on the stack and is sized by the
 * caller-supplied 'n_iovec'. */
1312 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1316 uint64_t xor_hash = 0;
1317 struct dual_timestamp _ts;
1320 assert(iovec || n_iovec == 0);
1323 dual_timestamp_get(&_ts);
1327 if (f->tail_entry_monotonic_valid &&
1328 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1332 r = journal_file_maybe_append_tag(f, ts->realtime);
1337 /* alloca() can't take 0, hence let's allocate at least one */
1338 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1340 for (i = 0; i < n_iovec; i++) {
1344 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1348 xor_hash ^= le64toh(o->data.hash);
1349 items[i].object_offset = htole64(p);
1350 items[i].hash = o->data.hash;
1353 /* Order by the position on disk, in order to improve seek
1354 * times for rotating media. */
1355 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1357 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1359 journal_file_post_change(f);
/* One cached position inside an entry array chain. Items are stored in a
 * hashmap keyed by a pointer to their 'first' member (see the hashmap_put()
 * call in chain_cache_put()). */
1364 typedef struct ChainCacheItem {
1365 uint64_t first; /* the array at the beginning of the chain */
1366 uint64_t array; /* the cached array */
1367 uint64_t begin; /* the first item in the cached array */
1368 uint64_t total; /* the total number of items in all arrays before this one in the chain */
1369 uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
1372 static void chain_cache_put(
1379 uint64_t last_index) {
1382 /* If the chain item to cache for this chain is the
1383 * first one it's not worth caching anything */
1387 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1388 ci = hashmap_steal_first(h);
1390 ci = new(ChainCacheItem, 1);
1397 if (hashmap_put(h, &ci->first, ci) < 0) {
1402 assert(ci->first == first);
1407 ci->last_index = last_index;
1410 static int generic_array_get(
1414 Object **ret, uint64_t *offset) {
1417 uint64_t p = 0, a, t = 0;
1425 /* Try the chain cache first */
1426 ci = hashmap_get(f->chain_cache, &first);
1427 if (ci && i > ci->total) {
1436 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1440 k = journal_file_entry_array_n_items(o);
1442 p = le64toh(o->entry_array.items[i]);
1448 a = le64toh(o->entry_array.next_entry_array_offset);
1454 /* Let's cache this item for the next invocation */
1455 chain_cache_put(f->chain_cache, ci, first, a, o->entry_array.items[0], t, i);
1457 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1470 static int generic_array_get_plus_one(
1475 Object **ret, uint64_t *offset) {
1484 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1497 return generic_array_get(f, first, i-1, ret, offset);
1506 static int generic_array_bisect(
1511 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1512 direction_t direction,
1517 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1518 bool subtract_one = false;
1519 Object *o, *array = NULL;
1524 assert(test_object);
1526 /* Start with the first array in the chain */
1529 ci = hashmap_get(f->chain_cache, &first);
1530 if (ci && n > ci->total) {
1531 /* Ah, we have iterated this bisection array chain
1532 * previously! Let's see if we can skip ahead in the
1533 * chain, as far as the last time. But we can't jump
1534 * backwards in the chain, so let's check that first. */
1537 r = test_object(f, ci->begin, needle);
1541 if (r == TEST_LEFT) {
1542 /* OK, what we are looking for is right of the
1543 * beginning of this EntryArray, so let's jump
1544 * straight to the previously cached array in the chain. */
1550 last_index = ci->last_index;
1555 uint64_t left, right, k, lp;
1557 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1561 k = journal_file_entry_array_n_items(array);
1567 lp = p = le64toh(array->entry_array.items[i]);
1571 r = test_object(f, p, needle);
1575 if (r == TEST_FOUND)
1576 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1578 if (r == TEST_RIGHT) {
1582 if (last_index != (uint64_t) -1) {
1583 assert(last_index <= right);
1585 /* If we cached the last index we
1586 * looked at, let's try not to jump
1587 * too wildly around and see if we can
1588 * limit the range to look at early to
1589 * the immediate neighbors of the last
1590 * index we looked at. */
1592 if (last_index > 0) {
1593 uint64_t x = last_index - 1;
1595 p = le64toh(array->entry_array.items[x]);
1599 r = test_object(f, p, needle);
1603 if (r == TEST_FOUND)
1604 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1606 if (r == TEST_RIGHT)
1612 if (last_index < right) {
1613 uint64_t y = last_index + 1;
1615 p = le64toh(array->entry_array.items[y]);
1619 r = test_object(f, p, needle);
1623 if (r == TEST_FOUND)
1624 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1626 if (r == TEST_RIGHT)
1632 last_index = (uint64_t) -1;
1636 if (left == right) {
1637 if (direction == DIRECTION_UP)
1638 subtract_one = true;
1644 assert(left < right);
1645 i = (left + right) / 2;
1647 p = le64toh(array->entry_array.items[i]);
1651 r = test_object(f, p, needle);
1655 if (r == TEST_FOUND)
1656 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1658 if (r == TEST_RIGHT)
1666 if (direction == DIRECTION_UP) {
1668 subtract_one = true;
1679 last_index = (uint64_t) -1;
1680 a = le64toh(array->entry_array.next_entry_array_offset);
1686 if (subtract_one && t == 0 && i == 0)
1689 /* Let's cache this item for the next invocation */
1690 chain_cache_put(f->chain_cache, ci, first, a, array->entry_array.items[0], t, i + (subtract_one ? -1 : 0));
1692 if (subtract_one && i == 0)
1694 else if (subtract_one)
1695 p = le64toh(array->entry_array.items[i-1]);
1697 p = le64toh(array->entry_array.items[i]);
1699 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1710 *idx = t + i + (subtract_one ? -1 : 0);
1716 static int generic_array_bisect_plus_one(
1722 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1723 direction_t direction,
1729 bool step_back = false;
1733 assert(test_object);
1738 /* This bisects the array in object 'first', but first checks the extra entry */
1740 r = test_object(f, extra, needle);
1744 if (r == TEST_FOUND)
1745 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1747 /* If we are looking with DIRECTION_UP then we need to first
1748 see if there is a matching entry in the actual array, and
1749 return the last one of those. But if there isn't any we need
1750 to return this one. Hence remember this, and return it later. */
1753 step_back = direction == DIRECTION_UP;
1755 if (r == TEST_RIGHT) {
1756 if (direction == DIRECTION_DOWN)
1762 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1764 if (r == 0 && step_back)
1773 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1789 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1795 else if (p < needle)
1801 int journal_file_move_to_entry_by_offset(
1804 direction_t direction,
1808 return generic_array_bisect(f,
1809 le64toh(f->header->entry_array_offset),
1810 le64toh(f->header->n_entries),
1818 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1825 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1829 if (le64toh(o->entry.seqnum) == needle)
1831 else if (le64toh(o->entry.seqnum) < needle)
1837 int journal_file_move_to_entry_by_seqnum(
1840 direction_t direction,
1844 return generic_array_bisect(f,
1845 le64toh(f->header->entry_array_offset),
1846 le64toh(f->header->n_entries),
1853 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1860 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1864 if (le64toh(o->entry.realtime) == needle)
1866 else if (le64toh(o->entry.realtime) < needle)
1872 int journal_file_move_to_entry_by_realtime(
1875 direction_t direction,
1879 return generic_array_bisect(f,
1880 le64toh(f->header->entry_array_offset),
1881 le64toh(f->header->n_entries),
1883 test_object_realtime,
1888 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1895 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1899 if (le64toh(o->entry.monotonic) == needle)
1901 else if (le64toh(o->entry.monotonic) < needle)
1907 static inline int find_data_object_by_boot_id(
1912 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1914 sd_id128_to_string(boot_id, t + 9);
1915 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1918 int journal_file_move_to_entry_by_monotonic(
1922 direction_t direction,
1931 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1937 return generic_array_bisect_plus_one(f,
1938 le64toh(o->data.entry_offset),
1939 le64toh(o->data.entry_array_offset),
1940 le64toh(o->data.n_entries),
1942 test_object_monotonic,
/* Steps from the entry o at offset p to a neighbouring entry of the
 * file-wide entry array in the given direction, and hands the result
 * back through ret and offset via generic_array_get(). */
1947 int journal_file_next_entry(
1949 Object *o, uint64_t p,
1950 direction_t direction,
1951 Object **ret, uint64_t *offset) {
/* Either no current object is passed, or its offset must be non-zero. */
1957 assert(p > 0 || !o);
1959 n = le64toh(f->header->n_entries);
/* Starting index: the first element when iterating downwards, the last
 * one otherwise. Presumably only used when no current object was
 * given; TODO confirm. */
1964 i = direction == DIRECTION_DOWN ? 0 : n - 1;
/* A current object must be an entry object. */
1966 if (o->object.type != OBJECT_ENTRY)
/* Bisect the file-wide entry array; presumably this yields the index
 * of the current entry, from which the next index is derived. */
1969 r = generic_array_bisect(f,
1970 le64toh(f->header->entry_array_offset),
1971 le64toh(f->header->n_entries),
1980 if (direction == DIRECTION_DOWN) {
1993 /* And jump to it */
1994 return generic_array_get(f,
1995 le64toh(f->header->entry_array_offset),
/* Moves a number of entries away from the entry o at offset p within
 * the file-wide entry array and returns the entry found there through
 * ret and offset. The casts of -skip below show that skip is a signed
 * count, so a negative value moves towards lower indexes. */
2000 int journal_file_skip_entry(
2002 Object *o, uint64_t p,
2004 Object **ret, uint64_t *offset) {
/* The starting object must be an entry object. */
2013 if (o->object.type != OBJECT_ENTRY)
/* Bisect the file-wide entry array; presumably this determines the
 * index i of the starting entry. */
2016 r = generic_array_bisect(f,
2017 le64toh(f->header->entry_array_offset),
2018 le64toh(f->header->n_entries),
2027 /* Calculate new index */
/* Backward move: test whether the distance reaches or exceeds the
 * current index before subtracting, since i is unsigned. */
2029 if ((uint64_t) -skip >= i)
2032 i = i - (uint64_t) -skip;
/* Forward move. */
2034 i += (uint64_t) skip;
2036 n = le64toh(f->header->n_entries);
2043 return generic_array_get(f,
2044 le64toh(f->header->entry_array_offset),
/* Like journal_file_next_entry(), but restricted to the entries that
 * reference the data object at data_offset. The entry list walked is
 * the one stored in that data object: its inline first entry offset
 * plus its entry array. */
2049 int journal_file_next_entry_for_data(
2051 Object *o, uint64_t p,
2052 uint64_t data_offset,
2053 direction_t direction,
2054 Object **ret, uint64_t *offset) {
/* Either no current object is passed, or its offset must be non-zero. */
2061 assert(p > 0 || !o);
2063 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
/* Number of entries linked from this data object. */
2067 n = le64toh(d->data.n_entries);
/* Starting index: the first element when iterating downwards, the last
 * one otherwise. */
2072 i = direction == DIRECTION_DOWN ? 0 : n - 1;
/* A current object must be an entry object. */
2074 if (o->object.type != OBJECT_ENTRY)
2077 r = generic_array_bisect_plus_one(f,
2078 le64toh(d->data.entry_offset),
2079 le64toh(d->data.entry_array_offset),
2080 le64toh(d->data.n_entries),
2090 if (direction == DIRECTION_DOWN) {
2104 return generic_array_get_plus_one(f,
2105 le64toh(d->data.entry_offset),
2106 le64toh(d->data.entry_array_offset),
/* Seeks within the entries that reference the data object at
 * data_offset: the data object is loaded and its entry list (inline
 * first entry offset, entry array offset, entry count) is bisected.
 * The result of generic_array_bisect_plus_one() is returned unchanged. */
2111 int journal_file_move_to_entry_by_offset_for_data(
2113 uint64_t data_offset,
2115 direction_t direction,
2116 Object **ret, uint64_t *offset) {
2123 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2127 return generic_array_bisect_plus_one(f,
2128 le64toh(d->data.entry_offset),
2129 le64toh(d->data.entry_array_offset),
2130 le64toh(d->data.n_entries),
/* Seeks by monotonic timestamp among the entries that reference the
 * data object at data_offset. Two entry lists are involved: the list
 * of the "_BOOT_ID=" data object of the requested boot (its offset is
 * kept in b), and the list of the data object at data_offset. */
2137 int journal_file_move_to_entry_by_monotonic_for_data(
2139 uint64_t data_offset,
2142 direction_t direction,
2143 Object **ret, uint64_t *offset) {
2151 /* First, seek by time */
2152 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2158 r = generic_array_bisect_plus_one(f,
2159 le64toh(o->data.entry_offset),
2160 le64toh(o->data.entry_array_offset),
2161 le64toh(o->data.n_entries),
2163 test_object_monotonic,
2169 /* And now, continue seeking until we find an entry that
2170 * exists in both bisection arrays */
/* The data object is looked up again by offset before its entry list
 * is bisected. */
2176 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2180 r = generic_array_bisect_plus_one(f,
2181 le64toh(d->data.entry_offset),
2182 le64toh(d->data.entry_array_offset),
2183 le64toh(d->data.n_entries),
/* Same for the boot ID data object, via its saved offset b. */
2191 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2195 r = generic_array_bisect_plus_one(f,
2196 le64toh(o->data.entry_offset),
2197 le64toh(o->data.entry_array_offset),
2198 le64toh(o->data.n_entries),
/* Seeks within the entries that reference the data object at
 * data_offset: the data object is loaded and its entry list (inline
 * first entry offset, entry array offset, entry count) is bisected.
 * The result of generic_array_bisect_plus_one() is returned unchanged. */
2222 int journal_file_move_to_entry_by_seqnum_for_data(
2224 uint64_t data_offset,
2226 direction_t direction,
2227 Object **ret, uint64_t *offset) {
2234 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2238 return generic_array_bisect_plus_one(f,
2239 le64toh(d->data.entry_offset),
2240 le64toh(d->data.entry_array_offset),
2241 le64toh(d->data.n_entries),
/* Seeks by realtime timestamp within the entries that reference the
 * data object at data_offset: the data object is loaded and its entry
 * list is bisected with test_object_realtime() as the comparison
 * callback. The result of generic_array_bisect_plus_one() is returned
 * unchanged. */
2248 int journal_file_move_to_entry_by_realtime_for_data(
2250 uint64_t data_offset,
2252 direction_t direction,
2253 Object **ret, uint64_t *offset) {
2260 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2264 return generic_array_bisect_plus_one(f,
2265 le64toh(d->data.entry_offset),
2266 le64toh(d->data.entry_array_offset),
2267 le64toh(d->data.n_entries),
2269 test_object_realtime,
/* Debugging aid: prints the file header and then a line for each
 * object visited, using printf(). The walk starts right behind the
 * header and compares the current offset against the header's
 * tail_object_offset. "File corrupt" is logged through log_error(),
 * presumably when an object cannot be loaded. */
2274 void journal_file_dump(JournalFile *f) {
2281 journal_file_print_header(f);
/* The first object is located directly after the header. */
2283 p = le64toh(f->header->header_size);
/* -1 as the type argument: presumably "accept any object type"; TODO
 * confirm in journal_file_move_to_object(). */
2285 r = journal_file_move_to_object(f, -1, p, &o);
2289 switch (o->object.type) {
2292 printf("Type: OBJECT_UNUSED\n");
2296 printf("Type: OBJECT_DATA\n");
2300 printf("Type: OBJECT_FIELD\n");
2304 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2305 le64toh(o->entry.seqnum),
2306 le64toh(o->entry.monotonic),
2307 le64toh(o->entry.realtime));
2310 case OBJECT_FIELD_HASH_TABLE:
2311 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2314 case OBJECT_DATA_HASH_TABLE:
2315 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2318 case OBJECT_ENTRY_ARRAY:
2319 printf("Type: OBJECT_ENTRY_ARRAY\n");
2323 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2324 le64toh(o->tag.seqnum),
2325 le64toh(o->tag.epoch));
2329 printf("Type: unknown (%u)\n", o->object.type);
2333 if (o->object.flags & OBJECT_COMPRESSED)
2334 printf("Flags: COMPRESSED\n");
/* Check whether the object just printed is the tail object. */
2336 if (p == le64toh(f->header->tail_object_offset))
/* Advance by the object size rounded up with ALIGN64(). */
2339 p = p + ALIGN64(le64toh(o->object.size));
2344 log_error("File corrupt");
/* Wrapper around format_timestamp() that formats t into buf (of size
 * l). NOTE(review): judging by the name, a fallback for a failed
 * conversion is presumably applied to x before returning; that part is
 * not visible here, confirm. */
2347 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2350 x = format_timestamp(buf, l, t);
/* Prints a human-readable summary of the journal file header with
 * printf(): path, IDs, state, compatible and incompatible flags, sizes,
 * hash table sizes (in items), whether rotation is suggested, head and
 * tail sequence numbers and timestamps, and object counters. Counters
 * that were added to the header later are only printed when
 * JOURNAL_HEADER_CONTAINS() reports them as present. Finally the disk
 * usage is printed if fstat() on the file descriptor succeeds. */
2356 void journal_file_print_header(JournalFile *f) {
/* Scratch buffers: a..d for the four 128-bit IDs (32 characters plus
 * NUL each), x..z for the formatted timestamps. */
2357 char a[33], b[33], c[33], d[33];
2358 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2360 char bytes[FORMAT_BYTES_MAX];
2364 printf("File Path: %s\n"
2368 "Sequential Number ID: %s\n"
2370 "Compatible Flags:%s%s\n"
2371 "Incompatible Flags:%s%s\n"
2372 "Header size: %"PRIu64"\n"
2373 "Arena size: %"PRIu64"\n"
2374 "Data Hash Table Size: %"PRIu64"\n"
2375 "Field Hash Table Size: %"PRIu64"\n"
2376 "Rotate Suggested: %s\n"
2377 "Head Sequential Number: %"PRIu64"\n"
2378 "Tail Sequential Number: %"PRIu64"\n"
2379 "Head Realtime Timestamp: %s\n"
2380 "Tail Realtime Timestamp: %s\n"
2381 "Tail Monotonic Timestamp: %s\n"
2382 "Objects: %"PRIu64"\n"
2383 "Entry Objects: %"PRIu64"\n",
2385 sd_id128_to_string(f->header->file_id, a),
2386 sd_id128_to_string(f->header->machine_id, b),
2387 sd_id128_to_string(f->header->boot_id, c),
2388 sd_id128_to_string(f->header->seqnum_id, d),
2389 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2390 f->header->state == STATE_ONLINE ? "ONLINE" :
2391 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
/* For each flag word: name the one known flag, and print " ???" if any
 * other bit is set. */
2392 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2393 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2394 JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2395 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2396 le64toh(f->header->header_size),
2397 le64toh(f->header->arena_size),
/* Hash table sizes are stored in bytes; print them as item counts. */
2398 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2399 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2400 yes_no(journal_file_rotate_suggested(f, 0)),
2401 le64toh(f->header->head_entry_seqnum),
2402 le64toh(f->header->tail_entry_seqnum),
2403 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2404 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2405 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2406 le64toh(f->header->n_objects),
2407 le64toh(f->header->n_entries));
/* Fill level = number of objects divided by the number of hash table
 * items, as a percentage. */
2409 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2410 printf("Data Objects: %"PRIu64"\n"
2411 "Data Hash Table Fill: %.1f%%\n",
2412 le64toh(f->header->n_data),
2413 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2415 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2416 printf("Field Objects: %"PRIu64"\n"
2417 "Field Hash Table Fill: %.1f%%\n",
2418 le64toh(f->header->n_fields),
2419 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2421 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2422 printf("Tag Objects: %"PRIu64"\n",
2423 le64toh(f->header->n_tags));
2424 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2425 printf("Entry Array Objects: %"PRIu64"\n",
2426 le64toh(f->header->n_entry_arrays));
/* Disk usage is computed from the allocated block count, multiplied
 * by 512 bytes per block. */
2428 if (fstat(f->fd, &st) >= 0)
2429 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
/* Opens a journal file and, if the file is empty and opened writable,
 * initializes it as a new journal. The access mode in flags is compared
 * against O_RDONLY and O_RDWR, and the file name against the suffixes
 * ".journal" and ".journal~", before anything is allocated. metrics,
 * mmap_cache and template are optional inputs as far as visible here:
 * metrics are completed via journal_default_metrics() and copied, or
 * else taken from template. The journal_file_close(f) call near the
 * end is presumably the shared failure cleanup; TODO confirm. */
2432 int journal_file_open(
2438 JournalMetrics *metrics,
2439 MMapCache *mmap_cache,
2440 JournalFile *template,
2441 JournalFile **ret) {
2445 bool newly_created = false;
/* Access mode check: O_RDONLY and O_RDWR are the two modes tested. */
2450 if ((flags & O_ACCMODE) != O_RDONLY &&
2451 (flags & O_ACCMODE) != O_RDWR)
/* File name check: ".journal" and ".journal~" are the two suffixes
 * tested. */
2454 if (!endswith(fname, ".journal") &&
2455 !endswith(fname, ".journal~"))
2458 f = new0(JournalFile, 1);
/* Derive the mapping protection and the writable flag from the open
 * flags. */
2466 f->prot = prot_from_flags(flags);
2467 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2469 f->compress = compress;
/* Take a reference on the mmap cache passed in, or create a new one
 * (presumably when none was passed). */
2476 f->mmap = mmap_cache_ref(mmap_cache);
2478 f->mmap = mmap_cache_new();
2485 f->path = strdup(fname);
2491 f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2492 if (!f->chain_cache) {
/* O_CLOEXEC is always added to the caller's flags. */
2497 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2503 if (fstat(f->fd, &f->last_stat) < 0) {
/* An empty file opened for writing is set up as a brand-new journal. */
2508 if (f->last_stat.st_size == 0 && f->writable) {
2512 /* Let's attach the creation time to the journal file,
2513 * so that the vacuuming code knows the age of this
2514 * file even if the file might end up corrupted one
2515 * day... Ideally we'd just use the creation time many
2516 * file systems maintain for each file, but there is
2517 * currently no usable API to query this, hence let's
2518 * emulate this via extended attributes. If extended
2519 * attributes are not supported we'll just skip this,
2520 * and rely solely on mtime/atime/ctime of the file.*/
/* The timestamp is stored little-endian; the result of fsetxattr() is
 * deliberately not checked (best effort, see above). */
2522 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2523 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2527 /* Try to load the FSPRG state, and if we can't, then
2528 * just don't do sealing */
2530 r = journal_file_fss_load(f);
2536 r = journal_file_init_header(f, template);
/* Refresh the cached stat data after the header has been initialized. */
2540 if (fstat(f->fd, &f->last_stat) < 0) {
2545 newly_created = true;
/* Size check against the minimum header size (HEADER_SIZE_MIN). */
2548 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
/* Map the header as a shared mapping at file offset 0. */
2553 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2554 if (f->header == MAP_FAILED) {
/* Pre-existing files get their header verified. */
2560 if (!newly_created) {
2561 r = journal_file_verify_header(f);
/* For pre-existing writable files the sealing state is loaded at this
 * point. */
2567 if (!newly_created && f->writable) {
2568 r = journal_file_fss_load(f);
2576 journal_default_metrics(metrics, f->fd);
2577 f->metrics = *metrics;
2578 } else if (template)
2579 f->metrics = template->metrics;
2581 r = journal_file_refresh_header(f);
2587 r = journal_file_hmac_setup(f);
/* New files additionally get their hash tables and the first tag
 * created. */
2592 if (newly_created) {
2593 r = journal_file_setup_field_hash_table(f);
2597 r = journal_file_setup_data_hash_table(f);
2602 r = journal_file_append_first_tag(f);
2608 r = journal_file_map_field_hash_table(f);
2612 r = journal_file_map_data_hash_table(f);
2620 journal_file_close(f);
/* Rotates a journal file: the current file is renamed to an archive
 * name, its header state is set to STATE_ARCHIVED, and a new file is
 * opened under the original path. The new file is opened with the old
 * file's flags and mode, shares its mmap cache and uses the old file
 * as template; afterwards the old file is closed. The old file's path
 * is tested for the ".journal" suffix and the file for being writable
 * beforehand. */
2625 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2626 _cleanup_free_ char *p = NULL;
2628 JournalFile *old_file, *new_file = NULL;
2636 if (!old_file->writable)
2639 if (!endswith(old_file->path, ".journal"))
/* Archive name: the old path without its 8-character ".journal" suffix
 * (hence l - 8), followed by "@", the seqnum ID, the head entry's
 * sequence number and its realtime timestamp (both as 16 hex digits),
 * and ".journal". */
2642 l = strlen(old_file->path);
2643 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2644 (int) l - 8, old_file->path,
2645 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2646 le64toh((*f)->header->head_entry_seqnum),
2647 le64toh((*f)->header->head_entry_realtime));
2651 r = rename(old_file->path, p);
2655 old_file->header->state = STATE_ARCHIVED;
/* old_file->path still holds the original name, so this opens a file
 * at the location the old one was just renamed away from. */
2657 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2658 journal_file_close(old_file);
/* Wrapper around journal_file_open() that makes one recovery attempt.
 * If the first open fails with one of the error codes listed below,
 * and the file was requested writable, with O_CREAT, and under a
 * ".journal" name, the file is renamed to a ".journal~" name, a warning
 * is logged, and journal_file_open() is called a second and last time
 * with the same arguments. */
2664 int journal_file_open_reliably(
2670 JournalMetrics *metrics,
2671 MMapCache *mmap_cache,
2672 JournalFile *template,
2673 JournalFile **ret) {
2677 _cleanup_free_ char *p = NULL;
2679 r = journal_file_open(fname, flags, mode, compress, seal,
2680 metrics, mmap_cache, template, ret);
/* Only the following results qualify for the rename-and-retry path. */
2681 if (r != -EBADMSG && /* corrupted */
2682 r != -ENODATA && /* truncated */
2683 r != -EHOSTDOWN && /* other machine */
2684 r != -EPROTONOSUPPORT && /* incompatible feature */
2685 r != -EBUSY && /* unclean shutdown */
2686 r != -ESHUTDOWN /* already archived */)
/* Preconditions for replacing the file: write access, permission to
 * create, and a regular ".journal" name. */
2689 if ((flags & O_ACCMODE) == O_RDONLY)
2692 if (!(flags & O_CREAT))
2695 if (!endswith(fname, ".journal"))
2698 /* The file is corrupted. Rotate it away and try it again (but only once) */
/* The backup name embeds the current realtime clock as 16 hex digits
 * and ends in ".journal~". */
2701 if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2703 (unsigned long long) now(CLOCK_REALTIME),
2707 r = rename(fname, p);
2711 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2713 return journal_file_open(fname, flags, mode, compress, seal,
2714 metrics, mmap_cache, template, ret);
/* Copies the entry o, located at offset p in the file from, into the
 * file to. For each item of the entry the referenced data object is
 * loaded from the source file, its hash is compared with the hash
 * stored in the item, the payload is uncompressed if the object is
 * flagged OBJECT_COMPRESSED, and the payload is appended to the
 * destination with journal_file_append_data(). The XOR of the appended
 * data objects' hashes is accumulated in xor_hash. Finally the entry
 * itself is appended with journal_file_append_entry_internal(), using
 * the source entry's monotonic and realtime timestamps; seqnum, ret
 * and offset are passed through to that call. */
2717 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2719 uint64_t q, xor_hash = 0;
2732 ts.monotonic = le64toh(o->entry.monotonic);
2733 ts.realtime = le64toh(o->entry.realtime);
/* If the destination has a valid tail monotonic timestamp, test
 * whether the source entry is older than it. */
2735 if (to->tail_entry_monotonic_valid &&
2736 ts.monotonic < le64toh(to->header->tail_entry_monotonic))
/* The item array for the new entry lives on the stack and is sized by
 * the number of items in the source entry. */
2739 n = journal_file_entry_n_items(o);
2740 items = alloca(sizeof(EntryItem) * n);
2742 for (i = 0; i < n; i++) {
2749 q = le64toh(o->entry.items[i].object_offset);
2750 le_hash = o->entry.items[i].hash;
/* From here on o refers to the data object, not to the entry. */
2752 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
/* Both hashes are compared in their stored little-endian form. */
2756 if (le_hash != o->data.hash)
/* Payload length: object size minus the offset of the payload within
 * the object. */
2759 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2762 /* We hit the limit on 32bit machines */
2763 if ((uint64_t) t != l)
2766 if (o->object.flags & OBJECT_COMPRESSED) {
/* The uncompressed payload ends up in the source file's compression
 * buffer. */
2770 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0))
2773 data = from->compress_buffer;
/* NOTE(review): presumably the branch taken when compression support
 * is not compiled in; confirm. */
2776 return -EPROTONOSUPPORT;
2779 data = o->data.payload;
2781 r = journal_file_append_data(to, data, l, &u, &h);
/* Record the new data object's offset and hash in the item array. */
2785 xor_hash ^= le64toh(u->data.hash);
2786 items[i].object_offset = htole64(h);
2787 items[i].hash = u->data.hash;
/* Point o back at the source entry, since it was redirected to the
 * data object above. */
2789 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2794 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
/* Completes and normalizes the size limits in m. A field equal to
 * (uint64_t) -1 counts as unset and gets a default, partly derived
 * from the size of the file system that fd lives on; the other
 * adjustments visible below page-align values and raise them to the
 * minimum sizes. The resulting values are logged at debug level. */
2797 void journal_default_metrics(JournalMetrics *m, int fd) {
2798 uint64_t fs_size = 0;
2800 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
/* File system size in bytes; stays 0 if fstatvfs() fails. */
2805 if (fstatvfs(fd, &ss) >= 0)
2806 fs_size = ss.f_frsize * ss.f_blocks;
/* max_use: when unset, 10% of the file system size clamped to
 * [DEFAULT_MAX_USE_LOWER, DEFAULT_MAX_USE_UPPER], or the lower bound
 * alone (presumably when the file system size is unknown). */
2808 if (m->max_use == (uint64_t) -1) {
2811 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2813 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2814 m->max_use = DEFAULT_MAX_USE_UPPER;
2816 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2817 m->max_use = DEFAULT_MAX_USE_LOWER;
2819 m->max_use = DEFAULT_MAX_USE_LOWER;
/* Page-align max_use and raise it to at least twice the minimum
 * journal file size (presumably the path for an explicitly set
 * value). */
2821 m->max_use = PAGE_ALIGN(m->max_use);
2823 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2824 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
/* max_size: when unset, one eighth of max_use, capped at
 * DEFAULT_MAX_SIZE_UPPER. */
2827 if (m->max_size == (uint64_t) -1) {
2828 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2830 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2831 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2833 m->max_size = PAGE_ALIGN(m->max_size);
2835 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2836 m->max_size = JOURNAL_FILE_SIZE_MIN;
/* Raise max_use so that it covers at least two files of max_size. */
2838 if (m->max_size*2 > m->max_use)
2839 m->max_use = m->max_size*2;
/* min_size: defaults to the minimum journal file size and is never
 * allowed below it. */
2841 if (m->min_size == (uint64_t) -1)
2842 m->min_size = JOURNAL_FILE_SIZE_MIN;
2844 m->min_size = PAGE_ALIGN(m->min_size);
2846 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2847 m->min_size = JOURNAL_FILE_SIZE_MIN;
/* Raise max_size to min_size if it is smaller. */
2849 if (m->min_size > m->max_size)
2850 m->max_size = m->min_size;
/* keep_free: when unset, 15% of the file system size capped at
 * DEFAULT_KEEP_FREE_UPPER, or DEFAULT_KEEP_FREE (presumably when the
 * file system size is unknown). */
2853 if (m->keep_free == (uint64_t) -1) {
2856 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2858 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2859 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2862 m->keep_free = DEFAULT_KEEP_FREE;
2865 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2866 format_bytes(a, sizeof(a), m->max_use),
2867 format_bytes(b, sizeof(b), m->max_size),
2868 format_bytes(c, sizeof(c), m->min_size),
2869 format_bytes(d, sizeof(d), m->keep_free));
/* Reports the realtime range covered by the file: *from receives the
 * header's head entry realtime timestamp and *to the tail entry
 * realtime timestamp, both converted from little-endian. A stored
 * value of 0 is tested separately for each of the two; presumably it
 * means "no entry yet". TODO confirm. */
2872 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
/* Comparing the raw field with 0 is byte-order independent. */
2877 if (f->header->head_entry_realtime == 0)
2880 *from = le64toh(f->header->head_entry_realtime);
2884 if (f->header->tail_entry_realtime == 0)
2887 *to = le64toh(f->header->tail_entry_realtime);
/* Reports the monotonic time range that the file covers for one boot.
 * The "_BOOT_ID=" data object of boot_id is located; *from is taken
 * from the first entry linked to it and *to from the last one. */
2893 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
/* p receives the offset of the boot ID data object so that it can be
 * looked up again further down. */
2901 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
/* Test for a data object without any entries. */
2905 if (le64toh(o->data.n_entries) <= 0)
/* The first entry is stored inline in the data object; o is reused
 * and refers to that entry afterwards. */
2909 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2913 *from = le64toh(o->entry.monotonic);
/* Go back to the data object, since o was overwritten above. */
2917 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* Fetch the last entry: index n_entries - 1. */
2921 r = generic_array_get_plus_one(f,
2922 le64toh(o->data.entry_offset),
2923 le64toh(o->data.entry_array_offset),
2924 le64toh(o->data.n_entries)-1,
2929 *to = le64toh(o->entry.monotonic);
2935 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2938 /* If we gained new header fields we gained new features,
2939 * hence suggest a rotation */
2940 if (le64toh(f->header->header_size) < sizeof(Header)) {
2941 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2945 /* Let's check if the hash tables grew over a certain fill
2946 * level (75%, borrowing this value from Java's hash table
2947 * implementation), and if so suggest a rotation. To calculate
2948 * the fill level we need the n_data field, which only exists
2949 * in newer versions. */
2951 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2952 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2953 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2955 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2956 le64toh(f->header->n_data),
2957 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2958 (unsigned long long) f->last_stat.st_size,
2959 f->last_stat.st_size / le64toh(f->header->n_data));
2963 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2964 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2965 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2967 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2968 le64toh(f->header->n_fields),
2969 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2973 /* Are the data objects properly indexed by field objects? */
2974 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2975 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2976 le64toh(f->header->n_data) > 0 &&
2977 le64toh(f->header->n_fields) == 0)
2980 if (max_file_usec > 0) {
2983 h = le64toh(f->header->head_entry_realtime);
2984 t = now(CLOCK_REALTIME);
2986 if (h > 0 && t > h + max_file_usec)