1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
31 #include <attr/xattr.h>
34 #include "journal-def.h"
35 #include "journal-file.h"
36 #include "journal-authenticate.h"
41 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
42 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
44 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
46 /* This is the minimum journal file size */
47 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
49 /* These are the lower and upper bounds if we deduce the max_use value
50 * from the file system size */
51 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
52 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
54 /* This is the upper bound if we deduce max_size from max_use */
55 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
57 /* This is the upper bound if we deduce the keep_free value from the file system size */
59 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
61 /* This is the keep_free value when we can't determine the file system size */
63 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
65 /* n_data was the first entry we added after the initial file format design */
66 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
68 /* How many entries to keep in the entry array chain cache at max */
69 #define CHAIN_CACHE_MAX 20
71 /* How much to increase the journal file size at once each time we allocate something new. */
72 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
/* Switches the journal file to the online state: verifies that the file
 * descriptor and the header pointer are usable, dispatches on the
 * current header state and writes STATE_ONLINE into the header. */
74 static int journal_file_set_online(JournalFile *f) {
80 if (!(f->fd >= 0 && f->header))
83 switch(f->header->state) {
88 f->header->state = STATE_ONLINE;
/* Counterpart of journal_file_set_online(): checks that the file
 * descriptor and header are usable, tests whether the header is
 * currently in STATE_ONLINE and writes STATE_OFFLINE into the header. */
97 int journal_file_set_offline(JournalFile *f) {
103 if (!(f->fd >= 0 && f->header))
106 if (f->header->state != STATE_ONLINE)
111 f->header->state = STATE_OFFLINE;
/* Tears down a JournalFile: appends a final tag for sealed, writable
 * files, drops the fd from the mmap cache, marks the file offline and
 * then releases the header mapping, the fd, the mmap cache reference,
 * the chain cache, the compression buffer, the sealing state
 * (fss_file mapping or fsprg_state allocation) and the HMAC handle. */
118 void journal_file_close(JournalFile *f) {
122 /* Write the final tag */
123 if (f->seal && f->writable)
124 journal_file_append_tag(f);
127 /* Sync everything to disk, before we mark the file offline */
128 if (f->mmap && f->fd >= 0)
129 mmap_cache_close_fd(f->mmap, f->fd);
131 journal_file_set_offline(f);
134 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
137 close_nointr_nofail(f->fd);
142 mmap_cache_unref(f->mmap);
144 hashmap_free_free(f->chain_cache);
147 free(f->compress_buffer);
152 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
153 else if (f->fsprg_state)
154 free(f->fsprg_state);
159 gcry_md_close(f->hmac);
/* Builds a fresh header in a local structure and writes it to the start
 * of the file with pwrite(): signature, aligned header size, the
 * compressed/sealed flag bits derived from f->compress and f->seal, and
 * a random file ID. The seqnum ID and tail seqnum are copied from
 * 'template' in one branch; otherwise the seqnum ID is the new file ID. */
165 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
173 memcpy(h.signature, HEADER_SIGNATURE, 8);
174 h.header_size = htole64(ALIGN64(sizeof(h)));
176 h.incompatible_flags =
177 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
180 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
182 r = sd_id128_randomize(&h.file_id);
187 h.seqnum_id = template->header->seqnum_id;
188 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
190 h.seqnum_id = h.file_id;
192 k = pwrite(f->fd, &h, sizeof(h), 0);
/* Refreshes the per-boot parts of the mapped header: stores the machine
 * ID, fetches the current boot ID and, when it equals the one already
 * in the header, marks the tail monotonic timestamp as valid. The boot
 * ID is then written to the header and the file is set online. */
202 static int journal_file_refresh_header(JournalFile *f) {
208 r = sd_id128_get_machine(&f->header->machine_id);
212 r = sd_id128_get_boot(&boot_id);
216 if (sd_id128_equal(boot_id, f->header->boot_id))
217 f->tail_entry_monotonic_valid = true;
219 f->header->boot_id = boot_id;
221 journal_file_set_online(f);
223 /* Sync the online state to disk */
/* Validates the mapped header before the file is used. Checked here:
 * the signature, unknown incompatible/compatible flag bits, the state
 * value, the minimum header size, that a sealed header is new enough to
 * contain n_entry_arrays, that header plus arena fit into the file size
 * from the last stat, and that the hash table, tail object and entry
 * array offsets are VALID64 and not inside the header. Further down the
 * machine ID is compared with the local one and the recorded state is
 * inspected; finally f->compress and f->seal are taken from the header
 * flags. */
229 static int journal_file_verify_header(JournalFile *f) {
232 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
235 /* In both read and write mode we refuse to open files with
236 * incompatible flags we don't know */
238 if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
239 return -EPROTONOSUPPORT;
/* NOTE(review): this stricter test and the masked one above look like
 * alternative variants of the same check (presumably selected at build
 * time) -- confirm against the complete file. */
241 if (f->header->incompatible_flags != 0)
242 return -EPROTONOSUPPORT;
245 /* When open for writing we refuse to open files with
246 * compatible flags, too */
249 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
250 return -EPROTONOSUPPORT;
/* NOTE(review): same pattern as for the incompatible flags -- masked
 * and unmasked variants of one check; confirm how they are selected. */
252 if (f->header->compatible_flags != 0)
253 return -EPROTONOSUPPORT;
257 if (f->header->state >= _STATE_MAX)
260 /* The first addition was n_data, so check that we are at least this large */
261 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
264 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
267 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
270 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
273 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
274 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
275 !VALID64(le64toh(f->header->tail_object_offset)) ||
276 !VALID64(le64toh(f->header->entry_array_offset)))
279 if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
280 le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
281 le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
282 le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
287 sd_id128_t machine_id;
290 r = sd_id128_get_machine(&machine_id);
294 if (!sd_id128_equal(machine_id, f->header->machine_id))
297 state = f->header->state;
299 if (state == STATE_ONLINE) {
300 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
302 } else if (state == STATE_ARCHIVED)
304 else if (state != STATE_OFFLINE) {
305 log_debug("Journal file %s has unknown state %u.", f->path, state);
310 f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
312 f->seal = JOURNAL_HEADER_SEALED(f->header);
317 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
318 uint64_t old_size, new_size, file_size;
323 /* We assume that this file is not sparse, and we know that
324 * for sure, since we always call posix_fallocate() ourselves */
328 le64toh(f->header->header_size) +
329 le64toh(f->header->arena_size);
331 new_size = PAGE_ALIGN(offset + size);
332 if (new_size < le64toh(f->header->header_size))
333 new_size = le64toh(f->header->header_size);
335 if (new_size <= old_size)
338 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
341 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
344 if (fstatvfs(f->fd, &svfs) >= 0) {
347 available = svfs.f_bfree * svfs.f_bsize;
349 if (available >= f->metrics.keep_free)
350 available -= f->metrics.keep_free;
354 if (new_size - old_size > available)
359 /* Note that the glibc fallocate() fallback is very
360 inefficient, hence we try to minimize the allocation area
    as much as we can. */
362 r = posix_fallocate(f->fd, old_size, new_size - old_size);
366 /* Increase the file size a bit further than this, so that we
367 * can create larger memory maps to cache */
368 file_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
369 if (file_size > (uint64_t) f->last_stat.st_size) {
370 if (file_size > new_size)
371 ftruncate(f->fd, file_size);
373 if (fstat(f->fd, &f->last_stat) < 0)
377 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
/* Maps 'size' bytes at 'offset' of the journal file through the mmap
 * cache and returns the pointer in *ret. Before mapping, the range is
 * checked against the cached file size; if it appears to be out of
 * range the stat data is refreshed once, and -EADDRNOTAVAIL is returned
 * when the range is still beyond the end of the file (or fstat fails).
 *
 * NOTE(review): 'offset + size' is computed in uint64_t without an
 * overflow check; very large values could wrap and pass the range test
 * -- confirm whether callers bound these values. */
382 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
389 /* Avoid SIGBUS on invalid accesses */
390 if (offset + size > (uint64_t) f->last_stat.st_size) {
391 /* Hmm, out of range? Let's refresh the fstat() data
392 * first, before we trust that check. */
394 if (fstat(f->fd, &f->last_stat) < 0 ||
395 offset + size > (uint64_t) f->last_stat.st_size)
396 return -EADDRNOTAVAIL;
399 return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
/* Returns the smallest size an object of o's type may have: the size of
 * the type-specific structure from the lookup table, or
 * sizeof(ObjectHeader) for types beyond the table or without an entry. */
402 static uint64_t minimum_header_size(Object *o) {
404 static uint64_t table[] = {
405 [OBJECT_DATA] = sizeof(DataObject),
406 [OBJECT_FIELD] = sizeof(FieldObject),
407 [OBJECT_ENTRY] = sizeof(EntryObject),
408 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
409 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
410 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
411 [OBJECT_TAG] = sizeof(TagObject),
/* Table slots that were not initialized above are zero; since the
 * element type is unsigned, '<= 0' below is effectively '== 0'. */
414 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
415 return sizeof(ObjectHeader);
417 return table[o->object.type];
/* Maps the object located at 'offset' and validates its header. The
 * offset must satisfy VALID64. First only an ObjectHeader is mapped to
 * read the object's size and type; the size must cover at least the
 * generic header and the type-specific minimum, the type must be above
 * OBJECT_UNUSED, and when 'type' is positive it must match the object's
 * type. Objects larger than the bare header are then mapped again in
 * full, using the object's type as mmap cache context. */
420 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
430 /* Objects may only be located at multiple of 64 bit */
431 if (!VALID64(offset))
434 /* One context for each type, plus one catch-all for the rest */
435 context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
437 r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
442 s = le64toh(o->object.size);
444 if (s < sizeof(ObjectHeader))
447 if (o->object.type <= OBJECT_UNUSED)
450 if (s < minimum_header_size(o))
453 if (type > 0 && o->object.type != type)
456 if (s > sizeof(ObjectHeader)) {
457 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
/* Picks the sequence number for the next entry: one more than the
 * header's tail_entry_seqnum, reconciled with the optional external
 * counter '*seqnum' as described in the comment below. The result is
 * stored as the new tail seqnum and, if no head seqnum has been
 * recorded yet (value 0), as the head seqnum as well. */
468 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
473 r = le64toh(f->header->tail_entry_seqnum) + 1;
476 /* If an external seqnum counter was passed, we update
477 * both the local and the external one, and set it to
478 * the maximum of both */
486 f->header->tail_entry_seqnum = htole64(r);
488 if (f->header->head_entry_seqnum == 0)
489 f->header->head_entry_seqnum = htole64(r);
/* Appends a new object of the given type and size at the end of the
 * file. The file is set online, the position is derived from the
 * header's tail_object_offset (either the header size, or the end of
 * the current tail object rounded with ALIGN64), space is allocated and
 * mapped, and the new object's type and size are filled in. Finally the
 * header's tail_object_offset and n_objects are updated. */
494 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
501 assert(type > 0 && type < _OBJECT_TYPE_MAX);
502 assert(size >= sizeof(ObjectHeader));
506 r = journal_file_set_online(f);
510 p = le64toh(f->header->tail_object_offset);
512 p = le64toh(f->header->header_size);
514 r = journal_file_move_to_object(f, -1, p, &tail);
518 p += ALIGN64(le64toh(tail->object.size));
521 r = journal_file_allocate(f, p, size);
525 r = journal_file_move_to(f, type, false, p, size, &t);
532 o->object.type = type;
533 o->object.size = htole64(size);
535 f->header->tail_object_offset = htole64(p);
536 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
544 static int journal_file_setup_data_hash_table(JournalFile *f) {
551 /* We estimate that we need 1 hash table entry per 768 bytes of
552 journal file and we want to make sure we never get beyond
553 75% fill level. Calculate the hash table size for the
554 maximum file size based on these metrics. */
556 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
557 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
558 s = DEFAULT_DATA_HASH_TABLE_SIZE;
560 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
562 r = journal_file_append_object(f,
563 OBJECT_DATA_HASH_TABLE,
564 offsetof(Object, hash_table.items) + s,
569 memset(o->hash_table.items, 0, s);
571 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
572 f->header->data_hash_table_size = htole64(s);
/* Creates the field hash table: appends an OBJECT_FIELD_HASH_TABLE
 * object with room for DEFAULT_FIELD_HASH_TABLE_SIZE bytes of items,
 * zeroes the items and records the offset of the items array and its
 * size in the header. */
577 static int journal_file_setup_field_hash_table(JournalFile *f) {
584 /* We use a fixed size hash table for the fields as this
585 * number should grow very slowly only */
587 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
588 r = journal_file_append_object(f,
589 OBJECT_FIELD_HASH_TABLE,
590 offsetof(Object, hash_table.items) + s,
595 memset(o->hash_table.items, 0, s);
597 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
598 f->header->field_hash_table_size = htole64(s);
/* Maps the data hash table: reads its offset and size from the header,
 * maps that range with the OBJECT_DATA_HASH_TABLE context and stores
 * the resulting pointer in f->data_hash_table. */
603 static int journal_file_map_data_hash_table(JournalFile *f) {
610 p = le64toh(f->header->data_hash_table_offset);
611 s = le64toh(f->header->data_hash_table_size);
613 r = journal_file_move_to(f,
614 OBJECT_DATA_HASH_TABLE,
621 f->data_hash_table = t;
/* Maps the field hash table: reads its offset and size from the header,
 * maps that range with the OBJECT_FIELD_HASH_TABLE context and stores
 * the resulting pointer in f->field_hash_table. */
625 static int journal_file_map_field_hash_table(JournalFile *f) {
632 p = le64toh(f->header->field_hash_table_offset);
633 s = le64toh(f->header->field_hash_table_size);
635 r = journal_file_move_to(f,
636 OBJECT_FIELD_HASH_TABLE,
643 f->field_hash_table = t;
/* Links a field object at 'offset' into the field hash table. The
 * object's chain pointers are reset, the bucket is chosen as
 * hash modulo the number of HashItem slots, and the object becomes the
 * bucket's new tail: either the bucket head is set to it, or the
 * previous tail object's next_hash_offset is pointed at it. n_fields is
 * incremented when the header is new enough to carry that counter. */
647 static int journal_file_link_field(
660 if (o->object.type != OBJECT_FIELD)
663 /* This might alter the window we are looking at */
665 o->field.next_hash_offset = o->field.head_data_offset = 0;
667 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
668 p = le64toh(f->field_hash_table[h].tail_hash_offset);
670 f->field_hash_table[h].head_hash_offset = htole64(offset);
672 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
676 o->field.next_hash_offset = htole64(offset);
679 f->field_hash_table[h].tail_hash_offset = htole64(offset);
681 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
682 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
687 static int journal_file_link_data(
700 if (o->object.type != OBJECT_DATA)
703 /* This might alter the window we are looking at */
705 o->data.next_hash_offset = o->data.next_field_offset = 0;
706 o->data.entry_offset = o->data.entry_array_offset = 0;
707 o->data.n_entries = 0;
709 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
710 p = le64toh(f->data_hash_table[h].tail_hash_offset);
712 /* Only entry in the hash table is easy */
713 f->data_hash_table[h].head_hash_offset = htole64(offset);
715 /* Move back to the previous data object, to patch in the pointer to the new one */
718 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
722 o->data.next_hash_offset = htole64(offset);
725 f->data_hash_table[h].tail_hash_offset = htole64(offset);
727 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
728 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
/* Looks up a field object by name and precomputed hash. The bucket is
 * hash modulo the number of HashItem slots in the field hash table (a
 * table size of 0 is tested for first); the bucket's chain is walked
 * via next_hash_offset, and an object matches when its stored hash, its
 * total object size and its payload bytes all agree with the request. */
733 int journal_file_find_field_object_with_hash(
735 const void *field, uint64_t size, uint64_t hash,
736 Object **ret, uint64_t *offset) {
738 uint64_t p, osize, h;
742 assert(field && size > 0);
744 osize = offsetof(Object, field.payload) + size;
746 if (f->header->field_hash_table_size == 0)
749 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
750 p = le64toh(f->field_hash_table[h].head_hash_offset);
755 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
759 if (le64toh(o->field.hash) == hash &&
760 le64toh(o->object.size) == osize &&
761 memcmp(o->field.payload, field, size) == 0) {
771 p = le64toh(o->field.next_hash_offset);
/* Convenience wrapper: hashes the field name with hash64() and forwards
 * to journal_file_find_field_object_with_hash(). */
777 int journal_file_find_field_object(
779 const void *field, uint64_t size,
780 Object **ret, uint64_t *offset) {
785 assert(field && size > 0);
787 hash = hash64(field, size);
789 return journal_file_find_field_object_with_hash(f,
/* Looks up a data object by payload and precomputed hash. The bucket is
 * hash modulo the number of HashItem slots in the data hash table (a
 * table size of 0 is tested for first) and the chain is walked via
 * next_hash_offset. Objects whose stored hash differs are passed over.
 * For objects flagged OBJECT_COMPRESSED the payload is decompressed
 * into f->compress_buffer and compared there; one visible path returns
 * -EPROTONOSUPPORT (presumably when decompression is unavailable --
 * confirm). Uncompressed objects match on equal object size and equal
 * payload bytes. */
794 int journal_file_find_data_object_with_hash(
796 const void *data, uint64_t size, uint64_t hash,
797 Object **ret, uint64_t *offset) {
799 uint64_t p, osize, h;
803 assert(data || size == 0);
805 osize = offsetof(Object, data.payload) + size;
807 if (f->header->data_hash_table_size == 0)
810 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
811 p = le64toh(f->data_hash_table[h].head_hash_offset);
816 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
820 if (le64toh(o->data.hash) != hash)
823 if (o->object.flags & OBJECT_COMPRESSED) {
827 l = le64toh(o->object.size);
828 if (l <= offsetof(Object, data.payload))
831 l -= offsetof(Object, data.payload);
833 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0))
837 memcmp(f->compress_buffer, data, size) == 0) {
848 return -EPROTONOSUPPORT;
851 } else if (le64toh(o->object.size) == osize &&
852 memcmp(o->data.payload, data, size) == 0) {
864 p = le64toh(o->data.next_hash_offset);
/* Convenience wrapper: hashes the payload with hash64() and forwards to
 * journal_file_find_data_object_with_hash(). */
870 int journal_file_find_data_object(
872 const void *data, uint64_t size,
873 Object **ret, uint64_t *offset) {
878 assert(data || size == 0);
880 hash = hash64(data, size);
882 return journal_file_find_data_object_with_hash(f,
/* Finds or creates the field object for a field name. The name is
 * hashed and looked up first; when a new object is needed an
 * OBJECT_FIELD object sized for the name is appended, its hash and
 * payload are filled in, it is linked into the field hash table,
 * re-mapped (linking may move the mapping window) and fed to the HMAC. */
887 static int journal_file_append_field(
889 const void *field, uint64_t size,
890 Object **ret, uint64_t *offset) {
898 assert(field && size > 0);
900 hash = hash64(field, size);
902 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
916 osize = offsetof(Object, field.payload) + size;
917 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
921 o->field.hash = htole64(hash);
922 memcpy(o->field.payload, field, size);
924 r = journal_file_link_field(f, o, p, hash);
928 /* The linking might have altered the window, so let's
929 * refresh our pointer */
930 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
935 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
/* Finds or creates the data object for a payload. The payload is hashed
 * and looked up first; when a new object is needed an OBJECT_DATA
 * object sized for the uncompressed payload is appended and its hash
 * stored. Payloads of at least COMPRESSION_SIZE_THRESHOLD bytes may be
 * compressed in place, in which case the object size is shrunk and
 * OBJECT_COMPRESSED is set; otherwise the payload is copied verbatim.
 * The object is linked into the data hash table and re-mapped. If the
 * payload contains '=' after at least one byte, the part before it is
 * registered as a field object and the data object is pushed onto the
 * front of that field's data list. Finally the object is fed to the
 * HMAC. */
949 static int journal_file_append_data(
951 const void *data, uint64_t size,
952 Object **ret, uint64_t *offset) {
958 bool compressed = false;
962 assert(data || size == 0);
964 hash = hash64(data, size);
966 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
980 osize = offsetof(Object, data.payload) + size;
981 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
985 o->data.hash = htole64(hash);
989 size >= COMPRESSION_SIZE_THRESHOLD) {
992 compressed = compress_blob(data, size, o->data.payload, &rsize);
995 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
996 o->object.flags |= OBJECT_COMPRESSED;
998 log_debug("Compressed data object %"PRIu64" -> %"PRIu64, size, rsize);
1003 if (!compressed && size > 0)
1004 memcpy(o->data.payload, data, size);
1006 r = journal_file_link_data(f, o, p, hash);
1010 /* The linking might have altered the window, so let's
1011 * refresh our pointer */
1012 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1016 eq = memchr(data, '=', size);
1017 if (eq && eq > data) {
1021 /* Create field object ... */
1022 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1026 /* ... and link it in. */
1027 o->data.next_field_offset = fo->field.head_data_offset;
/* NOTE(review): 'p' is a host-order offset being stored into an on-disk
 * field; the other stores in this file use htole64() for that
 * direction. le64toh() is expected to perform the same byte
 * transformation, so this looks like a naming slip and not a behavior
 * bug -- confirm. */
1028 fo->field.head_data_offset = le64toh(p);
1032 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
/* Number of EntryItem slots in an entry object, derived from the
 * object size minus the fixed part before entry.items. Objects of a
 * different type are handled by the check below. */
1046 uint64_t journal_file_entry_n_items(Object *o) {
1049 if (o->object.type != OBJECT_ENTRY)
1052 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
/* Number of 64-bit offset slots in an entry array object, derived from
 * the object size minus the fixed part before entry_array.items.
 * Objects of a different type are handled by the check below. */
1055 uint64_t journal_file_entry_array_n_items(Object *o) {
1058 if (o->object.type != OBJECT_ENTRY_ARRAY)
1061 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
/* Number of HashItem slots in a data or field hash table object,
 * derived from the object size minus the fixed part before
 * hash_table.items. Other object types are handled by the check below. */
1064 uint64_t journal_file_hash_table_n_items(Object *o) {
1067 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1068 o->object.type != OBJECT_FIELD_HASH_TABLE)
1071 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
/* Stores entry offset 'p' at logical index *idx of the entry array
 * chain starting at *first (both kept in little-endian form). The chain
 * is walked via next_entry_array_offset; when a slot is found in an
 * existing array the offset is written there and *idx is incremented.
 * Otherwise a new OBJECT_ENTRY_ARRAY is appended, fed to the HMAC,
 * given 'p' in slot i and hooked in either as *first or as the previous
 * array's successor; n_entry_arrays is bumped when the header carries
 * that counter, and *idx is incremented. */
1074 static int link_entry_into_array(JournalFile *f,
1079 uint64_t n = 0, ap = 0, q, i, a, hidx;
1087 a = le64toh(*first);
1088 i = hidx = le64toh(*idx);
1091 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1095 n = journal_file_entry_array_n_items(o);
1097 o->entry_array.items[i] = htole64(p);
1098 *idx = htole64(hidx + 1);
1104 a = le64toh(o->entry_array.next_entry_array_offset);
1115 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1116 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1122 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1127 o->entry_array.items[i] = htole64(p);
1130 *first = htole64(q);
1132 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1136 o->entry_array.next_entry_array_offset = htole64(q);
1139 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1140 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1142 *idx = htole64(hidx + 1);
/* Variant of link_entry_into_array() for lists that keep one entry
 * inline in '*extra' next to the array chain: one path stores 'p'
 * directly in *extra, the other links it into the chain at index
 * *idx - 1. *idx is incremented afterwards. Which path is taken is
 * decided by a condition not visible here (presumably whether *idx is
 * still zero -- confirm). */
1147 static int link_entry_into_array_plus_one(JournalFile *f,
1162 *extra = htole64(p);
1166 i = htole64(le64toh(*idx) - 1);
1167 r = link_entry_into_array(f, first, &i, p);
1172 *idx = htole64(le64toh(*idx) + 1);
/* Links an entry into the entry list of one of the data objects it
 * references: reads the data object offset from item 'i' of the entry,
 * maps that data object and adds the entry to the list rooted at the
 * data object's entry_offset / entry_array_offset fields. */
1176 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1183 p = le64toh(o->entry.items[i].object_offset);
1187 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1191 return link_entry_into_array_plus_one(f,
1192 &o->data.entry_offset,
1193 &o->data.entry_array_offset,
/* Publishes a freshly written entry object: after a memory barrier the
 * entry is added to the file-wide entry array (header
 * entry_array_offset / n_entries), the header's head/tail realtime and
 * tail monotonic timestamps are updated from the entry, and the entry
 * is linked into every data object listed in its items. */
1198 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1206 if (o->object.type != OBJECT_ENTRY)
1209 __sync_synchronize();
1211 /* Link up the entry itself */
1212 r = link_entry_into_array(f,
1213 &f->header->entry_array_offset,
1214 &f->header->n_entries,
1219 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1221 if (f->header->head_entry_realtime == 0)
1222 f->header->head_entry_realtime = o->entry.realtime;
1224 f->header->tail_entry_realtime = o->entry.realtime;
1225 f->header->tail_entry_monotonic = o->entry.monotonic;
1227 f->tail_entry_monotonic_valid = true;
1229 /* Link up the items */
1230 n = journal_file_entry_n_items(o);
1231 for (i = 0; i < n; i++) {
1232 r = journal_file_link_entry_item(f, o, offset, i);
/* Writes one entry object from already-appended data items: appends an
 * OBJECT_ENTRY sized for n_items items, assigns the next sequence
 * number, copies the items and stores the timestamps, the xor hash and
 * the header's boot ID. The object is then fed to the HMAC and linked
 * into the entry arrays. */
1240 static int journal_file_append_entry_internal(
1242 const dual_timestamp *ts,
1244 const EntryItem items[], unsigned n_items,
1246 Object **ret, uint64_t *offset) {
1253 assert(items || n_items == 0);
1256 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1258 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1262 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1263 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1264 o->entry.realtime = htole64(ts->realtime);
1265 o->entry.monotonic = htole64(ts->monotonic);
1266 o->entry.xor_hash = htole64(xor_hash);
1267 o->entry.boot_id = f->header->boot_id;
1270 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1275 r = journal_file_link_entry(f, o, np);
/* Notifies inotify watchers that the journal changed. Writes go through
 * mmap() and therefore generate no IN_MODIFY on their own, so after a
 * memory barrier the file is truncated to its current (cached) size,
 * which does. A failing ftruncate() is only logged. */
1288 void journal_file_post_change(JournalFile *f) {
1291 /* inotify() does not receive IN_MODIFY events from file
1292 * accesses done via mmap(). After each access we hence
1293 * trigger IN_MODIFY by truncating the journal file to its
1294 * current size which triggers IN_MODIFY. */
1296 __sync_synchronize();
1298 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1299 log_error("Failed to truncate file to its own size: %m");
/* Sort comparator for EntryItem elements: orders them by their
 * (little-endian stored) object_offset. */
1302 static int entry_item_cmp(const void *_a, const void *_b) {
1303 const EntryItem *a = _a, *b = _b;
1305 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1307 if (le64toh(a->object_offset) > le64toh(b->object_offset))
/* Appends a complete log entry built from 'n_iovec' payload fields. A
 * timestamp is taken locally in one branch; the monotonic time is
 * compared against the header's tail timestamp when that is known to be
 * valid; a sealing tag may be appended first. Each iovec is stored (or
 * found) as a data object, its hash folded into the entry's xor hash
 * and its offset/hash recorded in a stack-allocated item array. The
 * items are sorted by on-disk offset, the entry object is written, and
 * watchers are notified via journal_file_post_change(). */
1312 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1316 uint64_t xor_hash = 0;
1317 struct dual_timestamp _ts;
1320 assert(iovec || n_iovec == 0);
1323 dual_timestamp_get(&_ts);
1327 if (f->tail_entry_monotonic_valid &&
1328 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1332 r = journal_file_maybe_append_tag(f, ts->realtime);
1337 /* alloca() can't take 0, hence let's allocate at least one */
1338 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1340 for (i = 0; i < n_iovec; i++) {
1344 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1348 xor_hash ^= le64toh(o->data.hash);
1349 items[i].object_offset = htole64(p);
1350 items[i].hash = o->data.hash;
1353 /* Order by the position on disk, in order to improve seek
1354 * times for rotating media. */
1355 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1357 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1359 journal_file_post_change(f);
/* One cached position inside an entry array chain; entries are stored
 * in f->chain_cache keyed by the 'first' field. */
1364 typedef struct ChainCacheItem {
1365 uint64_t first; /* the array at the beginning of the chain */
1366 uint64_t array; /* the cached array */
1367 uint64_t begin; /* the first item in the cached array */
1368 uint64_t total; /* the total number of items in all arrays before this one in the chain */
/* Records a chain position in the chain cache hashmap 'h'. When the
 * map already holds CHAIN_CACHE_MAX items the first one is stolen for
 * reuse, otherwise a new ChainCacheItem is allocated; the item is
 * inserted keyed by its 'first' field. Caching the first array of a
 * chain is skipped, as the comment below explains. */
1371 static void chain_cache_put(
1380 /* If the chain item to cache for this chain is the
1381 * first one it's not worth caching anything */
1385 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1386 ci = hashmap_steal_first(h);
1388 ci = new(ChainCacheItem, 1);
1395 if (hashmap_put(h, &ci->first, ci) < 0) {
1400 assert(ci->first == first);
/* Returns the entry stored at logical index 'i' of the entry array
 * chain starting at 'first'. The chain cache is consulted first (it is
 * used when 'i' lies beyond the cached 'total'); the arrays are then
 * walked via next_entry_array_offset until the one holding index i is
 * found. The position reached is cached for the next call and the
 * referenced entry object is mapped. */
1407 static int generic_array_get(JournalFile *f,
1410 Object **ret, uint64_t *offset) {
1413 uint64_t p = 0, a, t = 0;
1421 /* Try the chain cache first */
1422 ci = hashmap_get(f->chain_cache, &first);
1423 if (ci && i > ci->total) {
1432 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1436 k = journal_file_entry_array_n_items(o);
1438 p = le64toh(o->entry_array.items[i]);
1444 a = le64toh(o->entry_array.next_entry_array_offset);
1450 /* Let's cache this item for the next invocation */
1451 chain_cache_put(f->chain_cache, ci, first, a, o->entry_array.items[0], t);
1453 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Like generic_array_get(), for lists that keep one entry inline in
 * 'extra' ahead of the array chain: one path maps the entry at 'extra'
 * directly, the other looks up index i-1 in the chain. The selecting
 * condition is not visible here (presumably i == 0 -- confirm). */
1466 static int generic_array_get_plus_one(JournalFile *f,
1470 Object **ret, uint64_t *offset) {
1479 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1492 return generic_array_get(f, first, i-1, ret, offset);
1501 static int generic_array_bisect(JournalFile *f,
1505 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1506 direction_t direction,
1511 uint64_t a, p, t = 0, i = 0, last_p = 0;
1512 bool subtract_one = false;
1513 Object *o, *array = NULL;
1518 assert(test_object);
1520 /* Start with the first array in the chain */
1523 ci = hashmap_get(f->chain_cache, &first);
1524 if (ci && n > ci->total) {
1525 /* Ah, we have iterated this bisection array chain
1526 * previously! Let's see if we can skip ahead in the
1527 * chain, as far as the last time. But we can't jump
1528 * backwards in the chain, so let's check that first. */
1531 r = test_object(f, ci->begin, needle);
1535 if (r == TEST_LEFT) {
1536 /* OK, what we are looking for is right of the
1537 * begin of this EntryArray, so let's jump
1538 * straight to the previously cached array in the chain */
1548 uint64_t left, right, k, lp;
1550 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1554 k = journal_file_entry_array_n_items(array);
1560 lp = p = le64toh(array->entry_array.items[i]);
1564 r = test_object(f, p, needle);
1568 if (r == TEST_FOUND)
1569 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1571 if (r == TEST_RIGHT) {
1575 if (left == right) {
1576 if (direction == DIRECTION_UP)
1577 subtract_one = true;
1583 assert(left < right);
1585 i = (left + right) / 2;
1586 p = le64toh(array->entry_array.items[i]);
1590 r = test_object(f, p, needle);
1594 if (r == TEST_FOUND)
1595 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1597 if (r == TEST_RIGHT)
1605 if (direction == DIRECTION_UP) {
1607 subtract_one = true;
1618 a = le64toh(array->entry_array.next_entry_array_offset);
1624 if (subtract_one && t == 0 && i == 0)
1627 /* Let's cache this item for the next invocation */
1628 chain_cache_put(f->chain_cache, ci, first, a, array->entry_array.items[0], t);
1630 if (subtract_one && i == 0)
1632 else if (subtract_one)
1633 p = le64toh(array->entry_array.items[i-1]);
1635 p = le64toh(array->entry_array.items[i]);
1637 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1648 *idx = t + i + (subtract_one ? -1 : 0);
1653 static int generic_array_bisect_plus_one(JournalFile *f,
1658 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1659 direction_t direction,
1665 bool step_back = false;
1669 assert(test_object);
1674 /* This bisects the array in object 'first', but first checks the extra entry */
1676 r = test_object(f, extra, needle);
1680 if (r == TEST_FOUND)
1681 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1683 /* if we are looking with DIRECTION_UP then we need to first
1684 see if in the actual array there is a matching entry, and
1685 return the last one of that. But if there isn't any we need
1686 to return this one. Hence remember this, and return it below. */
1689 step_back = direction == DIRECTION_UP;
1691 if (r == TEST_RIGHT) {
1692 if (direction == DIRECTION_DOWN)
1698 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1700 if (r == 0 && step_back)
1709 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
/* Bisection callback with the test_object signature that compares the
 * candidate offset 'p' itself against 'needle'; no object is mapped in
 * the visible lines. */
1725 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1731 else if (p < needle)
/* Seeks among all entries of the file by bisecting the file-wide entry
 * array (header entry_array_offset with n_entries items) in the given
 * direction. */
1737 int journal_file_move_to_entry_by_offset(
1740 direction_t direction,
1744 return generic_array_bisect(f,
1745 le64toh(f->header->entry_array_offset),
1746 le64toh(f->header->n_entries),
/* Bisection callback: maps the entry at offset 'p' and compares its
 * sequence number with 'needle' (equal, or smaller than needle, are
 * the cases tested in the visible lines). */
1754 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1761 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1765 if (le64toh(o->entry.seqnum) == needle)
1767 else if (le64toh(o->entry.seqnum) < needle)
/* Seeks among all entries of the file by bisecting the file-wide entry
 * array (header entry_array_offset with n_entries items) in the given
 * direction; presumably keyed on the sequence number -- the callback
 * argument is not visible here. */
1773 int journal_file_move_to_entry_by_seqnum(
1776 direction_t direction,
1780 return generic_array_bisect(f,
1781 le64toh(f->header->entry_array_offset),
1782 le64toh(f->header->n_entries),
/* Bisection callback: maps the entry at offset 'p' and compares its
 * realtime timestamp with 'needle' (equal, or smaller than needle, are
 * the cases tested in the visible lines). */
1789 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1796 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1800 if (le64toh(o->entry.realtime) == needle)
1802 else if (le64toh(o->entry.realtime) < needle)
/* Seeks among all entries of the file by realtime timestamp: bisects
 * the file-wide entry array (header entry_array_offset with n_entries
 * items) using test_object_realtime() in the given direction. */
1808 int journal_file_move_to_entry_by_realtime(
1811 direction_t direction,
1815 return generic_array_bisect(f,
1816 le64toh(f->header->entry_array_offset),
1817 le64toh(f->header->n_entries),
1819 test_object_realtime,
/* Bisection callback: maps the entry at offset 'p' and compares its
 * monotonic timestamp with 'needle' (equal, or smaller than needle,
 * are the cases tested in the visible lines). */
1824 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1831 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1835 if (le64toh(o->entry.monotonic) == needle)
1837 else if (le64toh(o->entry.monotonic) < needle)
/* Looks up the data object for a given boot ID: formats the payload
 * "_BOOT_ID=" followed by the ID string into a stack buffer sized for
 * the prefix, 32 characters and a terminating NUL, then searches for a
 * data object with exactly that payload (the NUL excluded). */
1843 static inline int find_data_object_by_boot_id(
1848 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1850 sd_id128_to_string(boot_id, t + 9);
1851 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
/* Seeks by monotonic timestamp within one boot: finds the data object
 * for the boot ID and bisects that object's entry list (inline entry
 * plus array chain, n_entries items) with test_object_monotonic() in
 * the given direction. */
1854 int journal_file_move_to_entry_by_monotonic(
1858 direction_t direction,
1867 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1873 return generic_array_bisect_plus_one(f,
1874 le64toh(o->data.entry_offset),
1875 le64toh(o->data.entry_array_offset),
1876 le64toh(o->data.n_entries),
1878 test_object_monotonic,
/* Steps to the neighbouring entry in the file-wide entry array. With
 * no current entry the index starts at the first (DIRECTION_DOWN) or
 * last (otherwise) of the header's n_entries entries; with a current
 * entry 'o' at offset 'p' its index is located by bisection and then
 * adjusted according to the direction. The entry at the resulting
 * index is fetched with generic_array_get(). */
1883 int journal_file_next_entry(
1885 Object *o, uint64_t p,
1886 direction_t direction,
1887 Object **ret, uint64_t *offset) {
1893 assert(p > 0 || !o);
1895 n = le64toh(f->header->n_entries);
1900 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1902 if (o->object.type != OBJECT_ENTRY)
1905 r = generic_array_bisect(f,
1906 le64toh(f->header->entry_array_offset),
1907 le64toh(f->header->n_entries),
1916 if (direction == DIRECTION_DOWN) {
1929 /* And jump to it */
1930 return generic_array_get(f,
1931 le64toh(f->header->entry_array_offset),
/* Moves a signed number of entries away from the current entry in the
 * file-wide entry array. The current entry's index is located by
 * bisection, 'skip' is applied to it (the backwards case is checked
 * against the current index before subtracting), the result is related
 * to the header's n_entries, and the entry at the new index is fetched
 * with generic_array_get(). */
1936 int journal_file_skip_entry(
1938 Object *o, uint64_t p,
1940 Object **ret, uint64_t *offset) {
1949 if (o->object.type != OBJECT_ENTRY)
1952 r = generic_array_bisect(f,
1953 le64toh(f->header->entry_array_offset),
1954 le64toh(f->header->n_entries),
1963 /* Calculate new index */
1965 if ((uint64_t) -skip >= i)
1968 i = i - (uint64_t) -skip;
1970 i += (uint64_t) skip;
1972 n = le64toh(f->header->n_entries);
1979 return generic_array_get(f,
1980 le64toh(f->header->entry_array_offset),
/* Steps to the neighbouring entry within the entry list of the data
 * object at 'data_offset'. With no current entry the index starts at
 * the first (DIRECTION_DOWN) or last (otherwise) of the data object's
 * n_entries entries; with a current entry its index in that list is
 * located by bisection and adjusted according to the direction. The
 * entry is fetched with generic_array_get_plus_one(). */
1985 int journal_file_next_entry_for_data(
1987 Object *o, uint64_t p,
1988 uint64_t data_offset,
1989 direction_t direction,
1990 Object **ret, uint64_t *offset) {
1997 assert(p > 0 || !o);
1999 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2003 n = le64toh(d->data.n_entries);
2008 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2010 if (o->object.type != OBJECT_ENTRY)
2013 r = generic_array_bisect_plus_one(f,
2014 le64toh(d->data.entry_offset),
2015 le64toh(d->data.entry_array_offset),
2016 le64toh(d->data.n_entries),
2026 if (direction == DIRECTION_DOWN) {
2040 return generic_array_get_plus_one(f,
2041 le64toh(d->data.entry_offset),
2042 le64toh(d->data.entry_array_offset),
/* Seeks among the entries that reference the data object at
 * data_offset. The name indicates the search key is an entry offset;
 * the key and comparison callback are passed on lines not shown here.
 * Returns the result of the bisection. */
2047 int journal_file_move_to_entry_by_offset_for_data(
2049 uint64_t data_offset,
2051 direction_t direction,
2052 Object **ret, uint64_t *offset) {
/* Load the data object whose entry list is searched. */
2059 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
/* Bisect over the inline entry plus the entry array of that object. */
2063 return generic_array_bisect_plus_one(f,
2064 le64toh(d->data.entry_offset),
2065 le64toh(d->data.entry_array_offset),
2066 le64toh(d->data.n_entries),
/* Seeks by monotonic timestamp among the entries that reference the
 * data object at data_offset. Two entry lists are involved: the one of
 * the boot-ID data object (which is what orders entries by monotonic
 * time within a boot) and the one of the requested data object. The
 * function first seeks in the boot-ID list, then bisects the two lists
 * in turn until an entry is found that is present in both. The loop
 * construct and its termination conditions are on lines not shown. */
2073 int journal_file_move_to_entry_by_monotonic_for_data(
2075 uint64_t data_offset,
2078 direction_t direction,
2079 Object **ret, uint64_t *offset) {
2087 /* First, seek by time */
/* b receives the offset of the boot-ID data object so that it can be
 * loaded again further down. */
2088 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2094 r = generic_array_bisect_plus_one(f,
2095 le64toh(o->data.entry_offset),
2096 le64toh(o->data.entry_array_offset),
2097 le64toh(o->data.n_entries),
2099 test_object_monotonic,
2105 /* And now, continue seeking until we find an entry that
2106 * exists in both bisection arrays */
/* Load the requested data object by offset and bisect its entry list. */
2112 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2116 r = generic_array_bisect_plus_one(f,
2117 le64toh(d->data.entry_offset),
2118 le64toh(d->data.entry_array_offset),
2119 le64toh(d->data.n_entries),
/* Load the boot-ID data object again from its saved offset b (the
 * earlier pointer is not reused) and bisect its entry list. */
2127 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2131 r = generic_array_bisect_plus_one(f,
2132 le64toh(o->data.entry_offset),
2133 le64toh(o->data.entry_array_offset),
2134 le64toh(o->data.n_entries),
/* Seeks among the entries that reference the data object at
 * data_offset. The name indicates the search key is a sequence number;
 * the key and comparison callback are passed on lines not shown here.
 * Returns the result of the bisection. */
2158 int journal_file_move_to_entry_by_seqnum_for_data(
2160 uint64_t data_offset,
2162 direction_t direction,
2163 Object **ret, uint64_t *offset) {
/* Load the data object whose entry list is searched. */
2170 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
/* Bisect over the inline entry plus the entry array of that object. */
2174 return generic_array_bisect_plus_one(f,
2175 le64toh(d->data.entry_offset),
2176 le64toh(d->data.entry_array_offset),
2177 le64toh(d->data.n_entries),
/* Seeks by realtime timestamp among the entries that reference the data
 * object at data_offset, using test_object_realtime as the comparison
 * callback. Returns the result of the bisection. */
2184 int journal_file_move_to_entry_by_realtime_for_data(
2186 uint64_t data_offset,
2188 direction_t direction,
2189 Object **ret, uint64_t *offset) {
/* Load the data object whose entry list is searched. */
2196 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
/* Bisect over the inline entry plus the entry array of that object. */
2200 return generic_array_bisect_plus_one(f,
2201 le64toh(d->data.entry_offset),
2202 le64toh(d->data.entry_array_offset),
2203 le64toh(d->data.n_entries),
2205 test_object_realtime,
/* Debugging aid: prints the file header, then walks the objects of the
 * file in on-disk order and prints one line per object describing its
 * type (plus a few fields for entries and tags) to stdout. */
2210 void journal_file_dump(JournalFile *f) {
2217 journal_file_print_header(f);
/* The first object is located at offset header_size. */
2219 p = le64toh(f->header->header_size);
/* -1 as the type argument: presumably "accept any object type" --
 * confirm against journal_file_move_to_object(). */
2221 r = journal_file_move_to_object(f, -1, p, &o);
2225 switch (o->object.type) {
2228 printf("Type: OBJECT_UNUSED\n");
2232 printf("Type: OBJECT_DATA\n");
2236 printf("Type: OBJECT_FIELD\n");
2240 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2241 le64toh(o->entry.seqnum),
2242 le64toh(o->entry.monotonic),
2243 le64toh(o->entry.realtime));
2246 case OBJECT_FIELD_HASH_TABLE:
2247 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2250 case OBJECT_DATA_HASH_TABLE:
2251 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2254 case OBJECT_ENTRY_ARRAY:
2255 printf("Type: OBJECT_ENTRY_ARRAY\n");
2259 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2260 le64toh(o->tag.seqnum),
2261 le64toh(o->tag.epoch));
/* Fallback for type values not handled above. */
2265 printf("Type: unknown (%u)\n", o->object.type);
2269 if (o->object.flags & OBJECT_COMPRESSED)
2270 printf("Flags: COMPRESSED\n");
/* The object at tail_object_offset is the last one; the action taken
 * here is elided, presumably ending the walk. */
2272 if (p == le64toh(f->header->tail_object_offset))
/* Advance to the next object: sizes are rounded up to a multiple of
 * 8 bytes by ALIGN64. */
2275 p = p + ALIGN64(le64toh(o->object.size));
/* Presumably the failure path for an object that could not be mapped
 * (the label/branch leading here is not shown). */
2280 log_error("File corrupt");
/* Wrapper around format_timestamp() that formats t into buf (size l).
 * What is done with the result x is on lines not shown here; judging by
 * the name it presumably substitutes a printable fallback when
 * format_timestamp() yields no string -- confirm against the full body. */
2283 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2286 x = format_timestamp(buf, l, t);
/* Prints a human-readable summary of the journal file header to stdout:
 * IDs, state, feature flags, sizes, hash table dimensions, sequence
 * number and timestamp ranges and object counters. Counters that were
 * added to the header format later are only printed when
 * JOURNAL_HEADER_CONTAINS() reports them as present. Finally the disk
 * usage derived from fstat() is printed, if fstat() succeeds. */
2292 void journal_file_print_header(JournalFile *f) {
/* Scratch buffers for the four 128-bit IDs formatted below. */
2293 char a[33], b[33], c[33], d[33];
/* NOTE(review): z is sized with FORMAT_TIMESTAMP_MAX but is filled by
 * format_timespan() below -- confirm that this size is sufficient for
 * a timespan string. */
2294 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2296 char bytes[FORMAT_BYTES_MAX];
2300 printf("File Path: %s\n"
2304 "Sequential Number ID: %s\n"
2306 "Compatible Flags:%s%s\n"
2307 "Incompatible Flags:%s%s\n"
2308 "Header size: %"PRIu64"\n"
2309 "Arena size: %"PRIu64"\n"
2310 "Data Hash Table Size: %"PRIu64"\n"
2311 "Field Hash Table Size: %"PRIu64"\n"
2312 "Rotate Suggested: %s\n"
2313 "Head Sequential Number: %"PRIu64"\n"
2314 "Tail Sequential Number: %"PRIu64"\n"
2315 "Head Realtime Timestamp: %s\n"
2316 "Tail Realtime Timestamp: %s\n"
2317 "Tail Monotonic Timestamp: %s\n"
2318 "Objects: %"PRIu64"\n"
2319 "Entry Objects: %"PRIu64"\n",
/* Arguments follow in the order of the format string above. */
2321 sd_id128_to_string(f->header->file_id, a),
2322 sd_id128_to_string(f->header->machine_id, b),
2323 sd_id128_to_string(f->header->boot_id, c),
2324 sd_id128_to_string(f->header->seqnum_id, d),
2325 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2326 f->header->state == STATE_ONLINE ? "ONLINE" :
2327 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
/* Each flags line prints the one known flag by name and " ???" if any
 * other bit is set. */
2328 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2329 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2330 JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2331 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2332 le64toh(f->header->header_size),
2333 le64toh(f->header->arena_size),
/* Hash table sizes are stored in bytes; dividing by sizeof(HashItem)
 * yields the number of buckets. */
2334 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2335 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2336 yes_no(journal_file_rotate_suggested(f, 0)),
2337 le64toh(f->header->head_entry_seqnum),
2338 le64toh(f->header->tail_entry_seqnum),
2339 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2340 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2341 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2342 le64toh(f->header->n_objects),
2343 le64toh(f->header->n_entries));
/* Fill level = number of objects relative to the number of buckets, in
 * percent. */
2345 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2346 printf("Data Objects: %"PRIu64"\n"
2347 "Data Hash Table Fill: %.1f%%\n",
2348 le64toh(f->header->n_data),
2349 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2351 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2352 printf("Field Objects: %"PRIu64"\n"
2353 "Field Hash Table Fill: %.1f%%\n",
2354 le64toh(f->header->n_fields),
2355 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2357 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2358 printf("Tag Objects: %"PRIu64"\n",
2359 le64toh(f->header->n_tags));
2360 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2361 printf("Entry Array Objects: %"PRIu64"\n",
2362 le64toh(f->header->n_entry_arrays));
/* st_blocks is multiplied by 512 to get the allocated size in bytes. */
2364 if (fstat(f->fd, &st) >= 0)
2365 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
/* Opens (and, for an empty writable file, initializes) a journal file
 * and hands the resulting JournalFile back through ret.
 *
 * Visible steps: validate access mode and file name suffix, allocate
 * the JournalFile, attach an mmap cache, open the file, initialize a
 * fresh header if the file is empty and writable, map and (for
 * pre-existing files) verify the header, set up metrics, refresh the
 * header, set up sealing, create the hash tables for new files, and
 * finally map both hash tables. Several parameters and all error
 * branches are on lines not shown here. */
2368 int journal_file_open(
2374 JournalMetrics *metrics,
2375 MMapCache *mmap_cache,
2376 JournalFile *template,
2377 JournalFile **ret) {
2381 bool newly_created = false;
/* Only O_RDONLY and O_RDWR are acceptable access modes; anything else
 * takes the (elided) branch below. */
2386 if ((flags & O_ACCMODE) != O_RDONLY &&
2387 (flags & O_ACCMODE) != O_RDWR)
/* The file name has to end in ".journal" or ".journal~". */
2390 if (!endswith(fname, ".journal") &&
2391 !endswith(fname, ".journal~"))
2394 f = new0(JournalFile, 1);
2402 f->prot = prot_from_flags(flags);
2403 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2405 f->compress = compress;
/* Either take a reference on the mmap cache that was passed in or
 * create a new one (the selecting condition is elided). */
2412 f->mmap = mmap_cache_ref(mmap_cache);
2414 f->mmap = mmap_cache_new();
2421 f->path = strdup(fname);
2427 f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2428 if (!f->chain_cache) {
2433 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2439 if (fstat(f->fd, &f->last_stat) < 0) {
/* An empty file opened for writing is treated as a new journal file. */
2444 if (f->last_stat.st_size == 0 && f->writable) {
2448 /* Let's attach the creation time to the journal file,
2449 * so that the vacuuming code knows the age of this
2450 * file even if the file might end up corrupted one
2451 * day... Ideally we'd just use the creation time many
2452 * file systems maintain for each file, but there is
2453 * currently no usable API to query this, hence let's
2454 * emulate this via extended attributes. If extended
2455 * attributes are not supported we'll just skip this,
2456 * and rely solely on mtime/atime/ctime of the file.*/
/* The timestamp is stored little-endian; the result of fsetxattr() is
 * deliberately not checked, matching the comment above. */
2458 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2459 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2463 /* Try to load the FSPRG state, and if we can't, then
2464 * just don't do sealing */
2466 r = journal_file_fss_load(f);
2472 r = journal_file_init_header(f, template);
/* Re-read the file status after the header has been written. */
2476 if (fstat(f->fd, &f->last_stat) < 0) {
2481 newly_created = true;
/* A file smaller than the minimal header cannot be a valid journal. */
2484 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
/* Map the header from the start of the file, rounded up to whole pages. */
2489 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2490 if (f->header == MAP_FAILED) {
/* Only pre-existing files need their header verified. */
2496 if (!newly_created) {
2497 r = journal_file_verify_header(f);
/* For pre-existing writable files the sealing state is loaded here;
 * new files already did this above. */
2503 if (!newly_created && f->writable) {
2504 r = journal_file_fss_load(f);
/* Metrics: fill in defaults for the caller's metrics and copy them, or
 * else inherit the template's metrics (the first condition is elided). */
2512 journal_default_metrics(metrics, f->fd);
2513 f->metrics = *metrics;
2514 } else if (template)
2515 f->metrics = template->metrics;
2517 r = journal_file_refresh_header(f);
2523 r = journal_file_hmac_setup(f);
/* New files get their hash tables created and a first tag appended. */
2528 if (newly_created) {
2529 r = journal_file_setup_field_hash_table(f);
2533 r = journal_file_setup_data_hash_table(f);
2538 r = journal_file_append_first_tag(f);
2544 r = journal_file_map_field_hash_table(f);
2548 r = journal_file_map_data_hash_table(f);
/* Presumably the common failure path, releasing everything acquired
 * above (the label leading here is not shown). */
2556 journal_file_close(f);
/* Rotates a journal file: the current file is renamed to an archive
 * name, marked as archived, and a fresh file is opened under the
 * original path, using the old file as template and sharing its mmap
 * cache. The old file is closed afterwards. How *f is updated with the
 * new file is on lines not shown here. */
2561 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2562 _cleanup_free_ char *p = NULL;
2564 JournalFile *old_file, *new_file = NULL;
/* Rotation only makes sense for files open for writing ... */
2572 if (!old_file->writable)
/* ... whose name ends in ".journal". */
2575 if (!endswith(old_file->path, ".journal"))
/* Archive name: the old path without its 8-character ".journal"
 * suffix, then "@<seqnum id>-<head seqnum>-<head realtime>.journal",
 * the two numbers as 16-digit hex.
 * NOTE(review): this expression mixes old_file and (*f); presumably
 * both refer to the same file (the assignment is elided) -- confirm. */
2578 l = strlen(old_file->path);
2579 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2580 (int) l - 8, old_file->path,
2581 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2582 le64toh((*f)->header->head_entry_seqnum),
2583 le64toh((*f)->header->head_entry_realtime));
2587 r = rename(old_file->path, p);
/* Written through the header mapping of the still-open old file. */
2591 old_file->header->state = STATE_ARCHIVED;
/* old_file->path still holds the original name, so this opens a new
 * file at the old location. No metrics are passed (NULL); the old file
 * serves as template. */
2593 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
/* The old file is closed regardless of the value of r at this point. */
2594 journal_file_close(old_file);
/* Variant of journal_file_open() that recovers from an unusable
 * existing file: if the first open attempt fails with one of a fixed
 * set of error codes, the file is renamed out of the way (to a name
 * ending in ".journal~") and the open is attempted a second and last
 * time. The second attempt's result is returned as-is. */
2600 int journal_file_open_reliably(
2606 JournalMetrics *metrics,
2607 MMapCache *mmap_cache,
2608 JournalFile *template,
2609 JournalFile **ret) {
2613 _cleanup_free_ char *p = NULL;
2615 r = journal_file_open(fname, flags, mode, compress, seal,
2616 metrics, mmap_cache, template, ret);
/* Any result other than the error codes listed here takes the elided
 * branch below (presumably returning r directly -- this includes
 * success). */
2617 if (r != -EBADMSG && /* corrupted */
2618 r != -ENODATA && /* truncated */
2619 r != -EHOSTDOWN && /* other machine */
2620 r != -EPROTONOSUPPORT && /* incompatible feature */
2621 r != -EBUSY && /* unclean shutdown */
2622 r != -ESHUTDOWN /* already archived */)
/* Further preconditions for the rename-and-retry path: the file must
 * not have been opened read-only, O_CREAT must be set and the name
 * must end in ".journal" (the failing branches are elided). */
2625 if ((flags & O_ACCMODE) == O_RDONLY)
2628 if (!(flags & O_CREAT))
2631 if (!endswith(fname, ".journal"))
2634 /* The file is corrupted. Rotate it away and try it again (but only once) */
/* New name: "<prefix>@<hex>-<hex>.journal~"; one of the two hex values
 * is the current realtime clock, the other arguments are elided. */
2637 if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2639 (unsigned long long) now(CLOCK_REALTIME),
2643 r = rename(fname, p);
2647 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2649 return journal_file_open(fname, flags, mode, compress, seal,
2650 metrics, mmap_cache, template, ret);
/* Copies the entry o (located at offset p in "from") into the file
 * "to". Every data object referenced by the entry is read from "from"
 * (decompressed if necessary) and appended to "to"; the new entry is
 * then appended with the original timestamps via
 * journal_file_append_entry_internal(), whose result is returned.
 * seqnum, ret and offset are passed through to that call. */
2653 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
/* xor_hash accumulates the XOR of the hashes of all copied data objects. */
2655 uint64_t q, xor_hash = 0;
/* Carry over the timestamps of the source entry. */
2668 ts.monotonic = le64toh(o->entry.monotonic);
2669 ts.realtime = le64toh(o->entry.realtime);
/* Compare against the destination's last monotonic timestamp, if that
 * is known; the action for an older source timestamp is elided. */
2671 if (to->tail_entry_monotonic_valid &&
2672 ts.monotonic < le64toh(to->header->tail_entry_monotonic))
/* NOTE(review): n is derived from the entry object in the file and
 * sizes a stack allocation -- confirm that it is bounded elsewhere. */
2675 n = journal_file_entry_n_items(o);
2676 items = alloca(sizeof(EntryItem) * n);
2678 for (i = 0; i < n; i++) {
/* Offset and hash of the i-th data object, as recorded in the entry.
 * le_hash is kept in on-disk byte order and compared unconverted below. */
2685 q = le64toh(o->entry.items[i].object_offset);
2686 le_hash = o->entry.items[i].hash;
/* From here on o refers to the data object, not to the entry. */
2688 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
/* The hash recorded in the entry must match the one in the data object. */
2692 if (le_hash != o->data.hash)
/* Payload length = object size minus the fixed part before the payload. */
2695 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2698 /* We hit the limit on 32bit machines */
/* t is presumably l converted to a narrower type (assignment elided);
 * a mismatch means the length did not fit. */
2699 if ((uint64_t) t != l)
2702 if (o->object.flags & OBJECT_COMPRESSED) {
/* Decompress into the source file's scratch buffer. */
2706 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0))
2709 data = from->compress_buffer;
/* Presumably the branch used when compression support is not
 * available (the surrounding preprocessor/else lines are elided). */
2712 return -EPROTONOSUPPORT;
/* Uncompressed payload is used in place. */
2715 data = o->data.payload;
/* Append the data to the destination; u is the resulting data object
 * and h its offset in "to". */
2717 r = journal_file_append_data(to, data, l, &u, &h);
2721 xor_hash ^= le64toh(u->data.hash);
2722 items[i].object_offset = htole64(h);
2723 items[i].hash = u->data.hash;
/* Point o back at the source entry for the next iteration, since it
 * was redirected to a data object above. */
2725 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2730 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
/* Completes and sanitizes the metrics in *m. Fields equal to
 * (uint64_t) -1 count as unset and get a default, derived from the size
 * of the file system that fd lives on where possible; all values are
 * page-aligned and forced to be mutually consistent. The resulting
 * values are logged at debug level. Some branch structure (else/brace
 * lines) is not shown here, so the grouping described below is partly
 * inferred. */
2733 void journal_default_metrics(JournalMetrics *m, int fd) {
/* Stays 0 when the file system size cannot be determined. */
2734 uint64_t fs_size = 0;
2736 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
/* NOTE(review): the product is computed in the type of the statvfs
 * members before being stored in the 64-bit fs_size -- confirm it
 * cannot wrap on platforms where those members are 32 bits wide. */
2741 if (fstatvfs(fd, &ss) >= 0)
2742 fs_size = ss.f_frsize * ss.f_blocks;
/* max_use unset: 10% of the file system, clamped to the range
 * DEFAULT_MAX_USE_LOWER..DEFAULT_MAX_USE_UPPER; DEFAULT_MAX_USE_LOWER
 * alone is used on the alternative path (presumably when fs_size is
 * unknown). */
2744 if (m->max_use == (uint64_t) -1) {
2747 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2749 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2750 m->max_use = DEFAULT_MAX_USE_UPPER;
2752 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2753 m->max_use = DEFAULT_MAX_USE_LOWER;
2755 m->max_use = DEFAULT_MAX_USE_LOWER;
/* Page-align max_use and make sure it can hold at least two files of
 * minimal size (presumably the path for an explicitly given value). */
2757 m->max_use = PAGE_ALIGN(m->max_use);
2759 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2760 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
/* max_size unset: one eighth of max_use, capped at
 * DEFAULT_MAX_SIZE_UPPER. */
2763 if (m->max_size == (uint64_t) -1) {
2764 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2766 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2767 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2769 m->max_size = PAGE_ALIGN(m->max_size);
/* A single file may never be limited below the minimal file size ... */
2771 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2772 m->max_size = JOURNAL_FILE_SIZE_MIN;
/* ... and max_use is raised if needed so that two files of max_size fit. */
2774 if (m->max_size*2 > m->max_use)
2775 m->max_use = m->max_size*2;
/* min_size: defaults to the minimal file size, is page-aligned and
 * never smaller than the minimal file size. */
2777 if (m->min_size == (uint64_t) -1)
2778 m->min_size = JOURNAL_FILE_SIZE_MIN;
2780 m->min_size = PAGE_ALIGN(m->min_size);
2782 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2783 m->min_size = JOURNAL_FILE_SIZE_MIN;
/* If min_size exceeds max_size, max_size is raised to match. */
2785 if (m->min_size > m->max_size)
2786 m->max_size = m->min_size;
/* keep_free unset: 15% of the file system, capped at
 * DEFAULT_KEEP_FREE_UPPER; DEFAULT_KEEP_FREE on the alternative path
 * (presumably when fs_size is unknown). */
2789 if (m->keep_free == (uint64_t) -1) {
2792 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2794 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2795 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2798 m->keep_free = DEFAULT_KEEP_FREE;
2801 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2802 format_bytes(a, sizeof(a), m->max_use),
2803 format_bytes(b, sizeof(b), m->max_size),
2804 format_bytes(c, sizeof(c), m->min_size),
2805 format_bytes(d, sizeof(d), m->keep_free));
/* Reports the realtime timestamps of the first and last entry of the
 * file, taken from the header fields head_entry_realtime and
 * tail_entry_realtime, through *from and *to. A header value of 0 is
 * treated specially (the action is on lines not shown here; presumably
 * it means "no entries" -- confirm). Whether from/to may be NULL is
 * decided by guards that are not visible here. */
2808 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
/* Comparing against 0 needs no byte-order conversion. */
2813 if (f->header->head_entry_realtime == 0)
2816 *from = le64toh(f->header->head_entry_realtime);
2820 if (f->header->tail_entry_realtime == 0)
2823 *to = le64toh(f->header->tail_entry_realtime);
/* Reports the monotonic timestamps of the first and last entry that
 * belong to the given boot, through *from and *to. The entries are
 * found via the data object for the boot ID: its inline entry is the
 * first one, and the last one is fetched by index n_entries - 1. */
2829 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
/* p receives the offset of the boot-ID data object so that it can be
 * loaded again below. */
2837 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
/* A data object without entries has no range to report (the action is
 * elided). */
2841 if (le64toh(o->data.n_entries) <= 0)
/* First entry: from here on o refers to that entry, not to the data
 * object. */
2845 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2849 *from = le64toh(o->entry.monotonic);
/* Load the data object again, since o was redirected to an entry above. */
2853 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* Last entry: index n_entries - 1 in the inline-entry-plus-array list. */
2857 r = generic_array_get_plus_one(f,
2858 le64toh(o->data.entry_offset),
2859 le64toh(o->data.entry_array_offset),
2860 le64toh(o->data.n_entries)-1,
2865 *to = le64toh(o->entry.monotonic);
2871 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2874 /* If we gained new header fields we gained new features,
2875 * hence suggest a rotation */
2876 if (le64toh(f->header->header_size) < sizeof(Header)) {
2877 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2881 /* Let's check if the hash tables grew over a certain fill
2882 * level (75%, borrowing this value from Java's hash table
2883 * implementation), and if so suggest a rotation. To calculate
2884 * the fill level we need the n_data field, which only exists
2885 * in newer versions. */
2887 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2888 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2889 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2891 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2892 le64toh(f->header->n_data),
2893 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2894 (unsigned long long) f->last_stat.st_size,
2895 f->last_stat.st_size / le64toh(f->header->n_data));
2899 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2900 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2901 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2903 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2904 le64toh(f->header->n_fields),
2905 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2909 /* Are the data objects properly indexed by field objects? */
2910 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2911 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2912 le64toh(f->header->n_data) > 0 &&
2913 le64toh(f->header->n_fields) == 0)
2916 if (max_file_usec > 0) {
2919 h = le64toh(f->header->head_entry_realtime);
2920 t = now(CLOCK_REALTIME);
2922 if (h > 0 && t > h + max_file_usec)