1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
31 #include <attr/xattr.h>
34 #include "journal-def.h"
35 #include "journal-file.h"
36 #include "journal-authenticate.h"
41 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
42 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
44 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
46 /* This is the minimum journal file size */
47 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
49 /* These are the lower and upper bounds if we deduce the max_use value
50 * from the file system size */
51 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
52 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
54 /* This is the upper bound if we deduce max_size from max_use */
55 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
57 /* This is the upper bound if we deduce the keep_free value from the
58 * file system size */
59 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
61 /* This is the keep_free value when we can't determine the system
62 * size */
63 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
65 /* n_data was the first entry we added after the initial file format design */
66 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
68 /* How many entries to keep in the entry array chain cache at max */
69 #define CHAIN_CACHE_MAX 20
71 /* How much to increase the journal file size at once each time we allocate something new. */
72 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
/* Mark the journal file as online by storing STATE_ONLINE in its header.
 * The guard below requires a valid file descriptor and a non-NULL header
 * pointer; the value returned when it fails, and the individual cases of the
 * switch on the current state, are not visible in this excerpt. */
74 static int journal_file_set_online(JournalFile *f) {
80 if (!(f->fd >= 0 && f->header))
83 switch(f->header->state) {
88 f->header->state = STATE_ONLINE;
/* Mark the journal file as offline by storing STATE_OFFLINE in its header.
 * Two conditions are tested first: that the fd is valid and the header
 * pointer is set, and that the current state is STATE_ONLINE. What is
 * returned when either test fires is not visible in this excerpt. */
97 int journal_file_set_offline(JournalFile *f) {
103 if (!(f->fd >= 0 && f->header))
106 if (f->header->state != STATE_ONLINE)
111 f->header->state = STATE_OFFLINE;
/* Tear down a JournalFile: append a final tag for sealed writable files,
 * release the mmap cache's hold on the fd, mark the file offline, unmap the
 * header and release the auxiliary resources referenced from 'f' (mmap cache
 * reference, chain cache, compression buffer, sealing state, HMAC handle).
 * The conditions guarding several of the releases are not visible here. */
118 void journal_file_close(JournalFile *f) {
122 /* Write the final tag */
123 if (f->seal && f->writable)
124 journal_file_append_tag(f);
127 /* Sync everything to disk, before we mark the file offline */
128 if (f->mmap && f->fd >= 0)
129 mmap_cache_close_fd(f->mmap, f->fd);
/* The result of going offline is not inspected here. */
131 journal_file_set_offline(f);
/* The header mapping is one page-aligned Header in size. */
134 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
140 mmap_cache_unref(f->mmap);
142 hashmap_free_free(f->chain_cache);
145 free(f->compress_buffer);
/* The sealing state is either unmapped (fss_file) or freed (fsprg_state);
 * the condition selecting the munmap() branch is not visible. */
150 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
151 else if (f->fsprg_state)
152 free(f->fsprg_state);
157 gcry_md_close(f->hmac);
/* Build a fresh Header in the local 'h' and write it to the start of the
 * file with pwrite(). Multi-byte fields are stored little-endian via
 * htole64()/htole32(). The seqnum fields are either copied from the
 * template's header or, in the other branch, seqnum_id is set to the new
 * file_id; the condition choosing between the two is not visible
 * (presumably whether 'template' is non-NULL -- confirm). */
163 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
171 memcpy(h.signature, HEADER_SIGNATURE, 8);
172 h.header_size = htole64(ALIGN64(sizeof(h)));
/* Advertise compression as an incompatible flag when enabled on 'f'. */
174 h.incompatible_flags =
175 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
/* Sealing flag; the assignment target is on a line not visible here
 * (presumably h.compatible_flags -- confirm). */
178 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
180 r = sd_id128_randomize(&h.file_id);
185 h.seqnum_id = template->header->seqnum_id;
186 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
188 h.seqnum_id = h.file_id;
/* Write sizeof(h) bytes at file offset 0; handling of 'k' is not visible. */
190 k = pwrite(f->fd, &h, sizeof(h), 0);
/* Refresh the per-host fields of the header of a file opened for use:
 * store the machine ID, compare and store the current boot ID, and switch
 * the file to the online state. Error handling for the sd_id128_* calls is
 * not visible in this excerpt. */
200 static int journal_file_refresh_header(JournalFile *f) {
206 r = sd_id128_get_machine(&f->header->machine_id);
210 r = sd_id128_get_boot(&boot_id);
/* If the file was last written during the current boot, the stored tail
 * monotonic timestamp is flagged as valid. */
214 if (sd_id128_equal(boot_id, f->header->boot_id))
215 f->tail_entry_monotonic_valid = true;
217 f->header->boot_id = boot_id;
219 journal_file_set_online(f);
221 /* Sync the online state to disk */
/* Sanity-check the mapped header before the file is used: signature, flag
 * words, state, header size, and that the offsets and sizes recorded in the
 * header are aligned and lie within the file. Finally the compress/seal
 * settings of 'f' are taken from the header. Only the -EPROTONOSUPPORT
 * returns are visible; the values returned by the other failed checks are
 * not shown in this excerpt. */
227 static int journal_file_verify_header(JournalFile *f) {
230 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
233 /* In both read and write mode we refuse to open files with
234 * incompatible flags we don't know */
/* NOTE(review): this check and the stricter "!= 0" check that follows are
 * presumably the two branches of a preprocessor conditional whose
 * directives are not visible here -- confirm. */
236 if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
237 return -EPROTONOSUPPORT;
239 if (f->header->incompatible_flags != 0)
240 return -EPROTONOSUPPORT;
243 /* When open for writing we refuse to open files with
244 * compatible flags, too */
247 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
248 return -EPROTONOSUPPORT;
250 if (f->header->compatible_flags != 0)
251 return -EPROTONOSUPPORT;
255 if (f->header->state >= _STATE_MAX)
258 /* The first addition was n_data, so check that we are at least this large */
259 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
/* A sealed file must have a header large enough to hold n_entry_arrays. */
262 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
/* NOTE(review): header_size + arena_size are both read from the file and
 * summed as uint64_t without an overflow check, so the sum could wrap for
 * a crafted header -- confirm whether that needs guarding. */
265 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
268 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
/* All object offsets stored in the header must pass VALID64() ... */
271 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
272 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
273 !VALID64(le64toh(f->header->tail_object_offset)) ||
274 !VALID64(le64toh(f->header->entry_array_offset)))
/* ... and must not be smaller than the header size. */
277 if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
278 le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
279 le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
280 le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
/* The machine ID recorded in the file is compared with the local one; the
 * condition under which this comparison runs is not visible (the comment
 * above on compatible flags suggests write mode -- confirm). */
285 sd_id128_t machine_id;
288 r = sd_id128_get_machine(&machine_id);
292 if (!sd_id128_equal(machine_id, f->header->machine_id))
295 state = f->header->state;
297 if (state == STATE_ONLINE) {
298 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
300 } else if (state == STATE_ARCHIVED)
302 else if (state != STATE_OFFLINE) {
303 log_debug("Journal file %s has unknown state %u.", f->path, state);
/* Adopt the compression and sealing settings recorded in the header. */
308 f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
310 f->seal = JOURNAL_HEADER_SEALED(f->header);
315 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
316 uint64_t old_size, new_size;
321 /* We assume that this file is not sparse, and we know that
322 * for sure, since we always call posix_fallocate()
323 * ourselves */
326 le64toh(f->header->header_size) +
327 le64toh(f->header->arena_size);
329 new_size = PAGE_ALIGN(offset + size);
330 if (new_size < le64toh(f->header->header_size))
331 new_size = le64toh(f->header->header_size);
333 if (new_size <= old_size)
336 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
339 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
342 if (fstatvfs(f->fd, &svfs) >= 0) {
345 available = svfs.f_bfree * svfs.f_bsize;
347 if (available >= f->metrics.keep_free)
348 available -= f->metrics.keep_free;
352 if (new_size - old_size > available)
357 /* Increase by larger blocks at once */
358 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
359 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
360 new_size = f->metrics.max_size;
362 /* Note that the glibc fallocate() fallback is very
363 inefficient, hence we try to minimize the allocation area
364 as we can. */
365 r = posix_fallocate(f->fd, old_size, new_size - old_size);
369 if (fstat(f->fd, &f->last_stat) < 0)
372 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
/* Obtain a pointer to the file range [offset, offset + size) through the
 * mmap cache.
 *
 * Before mapping, the range is checked against the cached file size; if it
 * appears out of range the fstat() data is refreshed once and the check is
 * repeated. Returns -EADDRNOTAVAIL if fstat() fails or the range is still
 * beyond the end of the file, otherwise whatever mmap_cache_get() returns.
 *
 * NOTE(review): 'offset + size' is computed in uint64_t without an overflow
 * check; very large values could wrap and pass the range test -- confirm
 * whether callers bound these values. */
377 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
384 /* Avoid SIGBUS on invalid accesses */
385 if (offset + size > (uint64_t) f->last_stat.st_size) {
386 /* Hmm, out of range? Let's refresh the fstat() data
387 * first, before we trust that check. */
389 if (fstat(f->fd, &f->last_stat) < 0 ||
390 offset + size > (uint64_t) f->last_stat.st_size)
391 return -EADDRNOTAVAIL;
394 return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
/* Return the smallest size an object of o's type may have.
 * Known types are looked up in a table of the corresponding struct sizes;
 * a type outside the table, or one with no table entry (value 0), falls
 * back to sizeof(ObjectHeader). */
397 static uint64_t minimum_header_size(Object *o) {
399 static const uint64_t table[] = {
400 [OBJECT_DATA] = sizeof(DataObject),
401 [OBJECT_FIELD] = sizeof(FieldObject),
402 [OBJECT_ENTRY] = sizeof(EntryObject),
403 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
404 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
405 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
406 [OBJECT_TAG] = sizeof(TagObject),
/* The table is unsigned, so "<= 0" is effectively a test for 0, i.e. for
 * slots left unset by the designated initializers. */
409 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
410 return sizeof(ObjectHeader);
412 return table[o->object.type];
/* Map the object located at 'offset' and validate its header.
 *
 * The object header is mapped first; its size and type are then checked
 * (size at least an ObjectHeader and at least the minimum for its type, type
 * above OBJECT_UNUSED, and matching 'type' when 'type' > 0). If the object is
 * larger than its header the full object is mapped in a second step. The
 * error values returned by the failed checks and the assignment to *ret are
 * not visible in this excerpt. */
415 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
424 /* Objects may only be located at multiple of 64 bit */
425 if (!VALID64(offset))
429 r = journal_file_move_to(f, type_to_context(type), false, offset, sizeof(ObjectHeader), &t);
/* Object size as recorded in the file, converted to host order. */
434 s = le64toh(o->object.size);
436 if (s < sizeof(ObjectHeader))
439 if (o->object.type <= OBJECT_UNUSED)
442 if (s < minimum_header_size(o))
445 if (type > 0 && o->object.type != type)
/* NOTE(review): the first mapping passes type_to_context(type) as the
 * context argument while this one passes the raw o->object.type -- confirm
 * that both are intended. */
448 if (s > sizeof(ObjectHeader)) {
449 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
/* Pick the sequence number for the next entry and record it in the header.
 * The candidate is the header's tail_entry_seqnum plus one; how the optional
 * external counter '*seqnum' is merged in is described by the comment below
 * but the code doing so is not visible in this excerpt. The result becomes
 * the new tail_entry_seqnum, and also head_entry_seqnum if that is still 0. */
460 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
465 r = le64toh(f->header->tail_entry_seqnum) + 1;
468 /* If an external seqnum counter was passed, we update
469 * both the local and the external one, and set it to
470 * the maximum of both */
478 f->header->tail_entry_seqnum = htole64(r);
480 if (f->header->head_entry_seqnum == 0)
481 f->header->head_entry_seqnum = htole64(r);
/* Append a new object of the given type and size at the end of the file.
 *
 * The file is switched online, the position after the current tail object
 * is computed (or the end of the header is used -- the condition selecting
 * between the two is not visible, presumably tail_object_offset == 0), space
 * is allocated, the new object is mapped and its header type/size are set.
 * The header's tail_object_offset and n_objects are updated accordingly.
 * Stores through 'ret' and 'offset' are not visible in this excerpt. */
486 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
493 assert(type > 0 && type < _OBJECT_TYPE_MAX);
494 assert(size >= sizeof(ObjectHeader));
498 r = journal_file_set_online(f);
502 p = le64toh(f->header->tail_object_offset);
504 p = le64toh(f->header->header_size);
506 r = journal_file_move_to_object(f, -1, p, &tail);
/* New objects start at the 64-bit-aligned end of the previous tail. */
510 p += ALIGN64(le64toh(tail->object.size));
513 r = journal_file_allocate(f, p, size);
517 r = journal_file_move_to(f, type, false, p, size, &t);
524 o->object.type = type;
525 o->object.size = htole64(size);
527 f->header->tail_object_offset = htole64(p);
528 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
/* Create the data hash table object in a new file.
 * The table size in bytes is derived from f->metrics.max_size (see the
 * sizing comment below) with DEFAULT_DATA_HASH_TABLE_SIZE as the lower
 * bound. The table is appended as an OBJECT_DATA_HASH_TABLE, zeroed, and its
 * item offset and byte size are recorded in the header. */
536 static int journal_file_setup_data_hash_table(JournalFile *f) {
543 /* We estimate that we need 1 hash table entry per 768 of
544 journal file and we want to make sure we never get beyond
545 75% fill level. Calculate the hash table size for the
546 maximum file size based on these metrics. */
548 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
549 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
550 s = DEFAULT_DATA_HASH_TABLE_SIZE;
552 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
554 r = journal_file_append_object(f,
555 OBJECT_DATA_HASH_TABLE,
556 offsetof(Object, hash_table.items) + s,
561 memzero(o->hash_table.items, s);
/* The header points at the first item, not at the object header. */
563 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
564 f->header->data_hash_table_size = htole64(s);
/* Create the field hash table object in a new file.
 * The table always has DEFAULT_FIELD_HASH_TABLE_SIZE bytes. It is appended
 * as an OBJECT_FIELD_HASH_TABLE, zeroed, and its item offset and byte size
 * are recorded in the header. */
569 static int journal_file_setup_field_hash_table(JournalFile *f) {
576 /* We use a fixed size hash table for the fields as this
577 * number should grow very slowly only */
579 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
580 r = journal_file_append_object(f,
581 OBJECT_FIELD_HASH_TABLE,
582 offsetof(Object, hash_table.items) + s,
587 memzero(o->hash_table.items, s);
/* The header points at the first item, not at the object header. */
589 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
590 f->header->field_hash_table_size = htole64(s);
/* Map the data hash table described by the header (offset and size are
 * read in host order) and remember the mapping in f->data_hash_table.
 * The remaining arguments of the journal_file_move_to() call are not
 * visible in this excerpt. */
595 static int journal_file_map_data_hash_table(JournalFile *f) {
602 p = le64toh(f->header->data_hash_table_offset);
603 s = le64toh(f->header->data_hash_table_size);
605 r = journal_file_move_to(f,
606 OBJECT_DATA_HASH_TABLE,
613 f->data_hash_table = t;
/* Map the field hash table described by the header (offset and size are
 * read in host order) and remember the mapping in f->field_hash_table.
 * The remaining arguments of the journal_file_move_to() call are not
 * visible in this excerpt. */
617 static int journal_file_map_field_hash_table(JournalFile *f) {
624 p = le64toh(f->header->field_hash_table_offset);
625 s = le64toh(f->header->field_hash_table_size);
627 r = journal_file_move_to(f,
628 OBJECT_FIELD_HASH_TABLE,
635 f->field_hash_table = t;
/* Link a freshly appended field object into the field hash table.
 * The object's chain pointers are reset, the bucket is chosen as
 * hash modulo the number of HashItems, and the object is attached either as
 * the bucket head or behind the current tail object (the condition choosing
 * between the two is not visible, presumably whether the tail offset is 0).
 * The bucket tail and, if the header has that field, n_fields are updated. */
639 static int journal_file_link_field(
652 if (o->object.type != OBJECT_FIELD)
655 /* This might alter the window we are looking at */
657 o->field.next_hash_offset = o->field.head_data_offset = 0;
659 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
660 p = le64toh(f->field_hash_table[h].tail_hash_offset);
662 f->field_hash_table[h].head_hash_offset = htole64(offset);
664 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
668 o->field.next_hash_offset = htole64(offset);
671 f->field_hash_table[h].tail_hash_offset = htole64(offset);
673 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
674 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
679 static int journal_file_link_data(
692 if (o->object.type != OBJECT_DATA)
695 /* This might alter the window we are looking at */
697 o->data.next_hash_offset = o->data.next_field_offset = 0;
698 o->data.entry_offset = o->data.entry_array_offset = 0;
699 o->data.n_entries = 0;
701 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
702 p = le64toh(f->data_hash_table[h].tail_hash_offset);
704 /* Only entry in the hash table is easy */
705 f->data_hash_table[h].head_hash_offset = htole64(offset);
707 /* Move back to the previous data object, to patch in
708 * pointer */
710 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
714 o->data.next_hash_offset = htole64(offset);
717 f->data_hash_table[h].tail_hash_offset = htole64(offset);
719 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
720 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
/* Look up a field object by name, given its precomputed hash.
 * The bucket 'hash modulo number of HashItems' is walked via
 * next_hash_offset; an object matches when its stored hash, its total size
 * and its payload bytes all equal the request. What is returned on a match,
 * on a miss, and when the table size is 0 is not visible in this excerpt.
 *
 * NOTE(review): only a table size of exactly 0 is tested before the modulo;
 * a non-zero size smaller than sizeof(HashItem) would make the divisor 0 --
 * confirm that such a header cannot reach this point. */
725 int journal_file_find_field_object_with_hash(
727 const void *field, uint64_t size, uint64_t hash,
728 Object **ret, uint64_t *offset) {
730 uint64_t p, osize, h;
734 assert(field && size > 0);
/* Size a matching object must have: fixed part plus the field name. */
736 osize = offsetof(Object, field.payload) + size;
738 if (f->header->field_hash_table_size == 0)
741 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
742 p = le64toh(f->field_hash_table[h].head_hash_offset);
747 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
751 if (le64toh(o->field.hash) == hash &&
752 le64toh(o->object.size) == osize &&
753 memcmp(o->field.payload, field, size) == 0) {
763 p = le64toh(o->field.next_hash_offset);
/* Convenience wrapper: hash the field name with hash64() and delegate to
 * journal_file_find_field_object_with_hash(). 'field' must be non-NULL and
 * 'size' greater than 0. */
769 int journal_file_find_field_object(
771 const void *field, uint64_t size,
772 Object **ret, uint64_t *offset) {
777 assert(field && size > 0);
779 hash = hash64(field, size);
781 return journal_file_find_field_object_with_hash(f,
/* Look up a data object by payload, given its precomputed hash.
 * The bucket 'hash modulo number of HashItems' is walked via
 * next_hash_offset. Objects whose stored hash differs are passed over. For
 * objects flagged OBJECT_COMPRESSED the payload is uncompressed into
 * f->compress_buffer and that buffer is compared; otherwise total size and
 * raw payload are compared. -EPROTONOSUPPORT is returned on one path
 * (presumably when compression support is compiled out -- confirm); other
 * return values are not visible in this excerpt. */
786 int journal_file_find_data_object_with_hash(
788 const void *data, uint64_t size, uint64_t hash,
789 Object **ret, uint64_t *offset) {
791 uint64_t p, osize, h;
795 assert(data || size == 0);
/* Size an uncompressed matching object must have. */
797 osize = offsetof(Object, data.payload) + size;
799 if (f->header->data_hash_table_size == 0)
802 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
803 p = le64toh(f->data_hash_table[h].head_hash_offset);
808 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
812 if (le64toh(o->data.hash) != hash)
815 if (o->object.flags & OBJECT_COMPRESSED) {
/* 'l' becomes the length of the compressed payload; an object with no
 * payload bytes at all is rejected by the check below. */
819 l = le64toh(o->object.size);
820 if (l <= offsetof(Object, data.payload))
823 l -= offsetof(Object, data.payload);
825 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0))
829 memcmp(f->compress_buffer, data, size) == 0) {
840 return -EPROTONOSUPPORT;
843 } else if (le64toh(o->object.size) == osize &&
844 memcmp(o->data.payload, data, size) == 0) {
856 p = le64toh(o->data.next_hash_offset);
/* Convenience wrapper: hash the payload with hash64() and delegate to
 * journal_file_find_data_object_with_hash(). 'data' may be NULL only when
 * 'size' is 0. */
862 int journal_file_find_data_object(
864 const void *data, uint64_t size,
865 Object **ret, uint64_t *offset) {
870 assert(data || size == 0);
872 hash = hash64(data, size);
874 return journal_file_find_data_object_with_hash(f,
/* Return the field object for the given field name, creating it if needed.
 * An existing object is searched by hash first (the handling of the lookup
 * result is not visible). Otherwise a new OBJECT_FIELD is appended, filled
 * with hash and name, linked into the field hash table, re-mapped and fed
 * to journal_file_hmac_put_object(). */
879 static int journal_file_append_field(
881 const void *field, uint64_t size,
882 Object **ret, uint64_t *offset) {
890 assert(field && size > 0);
892 hash = hash64(field, size);
894 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
908 osize = offsetof(Object, field.payload) + size;
909 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
913 o->field.hash = htole64(hash);
914 memcpy(o->field.payload, field, size);
916 r = journal_file_link_field(f, o, p, hash);
920 /* The linking might have altered the window, so let's
921 * refresh our pointer */
922 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
927 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
/* Return the data object for the given payload, creating it if needed.
 * An existing object is searched by hash first (the handling of the lookup
 * result is not visible). Otherwise a new OBJECT_DATA is appended; payloads
 * of at least COMPRESSION_SIZE_THRESHOLD bytes may be stored compressed
 * (the first part of that condition is not visible), else the bytes are
 * copied verbatim. The object is linked into the data hash table, and if
 * the payload contains '=' after at least one byte, the part before it is
 * registered as a field object and this data object is pushed onto that
 * field's data list. Finally the object is fed to
 * journal_file_hmac_put_object(). */
941 static int journal_file_append_data(
943 const void *data, uint64_t size,
944 Object **ret, uint64_t *offset) {
950 bool compressed = false;
954 assert(data || size == 0);
956 hash = hash64(data, size);
958 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
972 osize = offsetof(Object, data.payload) + size;
973 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
977 o->data.hash = htole64(hash);
981 size >= COMPRESSION_SIZE_THRESHOLD) {
984 compressed = compress_blob(data, size, o->data.payload, &rsize);
/* On successful compression the recorded object size shrinks to the
 * compressed length and the object is flagged accordingly (the guard
 * around these lines is not visible). */
987 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
988 o->object.flags |= OBJECT_COMPRESSED;
990 log_debug("Compressed data object %"PRIu64" -> %"PRIu64, size, rsize);
995 if (!compressed && size > 0)
996 memcpy(o->data.payload, data, size);
998 r = journal_file_link_data(f, o, p, hash);
1002 /* The linking might have altered the window, so let's
1003 * refresh our pointer */
1004 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1011 eq = memchr(data, '=', size);
1012 if (eq && eq > data) {
1016 /* Create field object ... */
1017 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1021 /* ... and link it in. */
1022 o->data.next_field_offset = fo->field.head_data_offset;
/* NOTE(review): 'p' is a host-order offset being stored into an on-disk
 * field; every other such store in this file uses htole64(). le64toh()
 * performs the same byte operation, so the result is identical, but
 * htole64() would state the intent -- confirm and align. */
1023 fo->field.head_data_offset = le64toh(p);
1027 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
/* Number of EntryItems stored in an entry object, derived from the object
 * size recorded in its header. The value returned for objects of another
 * type is not visible in this excerpt. */
1041 uint64_t journal_file_entry_n_items(Object *o) {
1044 if (o->object.type != OBJECT_ENTRY)
1047 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
/* Number of 64-bit offsets stored in an entry array object, derived from
 * the object size recorded in its header. The value returned for objects of
 * another type is not visible in this excerpt. */
1050 uint64_t journal_file_entry_array_n_items(Object *o) {
1053 if (o->object.type != OBJECT_ENTRY_ARRAY)
1056 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
/* Number of HashItems stored in a data or field hash table object, derived
 * from the object size recorded in its header. The value returned for
 * objects of another type is not visible in this excerpt. */
1059 uint64_t journal_file_hash_table_n_items(Object *o) {
1062 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1063 o->object.type != OBJECT_FIELD_HASH_TABLE)
1066 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
/* Store entry offset 'p' at logical index *idx of the entry array chain
 * that starts at *first, and increment *idx. '*first' and '*idx' are
 * on-disk (little-endian) values.
 *
 * The chain is walked array by array; if the index falls into an existing
 * array the slot is written there. Otherwise a new OBJECT_ENTRY_ARRAY is
 * appended, 'p' is stored in it, and it is hooked up either as the first
 * array (*first) or behind the previous array 'ap'. The loop structure, the
 * sizing of the new array and the branch conditions are not visible in this
 * excerpt. */
1069 static int link_entry_into_array(JournalFile *f,
1074 uint64_t n = 0, ap = 0, q, i, a, hidx;
1082 a = le64toh(*first);
1083 i = hidx = le64toh(*idx);
1086 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1090 n = journal_file_entry_array_n_items(o);
1092 o->entry_array.items[i] = htole64(p);
1093 *idx = htole64(hidx + 1);
1099 a = le64toh(o->entry_array.next_entry_array_offset);
1110 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1111 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1117 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1122 o->entry_array.items[i] = htole64(p);
1125 *first = htole64(q);
1127 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1131 o->entry_array.next_entry_array_offset = htole64(q);
/* Older headers may lack n_entry_arrays; only count when it exists. */
1134 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1135 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1137 *idx = htole64(hidx + 1);
/* Variant of link_entry_into_array() for lists that keep one entry inline
 * in '*extra' in addition to the array chain. In one branch 'p' is stored
 * directly in *extra; in the other it is linked into the chain at index
 * *idx - 1. *idx is then incremented. The condition selecting the branch is
 * not visible (presumably *idx == 0 -- confirm). */
1142 static int link_entry_into_array_plus_one(JournalFile *f,
1157 *extra = htole64(p);
1161 i = htole64(le64toh(*idx) - 1);
1162 r = link_entry_into_array(f, first, &i, p);
1167 *idx = htole64(le64toh(*idx) + 1);
/* Register the entry at 'offset' with the data object referenced by item
 * 'i' of entry object 'o'. The data object is mapped (re-using 'o' as the
 * pointer) and the entry is added to its entry list via
 * link_entry_into_array_plus_one(). The remaining call arguments are not
 * visible in this excerpt. */
1171 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1178 p = le64toh(o->entry.items[i].object_offset);
1182 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1186 return link_entry_into_array_plus_one(f,
1187 &o->data.entry_offset,
1188 &o->data.entry_array_offset,
/* Make a fully written entry object reachable: add it to the global entry
 * array chain rooted in the header, update the header's head/tail
 * timestamps, and link it to each data object it references. */
1193 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1201 if (o->object.type != OBJECT_ENTRY)
/* Full memory barrier before the entry is published via the arrays. */
1204 __sync_synchronize();
1206 /* Link up the entry itself */
1207 r = link_entry_into_array(f,
1208 &f->header->entry_array_offset,
1209 &f->header->n_entries,
1214 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
/* Values are copied between on-disk fields, so no byte-order conversion. */
1216 if (f->header->head_entry_realtime == 0)
1217 f->header->head_entry_realtime = o->entry.realtime;
1219 f->header->tail_entry_realtime = o->entry.realtime;
1220 f->header->tail_entry_monotonic = o->entry.monotonic;
1222 f->tail_entry_monotonic_valid = true;
1224 /* Link up the items */
1225 n = journal_file_entry_n_items(o);
1226 for (i = 0; i < n; i++) {
1227 r = journal_file_link_entry_item(f, o, offset, i);
/* Append an entry object built from already-appended data items.
 * The object is sized for n_items EntryItems, filled with a new sequence
 * number, the item array, both timestamps, the xor hash and the header's
 * boot ID, fed to journal_file_hmac_put_object() and finally linked in via
 * journal_file_link_entry(). 'items' may be NULL only when n_items is 0. */
1235 static int journal_file_append_entry_internal(
1237 const dual_timestamp *ts,
1239 const EntryItem items[], unsigned n_items,
1241 Object **ret, uint64_t *offset) {
1248 assert(items || n_items == 0);
1251 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1253 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1257 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1258 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1259 o->entry.realtime = htole64(ts->realtime);
1260 o->entry.monotonic = htole64(ts->monotonic);
1261 o->entry.xor_hash = htole64(xor_hash);
1262 o->entry.boot_id = f->header->boot_id;
1265 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1270 r = journal_file_link_entry(f, o, np);
/* Notify inotify watchers that the file changed. The file is "truncated"
 * to the size cached in f->last_stat, which leaves its content untouched;
 * a failure is only logged. */
1283 void journal_file_post_change(JournalFile *f) {
1286 /* inotify() does not receive IN_MODIFY events from file
1287 * accesses done via mmap(). After each access we hence
1288 * trigger IN_MODIFY by truncating the journal file to its
1289 * current size which triggers IN_MODIFY. */
1291 __sync_synchronize();
1293 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1294 log_error("Failed to truncate file to its own size: %m");
/* Comparison callback for sorting EntryItems by their object_offset
 * (decoded from little-endian). The values returned in each case are not
 * visible in this excerpt. */
1297 static int entry_item_cmp(const void *_a, const void *_b) {
1298 const EntryItem *a = _a, *b = _b;
1300 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1302 if (le64toh(a->object_offset) > le64toh(b->object_offset))
/* Append one log entry consisting of n_iovec fields.
 * Each iovec is stored (or found) as a data object; the resulting offsets
 * and hashes form the entry's item array, which is sorted by offset before
 * the entry object itself is written. The file is then poked so that
 * inotify watchers notice the change.
 *
 * A current timestamp is taken into the local '_ts' (presumably when 'ts' is
 * NULL -- the condition is not visible). An entry whose monotonic timestamp
 * is older than the file's tail entry is detected by the check below; the
 * value returned in that case is not visible in this excerpt. */
1307 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1311 uint64_t xor_hash = 0;
1312 struct dual_timestamp _ts;
1315 assert(iovec || n_iovec == 0);
1318 dual_timestamp_get(&_ts);
1322 if (f->tail_entry_monotonic_valid &&
1323 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1327 r = journal_file_maybe_append_tag(f, ts->realtime);
1332 /* alloca() can't take 0, hence let's allocate at least one */
1333 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1335 for (i = 0; i < n_iovec; i++) {
1339 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
/* The entry-wide hash is the XOR of the hashes of all its data objects. */
1343 xor_hash ^= le64toh(o->data.hash);
1344 items[i].object_offset = htole64(p);
1345 items[i].hash = o->data.hash;
1348 /* Order by the position on disk, in order to improve seek
1349 * times for rotating media. */
1350 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1352 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1354 journal_file_post_change(f);
/* One cached position inside an entry array chain. Instances are stored in
 * f->chain_cache keyed by 'first' and let lookups resume at a previously
 * visited array instead of walking the chain from its start. */
1359 typedef struct ChainCacheItem {
1360 uint64_t first; /* the array at the beginning of the chain */
1361 uint64_t array; /* the cached array */
1362 uint64_t begin; /* the first item in the cached array */
1363 uint64_t total; /* the total number of items in all arrays before this one in the chain */
1364 uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
/* Insert or update the cache item for the chain starting at 'first'.
 * When a new item is needed and the hashmap already holds CHAIN_CACHE_MAX
 * items, an existing item is stolen from the map and recycled; otherwise a
 * new one is allocated. The item is (re-)inserted under its 'first' key and
 * its fields are updated. The conditions around these steps and the
 * handling of allocation or insertion failure are not visible here. */
1367 static void chain_cache_put(
1374 uint64_t last_index) {
1377 /* If the chain item to cache for this chain is the
1378 * first one it's not worth caching anything */
1382 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1383 ci = hashmap_steal_first(h);
1385 ci = new(ChainCacheItem, 1);
1392 if (hashmap_put(h, &ci->first, ci) < 0) {
1397 assert(ci->first == first);
1402 ci->last_index = last_index;
/* Fetch the entry stored at logical index 'i' of the entry array chain
 * starting at 'first'. The chain cache is consulted first so the walk can
 * start at a previously visited array; the chain is then followed via
 * next_entry_array_offset until the array containing the index is reached.
 * The position found is cached and the referenced entry object is mapped.
 * The loop structure and the return values are not visible here. */
1405 static int generic_array_get(
1409 Object **ret, uint64_t *offset) {
1412 uint64_t p = 0, a, t = 0;
1420 /* Try the chain cache first */
1421 ci = hashmap_get(f->chain_cache, &first);
1422 if (ci && i > ci->total) {
1431 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1435 k = journal_file_entry_array_n_items(o);
1437 p = le64toh(o->entry_array.items[i]);
1443 a = le64toh(o->entry_array.next_entry_array_offset);
1449 /* Let's cache this item for the next invocation */
1450 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1452 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Like generic_array_get(), for lists that keep one entry inline in
 * 'extra' ahead of the array chain: one branch maps the entry at 'extra'
 * directly, the other delegates to generic_array_get() with index i-1. The
 * condition selecting the branch is not visible (presumably i == 0). */
1465 static int generic_array_get_plus_one(
1470 Object **ret, uint64_t *offset) {
1479 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1492 return generic_array_get(f, first, i-1, ret, offset);
1501 static int generic_array_bisect(
1506 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1507 direction_t direction,
1512 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1513 bool subtract_one = false;
1514 Object *o, *array = NULL;
1519 assert(test_object);
1521 /* Start with the first array in the chain */
1524 ci = hashmap_get(f->chain_cache, &first);
1525 if (ci && n > ci->total) {
1526 /* Ah, we have iterated this bisection array chain
1527 * previously! Let's see if we can skip ahead in the
1528 * chain, as far as the last time. But we can't jump
1529 * backwards in the chain, so let's check that
1530 * first. */
1532 r = test_object(f, ci->begin, needle);
1536 if (r == TEST_LEFT) {
1537 /* OK, what we are looking for is right of the
1538 * begin of this EntryArray, so let's jump
1539 * straight to previously cached array in the
1540 * chain */
1545 last_index = ci->last_index;
1550 uint64_t left, right, k, lp;
1552 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1556 k = journal_file_entry_array_n_items(array);
1562 lp = p = le64toh(array->entry_array.items[i]);
1566 r = test_object(f, p, needle);
1570 if (r == TEST_FOUND)
1571 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1573 if (r == TEST_RIGHT) {
1577 if (last_index != (uint64_t) -1) {
1578 assert(last_index <= right);
1580 /* If we cached the last index we
1581 * looked at, let's try to not to jump
1582 * too wildly around and see if we can
1583 * limit the range to look at early to
1584 * the immediate neighbors of the last
1585 * index we looked at. */
1587 if (last_index > 0) {
1588 uint64_t x = last_index - 1;
1590 p = le64toh(array->entry_array.items[x]);
1594 r = test_object(f, p, needle);
1598 if (r == TEST_FOUND)
1599 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1601 if (r == TEST_RIGHT)
1607 if (last_index < right) {
1608 uint64_t y = last_index + 1;
1610 p = le64toh(array->entry_array.items[y]);
1614 r = test_object(f, p, needle);
1618 if (r == TEST_FOUND)
1619 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1621 if (r == TEST_RIGHT)
1629 if (left == right) {
1630 if (direction == DIRECTION_UP)
1631 subtract_one = true;
1637 assert(left < right);
1638 i = (left + right) / 2;
1640 p = le64toh(array->entry_array.items[i]);
1644 r = test_object(f, p, needle);
1648 if (r == TEST_FOUND)
1649 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1651 if (r == TEST_RIGHT)
1659 if (direction == DIRECTION_UP) {
1661 subtract_one = true;
1672 last_index = (uint64_t) -1;
1673 a = le64toh(array->entry_array.next_entry_array_offset);
1679 if (subtract_one && t == 0 && i == 0)
1682 /* Let's cache this item for the next invocation */
1683 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1685 if (subtract_one && i == 0)
1687 else if (subtract_one)
1688 p = le64toh(array->entry_array.items[i-1]);
1690 p = le64toh(array->entry_array.items[i]);
1692 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1703 *idx = t + i + (subtract_one ? -1 : 0);
1709 static int generic_array_bisect_plus_one(
1715 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1716 direction_t direction,
1722 bool step_back = false;
1726 assert(test_object);
1731 /* This bisects the array in object 'first', but first checks
1732 * the additional entry at offset 'extra' */
1733 r = test_object(f, extra, needle);
1737 if (r == TEST_FOUND)
1738 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1740 /* if we are looking with DIRECTION_UP then we need to first
1741 see if in the actual array there is a matching entry, and
1742 return the last one of that. But if there isn't any we need
1743 to return this one. Hence remember this, and return it
1744 below. */
1746 step_back = direction == DIRECTION_UP;
1748 if (r == TEST_RIGHT) {
1749 if (direction == DIRECTION_DOWN)
1755 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1757 if (r == 0 && step_back)
1766 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
/* Bisection predicate comparing an entry offset 'p' directly with the
 * needle; no object is mapped. The values returned for each ordering are
 * not visible in this excerpt. */
1782 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1788 else if (p < needle)
/* Seek to an entry by file offset: bisects the global entry array chain
 * (header entry_array_offset, n_entries). The remaining arguments of the
 * call are not visible in this excerpt. */
1794 int journal_file_move_to_entry_by_offset(
1797 direction_t direction,
1801 return generic_array_bisect(f,
1802 le64toh(f->header->entry_array_offset),
1803 le64toh(f->header->n_entries),
/* Bisection predicate: maps the entry at 'p' and compares its sequence
 * number with the needle. The values returned for each ordering are not
 * visible in this excerpt. */
1811 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1818 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1822 if (le64toh(o->entry.seqnum) == needle)
1824 else if (le64toh(o->entry.seqnum) < needle)
/* Seek to an entry by sequence number: bisects the global entry array
 * chain (header entry_array_offset, n_entries). The remaining arguments of
 * the call are not visible in this excerpt. */
1830 int journal_file_move_to_entry_by_seqnum(
1833 direction_t direction,
1837 return generic_array_bisect(f,
1838 le64toh(f->header->entry_array_offset),
1839 le64toh(f->header->n_entries),
/* Bisection predicate: maps the entry at 'p' and compares its realtime
 * timestamp with the needle. The values returned for each ordering are not
 * visible in this excerpt. */
1846 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1853 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1857 if (le64toh(o->entry.realtime) == needle)
1859 else if (le64toh(o->entry.realtime) < needle)
/* Seek to an entry by realtime timestamp: bisects the global entry array
 * chain (header entry_array_offset, n_entries) using test_object_realtime
 * as the predicate. */
1865 int journal_file_move_to_entry_by_realtime(
1868 direction_t direction,
1872 return generic_array_bisect(f,
1873 le64toh(f->header->entry_array_offset),
1874 le64toh(f->header->n_entries),
1876 test_object_realtime,
/* Bisection predicate: maps the entry at 'p' and compares its monotonic
 * timestamp with the needle. The values returned for each ordering are not
 * visible in this excerpt. */
1881 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1888 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1892 if (le64toh(o->entry.monotonic) == needle)
1894 else if (le64toh(o->entry.monotonic) < needle)
/* Look up the data object for the field "_BOOT_ID=<id>". The buffer holds
 * the 9-character prefix, 32 characters for the formatted ID and the
 * terminating NUL; the lookup length excludes the NUL. */
1900 static inline int find_data_object_by_boot_id(
1905 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
/* 9 == strlen("_BOOT_ID="): the ID is written right after the prefix. */
1907 sd_id128_to_string(boot_id, t + 9);
1908 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
/* Seek to an entry by monotonic timestamp within one boot. The _BOOT_ID=
 * data object of the boot is located first; its entry list (inline entry,
 * array chain and n_entries) is then bisected with test_object_monotonic
 * as the predicate. Handling of a failed lookup is not visible here. */
1911 int journal_file_move_to_entry_by_monotonic(
1915 direction_t direction,
1924 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1930 return generic_array_bisect_plus_one(f,
1931 le64toh(o->data.entry_offset),
1932 le64toh(o->data.entry_array_offset),
1933 le64toh(o->data.n_entries),
1935 test_object_monotonic,
/* Step to the entry following (DIRECTION_DOWN) or preceding the entry 'o'
 * at offset 'p' in the global entry array. One branch starts at the first
 * or last index (presumably when 'o' is NULL -- confirm); the other locates
 * the index of 'o' by bisection and steps from there. The resulting index
 * is fetched with generic_array_get(), and a result that does not advance
 * in the requested direction is logged as entry array corruption. Return
 * values are not visible in this excerpt. */
1940 int journal_file_next_entry(
1942 Object *o, uint64_t p,
1943 direction_t direction,
1944 Object **ret, uint64_t *offset) {
1950 assert(p > 0 || !o);
1952 n = le64toh(f->header->n_entries);
1957 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1959 if (o->object.type != OBJECT_ENTRY)
1962 r = generic_array_bisect(f,
1963 le64toh(f->header->entry_array_offset),
1964 le64toh(f->header->n_entries),
1973 if (direction == DIRECTION_DOWN) {
1986 /* And jump to it */
1987 r = generic_array_get(f,
1988 le64toh(f->header->entry_array_offset),
1995 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
1996 log_debug("%s: entry array corrupted at entry %"PRIu64,
/* Moves a signed number of entries (skip) away from the entry (o, p) in the
 * file-wide entry array. The visible lines locate the current entry's index
 * by bisection, adjust the index by skip and fetch the entry at the new
 * index with generic_array_get(). */
2007 int journal_file_skip_entry(
2009 Object *o, uint64_t p,
2011 Object **ret, uint64_t *offset) {
/* The object handed in has to be an entry */
2020 if (o->object.type != OBJECT_ENTRY)
/* Find the index of the current entry in the file-wide entry array */
2023 r = generic_array_bisect(f,
2024 le64toh(f->header->entry_array_offset),
2025 le64toh(f->header->n_entries),
2034 /* Calculate new index */
/* Backward case: -skip is the distance to move. The comparison against i
 * precedes the subtraction below; what is done when the distance reaches or
 * exceeds i is not visible here */
2036 if ((uint64_t) -skip >= i)
2039 i = i - (uint64_t) -skip;
/* Forward case */
2041 i += (uint64_t) skip;
2043 n = le64toh(f->header->n_entries);
2050 return generic_array_get(f,
2051 le64toh(f->header->entry_array_offset),
/* Like stepping to the neighbouring entry, but restricted to the entries
 * linked from the data object at data_offset. The data object's own entry
 * list (entry_offset, entry_array_offset, n_entries) is used instead of the
 * file-wide entry array. */
2056 int journal_file_next_entry_for_data(
2058 Object *o, uint64_t p,
2059 uint64_t data_offset,
2060 direction_t direction,
2061 Object **ret, uint64_t *offset) {
/* An object may only be passed together with a non-zero offset */
2068 assert(p > 0 || !o);
/* Map the data object whose entry list is walked */
2070 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2074 n = le64toh(d->data.n_entries);
/* Start index at an end of the list: first slot when going down, last slot
 * when going up. Presumably this is the branch without a current entry --
 * TODO confirm */
2079 i = direction == DIRECTION_DOWN ? 0 : n - 1;
/* The object handed in has to be an entry */
2081 if (o->object.type != OBJECT_ENTRY)
/* Find the current entry within the data object's entry list */
2084 r = generic_array_bisect_plus_one(f,
2085 le64toh(d->data.entry_offset),
2086 le64toh(d->data.entry_array_offset),
2087 le64toh(d->data.n_entries),
2097 if (direction == DIRECTION_DOWN) {
/* Fetch the entry at the index determined above */
2111 return generic_array_get_plus_one(f,
2112 le64toh(d->data.entry_offset),
2113 le64toh(d->data.entry_array_offset),
/* Seeks by offset among the entries linked from the data object at
 * data_offset: maps that data object and bisects its entry list. The needle
 * and comparator arguments are not visible here. */
2118 int journal_file_move_to_entry_by_offset_for_data(
2120 uint64_t data_offset,
2122 direction_t direction,
2123 Object **ret, uint64_t *offset) {
2130 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2134 return generic_array_bisect_plus_one(f,
2135 le64toh(d->data.entry_offset),
2136 le64toh(d->data.entry_array_offset),
2137 le64toh(d->data.n_entries),
/* Seeks by monotonic timestamp among the entries linked from the data
 * object at data_offset. Two entry lists are involved: the one of the
 * boot ID data object, which is searched by time, and the one of the
 * requested data object. The visible lines bisect them in turn until, as the
 * existing comment puts it, an entry is found that exists in both. */
2144 int journal_file_move_to_entry_by_monotonic_for_data(
2146 uint64_t data_offset,
2149 direction_t direction,
2150 Object **ret, uint64_t *offset) {
2158 /* First, seek by time */
/* b receives the offset of the boot ID data object so that it can be mapped
 * again further down */
2159 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2165 r = generic_array_bisect_plus_one(f,
2166 le64toh(o->data.entry_offset),
2167 le64toh(o->data.entry_array_offset),
2168 le64toh(o->data.n_entries),
2170 test_object_monotonic,
2176 /* And now, continue seeking until we find an entry that
2177 * exists in both bisection arrays */
/* Bisect the requested data object's entry list */
2183 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2187 r = generic_array_bisect_plus_one(f,
2188 le64toh(d->data.entry_offset),
2189 le64toh(d->data.entry_array_offset),
2190 le64toh(d->data.n_entries),
/* Map the boot ID data object again through its saved offset b before using
 * it; presumably the earlier pointer may no longer be valid after the
 * lookups in between -- TODO confirm */
2198 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2202 r = generic_array_bisect_plus_one(f,
2203 le64toh(o->data.entry_offset),
2204 le64toh(o->data.entry_array_offset),
2205 le64toh(o->data.n_entries),
/* Seeks by sequence number among the entries linked from the data object at
 * data_offset: maps that data object and bisects its entry list. The needle
 * and comparator arguments are not visible here. */
2227 int journal_file_move_to_entry_by_seqnum_for_data(
2229 uint64_t data_offset,
2231 direction_t direction,
2232 Object **ret, uint64_t *offset) {
2239 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2243 return generic_array_bisect_plus_one(f,
2244 le64toh(d->data.entry_offset),
2245 le64toh(d->data.entry_array_offset),
2246 le64toh(d->data.n_entries),
/* Seeks by realtime timestamp among the entries linked from the data object
 * at data_offset: maps that data object and bisects its entry list with
 * test_object_realtime as the comparator. */
2253 int journal_file_move_to_entry_by_realtime_for_data(
2255 uint64_t data_offset,
2257 direction_t direction,
2258 Object **ret, uint64_t *offset) {
2265 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2269 return generic_array_bisect_plus_one(f,
2270 le64toh(d->data.entry_offset),
2271 le64toh(d->data.entry_array_offset),
2272 le64toh(d->data.n_entries),
2274 test_object_realtime,
/* Debugging aid: prints the header of the journal file followed by one line
 * per object, walking the objects from the end of the header up to the tail
 * object. Output goes to stdout via printf(). */
2279 void journal_file_dump(JournalFile *f) {
2286 journal_file_print_header(f);
/* The first object is located right after the header */
2288 p = le64toh(f->header->header_size);
/* -1 is passed as the type; presumably this accepts an object of any type --
 * TODO confirm against journal_file_move_to_object() */
2290 r = journal_file_move_to_object(f, -1, p, &o);
/* Print a line naming the object type; entry and tag objects additionally
 * show some of their fields */
2294 switch (o->object.type) {
2297 printf("Type: OBJECT_UNUSED\n");
2301 printf("Type: OBJECT_DATA\n");
2305 printf("Type: OBJECT_FIELD\n");
2309 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2310 le64toh(o->entry.seqnum),
2311 le64toh(o->entry.monotonic),
2312 le64toh(o->entry.realtime));
2315 case OBJECT_FIELD_HASH_TABLE:
2316 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2319 case OBJECT_DATA_HASH_TABLE:
2320 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2323 case OBJECT_ENTRY_ARRAY:
2324 printf("Type: OBJECT_ENTRY_ARRAY\n");
2328 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2329 le64toh(o->tag.seqnum),
2330 le64toh(o->tag.epoch));
2334 printf("Type: unknown (%u)\n", o->object.type);
2338 if (o->object.flags & OBJECT_COMPRESSED)
2339 printf("Flags: COMPRESSED\n");
/* The object recorded as tail in the header is the last one to visit */
2341 if (p == le64toh(f->header->tail_object_offset))
/* Advance by the object size rounded up with ALIGN64 */
2344 p = p + ALIGN64(le64toh(o->object.size));
2349 log_error("File corrupt");
/* Formats timestamp t into buf (of size l) by calling format_timestamp().
 * How the result x is turned into the return value is not visible here. */
2352 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2355 x = format_timestamp(buf, l, t);
/* Prints a human readable summary of the journal file header to stdout.
 * Multi-byte header fields are converted from little-endian before being
 * printed. Counters that were added to the header later are only printed
 * when JOURNAL_HEADER_CONTAINS() reports them as present. */
2361 void journal_file_print_header(JournalFile *f) {
/* Buffers for the four 128 bit IDs formatted by sd_id128_to_string() */
2362 char a[33], b[33], c[33], d[33];
/* Buffers for the formatted timestamps and the time span */
2363 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2365 char bytes[FORMAT_BYTES_MAX];
2369 printf("File Path: %s\n"
2373 "Sequential Number ID: %s\n"
2375 "Compatible Flags:%s%s\n"
2376 "Incompatible Flags:%s%s\n"
2377 "Header size: %"PRIu64"\n"
2378 "Arena size: %"PRIu64"\n"
2379 "Data Hash Table Size: %"PRIu64"\n"
2380 "Field Hash Table Size: %"PRIu64"\n"
2381 "Rotate Suggested: %s\n"
2382 "Head Sequential Number: %"PRIu64"\n"
2383 "Tail Sequential Number: %"PRIu64"\n"
2384 "Head Realtime Timestamp: %s\n"
2385 "Tail Realtime Timestamp: %s\n"
2386 "Tail Monotonic Timestamp: %s\n"
2387 "Objects: %"PRIu64"\n"
2388 "Entry Objects: %"PRIu64"\n",
2390 sd_id128_to_string(f->header->file_id, a),
2391 sd_id128_to_string(f->header->machine_id, b),
2392 sd_id128_to_string(f->header->boot_id, c),
2393 sd_id128_to_string(f->header->seqnum_id, d),
/* Any state value other than the three known ones is shown as UNKNOWN */
2394 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2395 f->header->state == STATE_ONLINE ? "ONLINE" :
2396 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
/* For both flag words: name the one known flag if it is set, and print ???
 * if any bit other than that flag is set */
2397 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2398 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2399 JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2400 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2401 le64toh(f->header->header_size),
2402 le64toh(f->header->arena_size),
/* The header stores hash table sizes in bytes; dividing by the item size
 * yields the number of items */
2403 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2404 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2405 yes_no(journal_file_rotate_suggested(f, 0)),
2406 le64toh(f->header->head_entry_seqnum),
2407 le64toh(f->header->tail_entry_seqnum),
2408 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2409 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2410 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2411 le64toh(f->header->n_objects),
2412 le64toh(f->header->n_entries));
/* Fill level is the object count relative to the number of hash table
 * items, as a percentage */
2414 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2415 printf("Data Objects: %"PRIu64"\n"
2416 "Data Hash Table Fill: %.1f%%\n",
2417 le64toh(f->header->n_data),
2418 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2420 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2421 printf("Field Objects: %"PRIu64"\n"
2422 "Field Hash Table Fill: %.1f%%\n",
2423 le64toh(f->header->n_fields),
2424 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2426 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2427 printf("Tag Objects: %"PRIu64"\n",
2428 le64toh(f->header->n_tags));
2429 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2430 printf("Entry Array Objects: %"PRIu64"\n",
2431 le64toh(f->header->n_entry_arrays));
/* Disk usage is only printed if fstat() succeeds; the block count is
 * multiplied by 512 to get bytes */
2433 if (fstat(f->fd, &st) >= 0)
2434 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
/* Opens a journal file, creating and initializing it when it is empty and
 * opened writable. The visible steps are: validate access mode and file
 * name, allocate and fill in the JournalFile, open and stat the file,
 * initialize the header of a new file, map and verify the header, set up
 * metrics and authentication, and set up or map the hash tables. */
2437 int journal_file_open(
2443 JournalMetrics *metrics,
2444 MMapCache *mmap_cache,
2445 JournalFile *template,
2446 JournalFile **ret) {
2450 bool newly_created = false;
/* Only read-only and read-write access pass this check */
2455 if ((flags & O_ACCMODE) != O_RDONLY &&
2456 (flags & O_ACCMODE) != O_RDWR)
/* The file name has to carry one of the two journal suffixes */
2459 if (!endswith(fname, ".journal") &&
2460 !endswith(fname, ".journal~"))
2463 f = new0(JournalFile, 1);
2471 f->prot = prot_from_flags(flags);
2472 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2474 f->compress = compress;
/* Either take a reference on the mmap cache handed in or create a new one;
 * presumably selected by whether mmap_cache is set -- TODO confirm */
2481 f->mmap = mmap_cache_ref(mmap_cache);
2483 f->mmap = mmap_cache_new();
2490 f->path = strdup(fname);
2496 f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2497 if (!f->chain_cache) {
/* O_CLOEXEC is always added to the caller's flags */
2502 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2508 if (fstat(f->fd, &f->last_stat) < 0) {
/* A zero-sized file opened writable is set up as a new journal file */
2513 if (f->last_stat.st_size == 0 && f->writable) {
2517 /* Let's attach the creation time to the journal file,
2518 * so that the vacuuming code knows the age of this
2519 * file even if the file might end up corrupted one
2520 * day... Ideally we'd just use the creation time many
2521 * file systems maintain for each file, but there is
2522 * currently no usable API to query this, hence let's
2523 * emulate this via extended attributes. If extended
2524 * attributes are not supported we'll just skip this,
2525 * and rely solely on mtime/atime/ctime of the file.*/
/* Stored little-endian; the return value is deliberately not checked, in
 * line with the comment above */
2527 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2528 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2532 /* Try to load the FSPRG state, and if we can't, then
2533 * just don't do sealing */
2535 r = journal_file_fss_load(f);
2541 r = journal_file_init_header(f, template);
/* Stat again, now that the header has been initialized */
2545 if (fstat(f->fd, &f->last_stat) < 0) {
2550 newly_created = true;
/* The file has to be large enough to hold at least the minimal header */
2553 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
/* Map the header from the start of the file as a shared mapping */
2558 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2559 if (f->header == MAP_FAILED) {
/* Pre-existing files get their header verified */
2565 if (!newly_created) {
2566 r = journal_file_verify_header(f);
/* Pre-existing files opened writable load the FSS state here; for new files
 * this call is made further up */
2572 if (!newly_created && f->writable) {
2573 r = journal_file_fss_load(f);
/* Metrics come from the metrics argument, after defaults have been filled
 * in, or else are copied from the template */
2581 journal_default_metrics(metrics, f->fd);
2582 f->metrics = *metrics;
2583 } else if (template)
2584 f->metrics = template->metrics;
2586 r = journal_file_refresh_header(f);
2592 r = journal_file_hmac_setup(f);
/* New files get their hash tables and first tag created */
2597 if (newly_created) {
2598 r = journal_file_setup_field_hash_table(f);
2602 r = journal_file_setup_data_hash_table(f);
2607 r = journal_file_append_first_tag(f);
/* Map both hash tables */
2613 r = journal_file_map_field_hash_table(f);
2617 r = journal_file_map_data_hash_table(f);
/* Presumably the failure path: close what has been set up so far -- TODO
 * confirm */
2625 journal_file_close(f);
/* Rotates a journal file: the current file is renamed to an archive name
 * derived from its seqnum ID, head seqnum and head realtime timestamp, is
 * marked archived and closed, and a new file is opened under the original
 * path using the old file as template. */
2630 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
/* p holds the archive path and is freed automatically on return */
2631 _cleanup_free_ char *p = NULL;
2633 JournalFile *old_file, *new_file = NULL;
/* Only writable files can be rotated */
2641 if (!old_file->writable)
/* Only files with the plain .journal suffix are handled */
2644 if (!endswith(old_file->path, ".journal"))
/* Build the archive name: the path minus its 8 character .journal suffix,
 * then @, the seqnum ID and the two head values in hex, then the suffix
 * again */
2647 l = strlen(old_file->path);
2648 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2649 (int) l - 8, old_file->path,
2650 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2651 le64toh((*f)->header->head_entry_seqnum),
2652 le64toh((*f)->header->head_entry_realtime));
2656 r = rename(old_file->path, p);
/* Mark the renamed file as archived in its header */
2660 old_file->header->state = STATE_ARCHIVED;
/* Open the replacement under the original path, reusing flags, mode and
 * mmap cache of the old file, and only then close the old file */
2662 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2663 journal_file_close(old_file);
/* Opens a journal file and, for a fixed set of errors, renames the existing
 * file out of the way and tries once more. The retry is subject to further
 * checks on the flags and the file name visible below. */
2669 int journal_file_open_reliably(
2675 JournalMetrics *metrics,
2676 MMapCache *mmap_cache,
2677 JournalFile *template,
2678 JournalFile **ret) {
/* p holds the new name for the old file and is freed automatically */
2682 _cleanup_free_ char *p = NULL;
2684 r = journal_file_open(fname, flags, mode, compress, seal,
2685 metrics, mmap_cache, template, ret);
/* Only the errors listed here are candidates for the rename-and-retry
 * handling further down */
2686 if (r != -EBADMSG && /* corrupted */
2687 r != -ENODATA && /* truncated */
2688 r != -EHOSTDOWN && /* other machine */
2689 r != -EPROTONOSUPPORT && /* incompatible feature */
2690 r != -EBUSY && /* unclean shutdown */
2691 r != -ESHUTDOWN /* already archived */)
/* Further checks before retrying: read-only access, a missing O_CREAT and a
 * name without the plain .journal suffix are each tested */
2694 if ((flags & O_ACCMODE) == O_RDONLY)
2697 if (!(flags & O_CREAT))
2700 if (!endswith(fname, ".journal"))
2703 /* The file is corrupted. Rotate it away and try it again (but only once) */
/* The new name ends in .journal~ and embeds the current realtime clock
 * value in hex plus a second 64 bit hex value whose source is not visible
 * here */
2706 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2708 (unsigned long long) now(CLOCK_REALTIME),
2712 r = rename(fname, p);
2716 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
/* Second and final attempt, with unchanged arguments */
2718 return journal_file_open(fname, flags, mode, compress, seal,
2719 metrics, mmap_cache, template, ret);
/* Copies the entry o, located at offset p in the file from, into the file
 * to. Each data object referenced by the entry is read from the source
 * (and uncompressed if necessary) and appended to the destination; the
 * entry itself is then appended with the original timestamps and the
 * offsets and hashes of the newly appended data objects. */
2722 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
/* xor_hash accumulates the hashes of all copied data objects */
2724 uint64_t q, xor_hash = 0;
/* Carry over both timestamps of the source entry */
2737 ts.monotonic = le64toh(o->entry.monotonic);
2738 ts.realtime = le64toh(o->entry.realtime);
2740 n = journal_file_entry_n_items(o);
2741 /* alloca() can't take 0, hence let's allocate at least one */
/* NOTE(review): the stack allocation grows with n, which is derived from the
 * source entry; worth confirming that n is bounded for untrusted files */
2742 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2744 for (i = 0; i < n; i++) {
/* Remember offset and hash of item i before o is repointed below */
2751 q = le64toh(o->entry.items[i].object_offset);
2752 le_hash = o->entry.items[i].hash;
/* From here on o refers to the data object rather than the entry */
2754 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
/* The hash stored in the entry item has to match the data object's own */
2758 if (le_hash != o->data.hash)
/* Payload length is the object size minus the offset of the payload */
2761 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2764 /* We hit the limit on 32bit machines */
2765 if ((uint64_t) t != l)
2768 if (o->object.flags & OBJECT_COMPRESSED) {
/* The uncompressed data ends up in the compress buffer of the source file */
2772 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0))
2775 data = from->compress_buffer;
/* Presumably the alternative for builds without compression support -- TODO
 * confirm */
2778 return -EPROTONOSUPPORT;
2781 data = o->data.payload;
/* Append the payload to the destination; u and h receive the new object and
 * its offset */
2783 r = journal_file_append_data(to, data, l, &u, &h);
2787 xor_hash ^= le64toh(u->data.hash);
2788 items[i].object_offset = htole64(h);
2789 items[i].hash = u->data.hash;
/* Map the source entry again so that o refers to the entry for the next
 * iteration */
2791 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2796 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
/* Fills in and sanitizes the metrics in m. A field set to (uint64_t) -1
 * counts as unset and gets a default, where possible derived from the size
 * of the file system that fd lives on; the values are page aligned and
 * clamped to the limits defined at the top of this file. The result is
 * logged at debug level. */
2799 void journal_default_metrics(JournalMetrics *m, int fd) {
/* Stays 0 if the file system size cannot be determined */
2800 uint64_t fs_size = 0;
2802 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2807 if (fstatvfs(fd, &ss) >= 0)
2808 fs_size = ss.f_frsize * ss.f_blocks;
/* max_use: when unset, 10% of the file system clamped to the
 * DEFAULT_MAX_USE_LOWER and DEFAULT_MAX_USE_UPPER bounds, or the lower bound
 * alone (presumably when the file system size is unknown -- TODO confirm) */
2810 if (m->max_use == (uint64_t) -1) {
2813 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2815 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2816 m->max_use = DEFAULT_MAX_USE_UPPER;
2818 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2819 m->max_use = DEFAULT_MAX_USE_LOWER;
2821 m->max_use = DEFAULT_MAX_USE_LOWER;
/* A max_use that is already set is page aligned and raised to at least two
 * minimal journal files */
2823 m->max_use = PAGE_ALIGN(m->max_use);
2825 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2826 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
/* max_size: when unset, one eighth of max_use capped at
 * DEFAULT_MAX_SIZE_UPPER */
2829 if (m->max_size == (uint64_t) -1) {
2830 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2832 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2833 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2835 m->max_size = PAGE_ALIGN(m->max_size);
2837 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2838 m->max_size = JOURNAL_FILE_SIZE_MIN;
/* max_use is raised if needed so that it is at least twice max_size */
2840 if (m->max_size*2 > m->max_use)
2841 m->max_use = m->max_size*2;
/* min_size: when unset, the minimal journal file size */
2843 if (m->min_size == (uint64_t) -1)
2844 m->min_size = JOURNAL_FILE_SIZE_MIN;
2846 m->min_size = PAGE_ALIGN(m->min_size);
2848 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2849 m->min_size = JOURNAL_FILE_SIZE_MIN;
/* max_size is raised if needed so that it is not below min_size */
2851 if (m->min_size > m->max_size)
2852 m->max_size = m->min_size;
/* keep_free: when unset, 15% of the file system capped at
 * DEFAULT_KEEP_FREE_UPPER, or DEFAULT_KEEP_FREE (presumably when the file
 * system size is unknown -- TODO confirm) */
2855 if (m->keep_free == (uint64_t) -1) {
2858 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2860 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2861 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2864 m->keep_free = DEFAULT_KEEP_FREE;
2867 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2868 format_bytes(a, sizeof(a), m->max_use),
2869 format_bytes(b, sizeof(b), m->max_size),
2870 format_bytes(c, sizeof(c), m->min_size),
2871 format_bytes(d, sizeof(d), m->keep_free));
/* Reports the realtime timestamps of the first and the last entry of the
 * file, as recorded in the header, through from and to. A header value of 0
 * is tested for separately for each of the two; what is returned in that
 * case is not visible here. */
2874 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
/* Zero is the same in either byte order, hence no conversion for the test */
2879 if (f->header->head_entry_realtime == 0)
2882 *from = le64toh(f->header->head_entry_realtime);
2886 if (f->header->tail_entry_realtime == 0)
2889 *to = le64toh(f->header->tail_entry_realtime);
/* Reports the monotonic timestamps of the first and the last entry that
 * belong to the given boot through from and to. The entries are taken from
 * the entry list of the boot ID data object. */
2895 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
/* p receives the offset of the data object so it can be mapped again below */
2903 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
/* Handle a data object without any entries */
2907 if (le64toh(o->data.n_entries) <= 0)
/* The first entry is referenced directly by entry_offset; o now points at
 * that entry */
2911 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2915 *from = le64toh(o->entry.monotonic);
/* Map the data object again, as o was repointed to the entry above */
2919 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* Fetch the last entry of the list, i.e. index n_entries-1 */
2923 r = generic_array_get_plus_one(f,
2924 le64toh(o->data.entry_offset),
2925 le64toh(o->data.entry_array_offset),
2926 le64toh(o->data.n_entries)-1,
2931 *to = le64toh(o->entry.monotonic);
2937 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2940 /* If we gained new header fields we gained new features,
2941 * hence suggest a rotation */
2942 if (le64toh(f->header->header_size) < sizeof(Header)) {
2943 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2947 /* Let's check if the hash tables grew over a certain fill
2948 * level (75%, borrowing this value from Java's hash table
2949 * implementation), and if so suggest a rotation. To calculate
2950 * the fill level we need the n_data field, which only exists
2951 * in newer versions. */
2953 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2954 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2955 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2957 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2958 le64toh(f->header->n_data),
2959 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2960 (unsigned long long) f->last_stat.st_size,
2961 f->last_stat.st_size / le64toh(f->header->n_data));
2965 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2966 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2967 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2969 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2970 le64toh(f->header->n_fields),
2971 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2975 /* Are the data objects properly indexed by field objects? */
2976 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2977 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2978 le64toh(f->header->n_data) > 0 &&
2979 le64toh(f->header->n_fields) == 0)
2982 if (max_file_usec > 0) {
2985 h = le64toh(f->header->head_entry_realtime);
2986 t = now(CLOCK_REALTIME);
2988 if (h > 0 && t > h + max_file_usec)