1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
29 #include <sys/xattr.h>
31 #include "journal-def.h"
32 #include "journal-file.h"
33 #include "journal-authenticate.h"
/* Default hash table sizes in bytes, expressed as a number of HashItem
 * slots (2047 for data, 333 for fields). */
38 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
39 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
/* Payload size in bytes that is compared against when deciding whether a
 * data object is compressed. */
41 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
43 /* This is the minimum journal file size */
44 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
46 /* These are the lower and upper bounds if we deduce the max_use value
47 * from the file system size */
48 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
49 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
51 /* This is the upper bound if we deduce max_size from max_use */
52 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
54 /* This is the upper bound if we deduce the keep_free value from the file system size */
56 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
58 /* This is the keep_free value when we can't determine the system size */
60 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
62 /* n_data was the first entry we added after the initial file format design */
63 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
65 /* How many entries to keep in the entry array chain cache at max */
66 #define CHAIN_CACHE_MAX 20
68 /* How much to increase the journal file size at once each time we allocate something new. */
69 #define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
/* Switches the journal file to the online state by writing STATE_ONLINE
 * into the mapped header. Checks first that f->fd is valid and f->header is
 * set, then dispatches on the current header state. */
71 static int journal_file_set_online(JournalFile *f) {
77 if (!(f->fd >= 0 && f->header))
80 switch(f->header->state) {
85 f->header->state = STATE_ONLINE;
/* Counterpart of journal_file_set_online(): writes STATE_OFFLINE into the
 * mapped header. Checks that f->fd is valid and f->header is set, and tests
 * whether the current state differs from STATE_ONLINE before the write
 * (presumably nothing is changed in that case). */
94 int journal_file_set_offline(JournalFile *f) {
100 if (!(f->fd >= 0 && f->header))
103 if (f->header->state != STATE_ONLINE)
108 f->header->state = STATE_OFFLINE;
/* Tears down a JournalFile. In order: appends a final tag when sealing is
 * enabled on a writable file, drops the fd from the mmap cache, marks the
 * file offline, then releases the header mapping, the mmap cache
 * reference, the chain cache, the compression buffer, the FSS state
 * (either an munmap()ed fss_file or a free()d fsprg_state) and the HMAC
 * context. */
115 void journal_file_close(JournalFile *f) {
119 /* Write the final tag */
120 if (f->seal && f->writable)
121 journal_file_append_tag(f);
124 /* Sync everything to disk, before we mark the file offline */
125 if (f->mmap && f->fd >= 0)
126 mmap_cache_close_fd(f->mmap, f->fd);
128 journal_file_set_offline(f);
131 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
137 mmap_cache_unref(f->mmap);
139 hashmap_free_free(f->chain_cache);
142 free(f->compress_buffer);
147 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
148 else if (f->fsprg_state)
149 free(f->fsprg_state);
154 gcry_md_close(f->hmac);
/* Builds a fresh Header in the local variable h and writes it to the start
 * of the file with pwrite(). Fills in the signature, the 64-bit aligned
 * header size, flag words derived from f->compress and f->seal, and a
 * random file_id. seqnum_id and tail_entry_seqnum are copied from the
 * template's header, or seqnum_id is set to the new file_id (presumably
 * depending on whether a template was passed). */
160 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
168 memcpy(h.signature, HEADER_SIGNATURE, 8);
169 h.header_size = htole64(ALIGN64(sizeof(h)));
171 h.incompatible_flags =
172 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
175 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
177 r = sd_id128_randomize(&h.file_id);
182 h.seqnum_id = template->header->seqnum_id;
183 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
185 h.seqnum_id = h.file_id;
187 k = pwrite(f->fd, &h, sizeof(h), 0);
/* Stamps the mapped header with the current machine ID and boot ID and
 * marks the file online. When the boot ID already stored in the header
 * equals the current one, tail_entry_monotonic_valid is set before the
 * header's boot_id is overwritten. */
197 static int journal_file_refresh_header(JournalFile *f) {
203 r = sd_id128_get_machine(&f->header->machine_id);
207 r = sd_id128_get_boot(&boot_id);
211 if (sd_id128_equal(boot_id, f->header->boot_id))
212 f->tail_entry_monotonic_valid = true;
214 f->header->boot_id = boot_id;
216 journal_file_set_online(f);
218 /* Sync the online state to disk */
/* Sanity-checks the mapped header of a journal file. Visible checks:
 * signature; unknown incompatible/compatible flag bits (-EPROTONOSUPPORT);
 * state below _STATE_MAX; header_size at least HEADER_SIZE_MIN; sealed
 * files must contain the n_entry_arrays field; header_size + arena_size
 * must fit into the file size from last_stat; the tail object must lie
 * inside header + arena; the main offsets must satisfy VALID64 and must
 * not point into the header. The header's machine ID is compared with the
 * local one, and the state is inspected (an ONLINE state and an unknown
 * state are both logged). Finally f->compress and f->seal are derived
 * from the header flags.
 *
 * NOTE(review): each flag check appears in a masked and an unmasked
 * variant; presumably these are alternatives selected by build-time
 * conditionals -- confirm. */
224 static int journal_file_verify_header(JournalFile *f) {
227 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
230 /* In both read and write mode we refuse to open files with
231 * incompatible flags we don't know */
233 if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
234 return -EPROTONOSUPPORT;
236 if (f->header->incompatible_flags != 0)
237 return -EPROTONOSUPPORT;
240 /* When open for writing we refuse to open files with
241 * compatible flags, too */
244 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
245 return -EPROTONOSUPPORT;
247 if (f->header->compatible_flags != 0)
248 return -EPROTONOSUPPORT;
252 if (f->header->state >= _STATE_MAX)
255 /* The first addition was n_data, so check that we are at least this large */
256 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
259 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
262 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
265 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
268 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
269 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
270 !VALID64(le64toh(f->header->tail_object_offset)) ||
271 !VALID64(le64toh(f->header->entry_array_offset)))
274 if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
275 le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
276 le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
277 le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
282 sd_id128_t machine_id;
285 r = sd_id128_get_machine(&machine_id);
289 if (!sd_id128_equal(machine_id, f->header->machine_id))
292 state = f->header->state;
294 if (state == STATE_ONLINE) {
295 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
297 } else if (state == STATE_ARCHIVED)
299 else if (state != STATE_OFFLINE) {
300 log_debug("Journal file %s has unknown state %u.", f->path, state);
305 f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
307 f->seal = JOURNAL_HEADER_SEALED(f->header);
/* Grows the file so that the byte range [offset, offset + size) fits.
 * The current size is taken as header_size + arena_size from the header.
 * The required size is PAGE_ALIGN(offset + size), raised to header_size if
 * smaller, and compared against metrics.max_size. When the required size
 * exceeds metrics.min_size and keep_free is set, fstatvfs() is used to
 * compute the available space (f_bfree * f_bsize, reduced by keep_free)
 * and the growth is compared against it. The new size is then rounded up
 * to a multiple of FILE_SIZE_INCREASE, capped at max_size, allocated with
 * posix_fallocate(), and last_stat and header->arena_size are refreshed. */
312 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
313 uint64_t old_size, new_size;
318 /* We assume that this file is not sparse, and we know that
319 * for sure, since we always call posix_fallocate()
323 le64toh(f->header->header_size) +
324 le64toh(f->header->arena_size);
326 new_size = PAGE_ALIGN(offset + size);
327 if (new_size < le64toh(f->header->header_size))
328 new_size = le64toh(f->header->header_size);
330 if (new_size <= old_size)
333 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
336 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
339 if (fstatvfs(f->fd, &svfs) >= 0) {
342 available = svfs.f_bfree * svfs.f_bsize;
344 if (available >= f->metrics.keep_free)
345 available -= f->metrics.keep_free;
349 if (new_size - old_size > available)
354 /* Increase by larger blocks at once */
355 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
356 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
357 new_size = f->metrics.max_size;
359 /* Note that the glibc fallocate() fallback is very
360 inefficient, hence we try to minimize the allocation area
362 r = posix_fallocate(f->fd, old_size, new_size - old_size);
366 if (fstat(f->fd, &f->last_stat) < 0)
369 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
/* Obtains a mapping of the file range [offset, offset + size) through
 * mmap_cache_get() and hands the pointer back via ret. If the range lies
 * beyond the cached file size in last_stat, the stat data is refreshed
 * with fstat() first; -EADDRNOTAVAIL is returned when fstat() fails or the
 * range is still out of bounds. */
374 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
381 /* Avoid SIGBUS on invalid accesses */
382 if (offset + size > (uint64_t) f->last_stat.st_size) {
383 /* Hmm, out of range? Let's refresh the fstat() data
384 * first, before we trust that check. */
386 if (fstat(f->fd, &f->last_stat) < 0 ||
387 offset + size > (uint64_t) f->last_stat.st_size)
388 return -EADDRNOTAVAIL;
391 return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
/* Returns the minimum valid size of object o, looked up by o->object.type
 * in a per-type table of struct sizes. Types outside the table, or without
 * a table entry, fall back to sizeof(ObjectHeader). */
394 static uint64_t minimum_header_size(Object *o) {
396 static const uint64_t table[] = {
397 [OBJECT_DATA] = sizeof(DataObject),
398 [OBJECT_FIELD] = sizeof(FieldObject),
399 [OBJECT_ENTRY] = sizeof(EntryObject),
400 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
401 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
402 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
403 [OBJECT_TAG] = sizeof(TagObject),
406 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
407 return sizeof(ObjectHeader);
409 return table[o->object.type];
/* Maps the object located at offset and validates its header. The offset
 * must satisfy VALID64. Only sizeof(ObjectHeader) bytes are mapped at
 * first; the size read from that header is then checked against
 * sizeof(ObjectHeader) and minimum_header_size(), and the type against
 * OBJECT_UNUSED and, when type > 0, against the requested type. Objects
 * larger than the bare header are mapped a second time with their full
 * size. */
412 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
421 /* Objects may only be located at multiple of 64 bit */
422 if (!VALID64(offset))
426 r = journal_file_move_to(f, type_to_context(type), false, offset, sizeof(ObjectHeader), &t);
431 s = le64toh(o->object.size);
433 if (s < sizeof(ObjectHeader))
436 if (o->object.type <= OBJECT_UNUSED)
439 if (s < minimum_header_size(o))
442 if (type > 0 && o->object.type != type)
445 if (s > sizeof(ObjectHeader)) {
446 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
/* Picks the sequence number for the next entry, starting from the
 * header's tail_entry_seqnum + 1. The chosen value is stored back as
 * tail_entry_seqnum, and also as head_entry_seqnum while that field is
 * still zero. */
457 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
462 r = le64toh(f->header->tail_entry_seqnum) + 1;
465 /* If an external seqnum counter was passed, we update
466 * both the local and the external one, and set it to
467 * the maximum of both */
475 f->header->tail_entry_seqnum = htole64(r);
477 if (f->header->head_entry_seqnum == 0)
478 f->header->head_entry_seqnum = htole64(r);
/* Appends a new object of the given type and size at the end of the file.
 * The file is switched online first. The insertion offset is taken from
 * the header's tail_object_offset, advanced past the current tail object
 * by its 64-bit aligned size, or set to header_size (presumably when no
 * tail object exists yet). Space is reserved with journal_file_allocate(),
 * the range is mapped, the object's type and size are written, and
 * tail_object_offset and n_objects in the header are updated. */
483 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
490 assert(type > 0 && type < _OBJECT_TYPE_MAX);
491 assert(size >= sizeof(ObjectHeader));
495 r = journal_file_set_online(f);
499 p = le64toh(f->header->tail_object_offset);
501 p = le64toh(f->header->header_size);
503 r = journal_file_move_to_object(f, -1, p, &tail);
507 p += ALIGN64(le64toh(tail->object.size));
510 r = journal_file_allocate(f, p, size);
514 r = journal_file_move_to(f, type, false, p, size, &t);
521 o->object.type = type;
522 o->object.size = htole64(size);
524 f->header->tail_object_offset = htole64(p);
525 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
/* Creates the data hash table object. Its byte size is derived from
 * metrics.max_size as (max_size * 4 / 768 / 3) HashItems and is never
 * smaller than DEFAULT_DATA_HASH_TABLE_SIZE. The object is appended, its
 * items are zeroed, and the header records the offset of the items array
 * (not of the object header) together with the table size. */
533 static int journal_file_setup_data_hash_table(JournalFile *f) {
540 /* We estimate that we need 1 hash table entry per 768 of
541 journal file and we want to make sure we never get beyond
542 75% fill level. Calculate the hash table size for the
543 maximum file size based on these metrics. */
545 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
546 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
547 s = DEFAULT_DATA_HASH_TABLE_SIZE;
549 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
551 r = journal_file_append_object(f,
552 OBJECT_DATA_HASH_TABLE,
553 offsetof(Object, hash_table.items) + s,
558 memzero(o->hash_table.items, s);
560 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
561 f->header->data_hash_table_size = htole64(s);
/* Creates the field hash table object with the fixed size
 * DEFAULT_FIELD_HASH_TABLE_SIZE. The object is appended, its items are
 * zeroed, and the header records the offset of the items array together
 * with the table size. */
566 static int journal_file_setup_field_hash_table(JournalFile *f) {
573 /* We use a fixed size hash table for the fields as this
574 * number should grow very slowly only */
576 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
577 r = journal_file_append_object(f,
578 OBJECT_FIELD_HASH_TABLE,
579 offsetof(Object, hash_table.items) + s,
584 memzero(o->hash_table.items, s);
586 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
587 f->header->field_hash_table_size = htole64(s);
/* Maps the data hash table described by data_hash_table_offset and
 * data_hash_table_size in the header via journal_file_move_to() and stores
 * the resulting pointer in f->data_hash_table. */
592 static int journal_file_map_data_hash_table(JournalFile *f) {
599 p = le64toh(f->header->data_hash_table_offset);
600 s = le64toh(f->header->data_hash_table_size);
602 r = journal_file_move_to(f,
603 OBJECT_DATA_HASH_TABLE,
610 f->data_hash_table = t;
/* Maps the field hash table described by field_hash_table_offset and
 * field_hash_table_size in the header via journal_file_move_to() and
 * stores the resulting pointer in f->field_hash_table. */
614 static int journal_file_map_field_hash_table(JournalFile *f) {
621 p = le64toh(f->header->field_hash_table_offset);
622 s = le64toh(f->header->field_hash_table_size);
624 r = journal_file_move_to(f,
625 OBJECT_FIELD_HASH_TABLE,
632 f->field_hash_table = t;
/* Links a FIELD object into the field hash table. The object's
 * next_hash_offset and head_data_offset are cleared, and the bucket is
 * chosen as hash modulo the number of HashItems in the table. The bucket's
 * head is set to the new offset, or the previous tail object's
 * next_hash_offset is patched (presumably depending on whether the bucket
 * was empty); the bucket's tail then points to the new object. n_fields in
 * the header is incremented when the header contains that field. */
636 static int journal_file_link_field(
649 if (o->object.type != OBJECT_FIELD)
652 /* This might alter the window we are looking at */
654 o->field.next_hash_offset = o->field.head_data_offset = 0;
656 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
657 p = le64toh(f->field_hash_table[h].tail_hash_offset);
659 f->field_hash_table[h].head_hash_offset = htole64(offset);
661 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
665 o->field.next_hash_offset = htole64(offset);
668 f->field_hash_table[h].tail_hash_offset = htole64(offset);
670 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
671 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
/* Links a DATA object into the data hash table. The object's chain and
 * entry bookkeeping fields are cleared, and the bucket is chosen as hash
 * modulo the number of HashItems in the table. The bucket's head is set to
 * the new offset, or the previous tail object's next_hash_offset is
 * patched; the bucket's tail then points to the new object. n_data in the
 * header is incremented when the header contains that field. */
676 static int journal_file_link_data(
689 if (o->object.type != OBJECT_DATA)
692 /* This might alter the window we are looking at */
694 o->data.next_hash_offset = o->data.next_field_offset = 0;
695 o->data.entry_offset = o->data.entry_array_offset = 0;
696 o->data.n_entries = 0;
698 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
699 p = le64toh(f->data_hash_table[h].tail_hash_offset);
701 /* Only entry in the hash table is easy */
702 f->data_hash_table[h].head_hash_offset = htole64(offset);
704 /* Move back to the previous data object, to patch in
707 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
711 o->data.next_hash_offset = htole64(offset);
714 f->data_hash_table[h].tail_hash_offset = htole64(offset);
716 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
717 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
/* Looks up a FIELD object by payload, given its precomputed hash. The
 * bucket is hash modulo the number of HashItems in the field hash table;
 * the chain is walked from the bucket's head_hash_offset along
 * next_hash_offset. An object matches when its stored hash, its total
 * object size and its payload bytes all agree with the arguments. */
722 int journal_file_find_field_object_with_hash(
724 const void *field, uint64_t size, uint64_t hash,
725 Object **ret, uint64_t *offset) {
727 uint64_t p, osize, h;
731 assert(field && size > 0);
733 osize = offsetof(Object, field.payload) + size;
735 if (f->header->field_hash_table_size == 0)
738 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
739 p = le64toh(f->field_hash_table[h].head_hash_offset);
744 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
748 if (le64toh(o->field.hash) == hash &&
749 le64toh(o->object.size) == osize &&
750 memcmp(o->field.payload, field, size) == 0) {
760 p = le64toh(o->field.next_hash_offset);
/* Convenience wrapper: hashes the field name with hash64() and delegates
 * to journal_file_find_field_object_with_hash(). */
766 int journal_file_find_field_object(
768 const void *field, uint64_t size,
769 Object **ret, uint64_t *offset) {
774 assert(field && size > 0);
776 hash = hash64(field, size);
778 return journal_file_find_field_object_with_hash(f,
/* Looks up a DATA object by payload, given its precomputed hash. The
 * bucket is hash modulo the number of HashItems in the data hash table;
 * the chain is walked from the bucket's head_hash_offset along
 * next_hash_offset, and objects whose stored hash differs are tested
 * first. For objects flagged OBJECT_COMPRESSED the payload is
 * decompressed into f->compress_buffer with uncompress_blob() and the
 * result is compared with data; otherwise object size and raw payload
 * bytes are compared. -EPROTONOSUPPORT appears in the compressed branch
 * (presumably for builds without compression support -- confirm). */
783 int journal_file_find_data_object_with_hash(
785 const void *data, uint64_t size, uint64_t hash,
786 Object **ret, uint64_t *offset) {
788 uint64_t p, osize, h;
792 assert(data || size == 0);
794 osize = offsetof(Object, data.payload) + size;
796 if (f->header->data_hash_table_size == 0)
799 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
800 p = le64toh(f->data_hash_table[h].head_hash_offset);
805 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
809 if (le64toh(o->data.hash) != hash)
812 if (o->object.flags & OBJECT_COMPRESSED) {
816 l = le64toh(o->object.size);
817 if (l <= offsetof(Object, data.payload))
820 l -= offsetof(Object, data.payload);
822 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0))
826 memcmp(f->compress_buffer, data, size) == 0) {
837 return -EPROTONOSUPPORT;
840 } else if (le64toh(o->object.size) == osize &&
841 memcmp(o->data.payload, data, size) == 0) {
853 p = le64toh(o->data.next_hash_offset);
/* Convenience wrapper: hashes the data blob with hash64() and delegates to
 * journal_file_find_data_object_with_hash(). */
859 int journal_file_find_data_object(
861 const void *data, uint64_t size,
862 Object **ret, uint64_t *offset) {
867 assert(data || size == 0);
869 hash = hash64(data, size);
871 return journal_file_find_data_object_with_hash(f,
/* Ensures a FIELD object for the given field name exists. The hash table
 * is searched first (presumably an existing object is reused when found).
 * Otherwise a new FIELD object is appended, its hash and payload are
 * filled in, it is linked into the field hash table, the pointer is
 * re-fetched because linking may move the mapping window, and the object
 * is passed to journal_file_hmac_put_object(). */
876 static int journal_file_append_field(
878 const void *field, uint64_t size,
879 Object **ret, uint64_t *offset) {
887 assert(field && size > 0);
889 hash = hash64(field, size);
891 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
905 osize = offsetof(Object, field.payload) + size;
906 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
910 o->field.hash = htole64(hash);
911 memcpy(o->field.payload, field, size);
913 r = journal_file_link_field(f, o, p, hash);
917 /* The linking might have altered the window, so let's
918 * refresh our pointer */
919 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
924 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
/* Ensures a DATA object for the given blob exists. The hash table is
 * searched first (presumably an existing object is reused when found).
 * Otherwise a new DATA object is appended and its hash stored. Blobs of at
 * least COMPRESSION_SIZE_THRESHOLD bytes may be compressed directly into
 * the payload, in which case the object size is shrunk and
 * OBJECT_COMPRESSED is set; uncompressed blobs are copied verbatim. The
 * object is linked into the data hash table and re-fetched. If the blob
 * contains '=' after at least one byte, the part before it is used as the
 * field name: a FIELD object is created or found and the data object is
 * inserted at the head of that field's data list. Finally the object is
 * passed to journal_file_hmac_put_object(). */
938 static int journal_file_append_data(
940 const void *data, uint64_t size,
941 Object **ret, uint64_t *offset) {
947 bool compressed = false;
951 assert(data || size == 0);
953 hash = hash64(data, size);
955 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
969 osize = offsetof(Object, data.payload) + size;
970 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
974 o->data.hash = htole64(hash);
978 size >= COMPRESSION_SIZE_THRESHOLD) {
981 compressed = compress_blob(data, size, o->data.payload, &rsize);
984 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
985 o->object.flags |= OBJECT_COMPRESSED;
987 log_debug("Compressed data object %"PRIu64" -> %"PRIu64, size, rsize);
992 if (!compressed && size > 0)
993 memcpy(o->data.payload, data, size);
995 r = journal_file_link_data(f, o, p, hash);
999 /* The linking might have altered the window, so let's
1000 * refresh our pointer */
1001 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1008 eq = memchr(data, '=', size);
1009 if (eq && eq > data) {
1013 /* Create field object ... */
1014 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1018 /* ... and link it in. */
1019 o->data.next_field_offset = fo->field.head_data_offset;
1020 fo->field.head_data_offset = le64toh(p);
1024 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
/* Number of EntryItem slots in an ENTRY object, derived from the object
 * size minus the offset of the items array. The object type is checked
 * against OBJECT_ENTRY first. */
1038 uint64_t journal_file_entry_n_items(Object *o) {
1041 if (o->object.type != OBJECT_ENTRY)
1044 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
/* Number of 64-bit offset slots in an ENTRY_ARRAY object, derived from the
 * object size minus the offset of the items array. The object type is
 * checked against OBJECT_ENTRY_ARRAY first. */
1047 uint64_t journal_file_entry_array_n_items(Object *o) {
1050 if (o->object.type != OBJECT_ENTRY_ARRAY)
1053 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
/* Number of HashItem slots in a hash table object, derived from the object
 * size minus the offset of the items array. The object type is checked
 * against both hash table types first. */
1056 uint64_t journal_file_hash_table_n_items(Object *o) {
1059 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1060 o->object.type != OBJECT_FIELD_HASH_TABLE)
1063 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
/* Stores entry offset p at logical index *idx of the entry array chain
 * that starts at *first (both kept in on-disk little-endian form). The
 * chain is walked along next_entry_array_offset; when a slot in an
 * existing array is written, *idx is incremented. Otherwise a new
 * ENTRY_ARRAY object sized for n items is appended, passed to
 * journal_file_hmac_put_object(), the slot is written, and the new array
 * is hooked up either as *first or as the previous array's successor.
 * n_entry_arrays in the header is incremented when present, and *idx is
 * incremented. */
1066 static int link_entry_into_array(JournalFile *f,
1071 uint64_t n = 0, ap = 0, q, i, a, hidx;
1079 a = le64toh(*first);
1080 i = hidx = le64toh(*idx);
1083 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1087 n = journal_file_entry_array_n_items(o);
1089 o->entry_array.items[i] = htole64(p);
1090 *idx = htole64(hidx + 1);
1096 a = le64toh(o->entry_array.next_entry_array_offset);
1107 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1108 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1114 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1119 o->entry_array.items[i] = htole64(p);
1122 *first = htole64(q);
1124 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1128 o->entry_array.next_entry_array_offset = htole64(q);
1131 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1132 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1134 *idx = htole64(hidx + 1);
/* Variant of link_entry_into_array() for lists that keep one entry inline.
 * The offset is stored either directly in *extra or in the array chain at
 * index *idx - 1 (presumably depending on whether *idx is zero). *idx is
 * then incremented. */
1139 static int link_entry_into_array_plus_one(JournalFile *f,
1154 *extra = htole64(p);
1158 i = htole64(le64toh(*idx) - 1);
1159 r = link_entry_into_array(f, first, &i, p);
1164 *idx = htole64(le64toh(*idx) + 1);
/* Links an entry into the entry list of the DATA object referenced by the
 * entry's i-th item: reads the item's object_offset, moves to that DATA
 * object and appends via link_entry_into_array_plus_one() using the data
 * object's entry_offset and entry_array_offset fields. */
1168 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1175 p = le64toh(o->entry.items[i].object_offset);
1179 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1183 return link_entry_into_array_plus_one(f,
1184 &o->data.entry_offset,
1185 &o->data.entry_array_offset,
/* Links a freshly written ENTRY object into the file's indexes. After a
 * full memory barrier the entry is added to the global entry array
 * (header entry_array_offset / n_entries). The header's head/tail realtime
 * and tail monotonic timestamps are updated and
 * tail_entry_monotonic_valid is set. Each item of the entry is then linked
 * through journal_file_link_entry_item(). */
1190 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1198 if (o->object.type != OBJECT_ENTRY)
1201 __sync_synchronize();
1203 /* Link up the entry itself */
1204 r = link_entry_into_array(f,
1205 &f->header->entry_array_offset,
1206 &f->header->n_entries,
1211 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1213 if (f->header->head_entry_realtime == 0)
1214 f->header->head_entry_realtime = o->entry.realtime;
1216 f->header->tail_entry_realtime = o->entry.realtime;
1217 f->header->tail_entry_monotonic = o->entry.monotonic;
1219 f->tail_entry_monotonic_valid = true;
1221 /* Link up the items */
1222 n = journal_file_entry_n_items(o);
1223 for (i = 0; i < n; i++) {
1224 r = journal_file_link_entry_item(f, o, offset, i);
/* Writes an ENTRY object for an already prepared item list. The object is
 * sized for n_items EntryItems and appended; its seqnum, item array,
 * realtime and monotonic timestamps, xor_hash and boot_id (copied from the
 * header) are filled in. The object is passed to
 * journal_file_hmac_put_object() and then linked with
 * journal_file_link_entry(). */
1232 static int journal_file_append_entry_internal(
1234 const dual_timestamp *ts,
1236 const EntryItem items[], unsigned n_items,
1238 Object **ret, uint64_t *offset) {
1245 assert(items || n_items == 0);
1248 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1250 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1254 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1255 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1256 o->entry.realtime = htole64(ts->realtime);
1257 o->entry.monotonic = htole64(ts->monotonic);
1258 o->entry.xor_hash = htole64(xor_hash);
1259 o->entry.boot_id = f->header->boot_id;
1262 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1267 r = journal_file_link_entry(f, o, np);
/* Notifies inotify watchers after a change made through the mapping:
 * issues a full memory barrier, then ftruncate()s the file to the size
 * cached in last_stat. A failing ftruncate() is only logged. */
1280 void journal_file_post_change(JournalFile *f) {
1283 /* inotify() does not receive IN_MODIFY events from file
1284 * accesses done via mmap(). After each access we hence
1285 * trigger IN_MODIFY by truncating the journal file to its
1286 * current size which triggers IN_MODIFY. */
1288 __sync_synchronize();
1290 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1291 log_error("Failed to truncate file to its own size: %m");
/* Comparison callback ordering EntryItems by their object_offset, compared
 * in host byte order. */
1294 static int entry_item_cmp(const void *_a, const void *_b) {
1295 const EntryItem *a = _a, *b = _b;
1297 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1299 if (le64toh(a->object_offset) > le64toh(b->object_offset))
/* Appends one log entry built from n_iovec data blobs. A local timestamp
 * _ts can be filled with dual_timestamp_get() (presumably when ts is
 * NULL). The monotonic timestamp is compared against the header's tail
 * value when that is valid. A tag may be appended first via
 * journal_file_maybe_append_tag(). Each iovec is stored or found as a DATA
 * object; its offset and hash become an EntryItem and its hash is folded
 * into xor_hash. Items are sorted by on-disk offset, the entry is written
 * with journal_file_append_entry_internal(), and
 * journal_file_post_change() is called. The item array lives on the stack
 * (alloca). */
1304 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1308 uint64_t xor_hash = 0;
1309 struct dual_timestamp _ts;
1312 assert(iovec || n_iovec == 0);
1315 dual_timestamp_get(&_ts);
1319 if (f->tail_entry_monotonic_valid &&
1320 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1324 r = journal_file_maybe_append_tag(f, ts->realtime);
1329 /* alloca() can't take 0, hence let's allocate at least one */
1330 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1332 for (i = 0; i < n_iovec; i++) {
1336 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1340 xor_hash ^= le64toh(o->data.hash);
1341 items[i].object_offset = htole64(p);
1342 items[i].hash = o->data.hash;
1345 /* Order by the position on disk, in order to improve seek
1346 * times for rotating media. */
1347 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1349 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1351 journal_file_post_change(f);
/* One cached position inside an entry array chain, stored in
 * f->chain_cache keyed by the 'first' field. */
1356 typedef struct ChainCacheItem {
1357 uint64_t first; /* the array at the beginning of the chain */
1358 uint64_t array; /* the cached array */
1359 uint64_t begin; /* the first item in the cached array */
1360 uint64_t total; /* the total number of items in all arrays before this one in the chain */
1361 uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
/* Records a chain position in hashmap h, keyed by the chain's first array
 * offset. When the map already holds CHAIN_CACHE_MAX items, an existing
 * item is taken out with hashmap_steal_first() for reuse; otherwise a new
 * ChainCacheItem is allocated. The item is inserted with hashmap_put() and
 * its last_index is updated. */
1364 static void chain_cache_put(
1371 uint64_t last_index) {
1374 /* If the chain item to cache for this chain is the
1375 * first one it's not worth caching anything */
1379 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1380 ci = hashmap_steal_first(h);
1382 ci = new(ChainCacheItem, 1);
1389 if (hashmap_put(h, &ci->first, ci) < 0) {
1394 assert(ci->first == first);
1399 ci->last_index = last_index;
/* Fetches the i-th entry of the entry array chain starting at 'first'.
 * The chain cache is consulted first and is used when i lies beyond the
 * cached item's 'total'. Arrays are walked along next_entry_array_offset,
 * the offset in the matching slot is read, the position is stored with
 * chain_cache_put(), and the ENTRY object at that offset is mapped. */
1402 static int generic_array_get(
1406 Object **ret, uint64_t *offset) {
1409 uint64_t p = 0, a, t = 0;
1417 /* Try the chain cache first */
1418 ci = hashmap_get(f->chain_cache, &first);
1419 if (ci && i > ci->total) {
1428 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1432 k = journal_file_entry_array_n_items(o);
1434 p = le64toh(o->entry_array.items[i]);
1440 a = le64toh(o->entry_array.next_entry_array_offset);
1446 /* Let's cache this item for the next invocation */
1447 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
1449 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Variant of generic_array_get() for lists that keep one entry inline in
 * 'extra': that entry is mapped directly (presumably for index 0), all
 * other indexes are shifted by one and served from the array chain. */
1462 static int generic_array_get_plus_one(
1467 Object **ret, uint64_t *offset) {
1476 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1489 return generic_array_get(f, first, i-1, ret, offset);
/* Bisects the entry array chain starting at 'first' for 'needle', using
 * the test_object callback to classify each candidate offset as
 * TEST_FOUND, TEST_LEFT or TEST_RIGHT. A TEST_FOUND result is mapped to
 * TEST_RIGHT for DIRECTION_DOWN and to TEST_LEFT otherwise.
 *
 * The chain cache may be used to skip ahead to a previously visited array
 * when the callback reports TEST_LEFT for the cached array's first item;
 * the cached last_index is then probed at its immediate neighbours before
 * a regular bisection over [left, right]. For DIRECTION_UP the result may
 * be stepped back by one slot (subtract_one). The final position is stored
 * with chain_cache_put(), the selected ENTRY object is mapped, and *idx
 * receives the logical index t + i (minus one when stepping back). */
1498 static int generic_array_bisect(
1503 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1504 direction_t direction,
1509 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
1510 bool subtract_one = false;
1511 Object *o, *array = NULL;
1516 assert(test_object);
1518 /* Start with the first array in the chain */
1521 ci = hashmap_get(f->chain_cache, &first);
1522 if (ci && n > ci->total) {
1523 /* Ah, we have iterated this bisection array chain
1524 * previously! Let's see if we can skip ahead in the
1525 * chain, as far as the last time. But we can't jump
1526 * backwards in the chain, so let's check that
1529 r = test_object(f, ci->begin, needle);
1533 if (r == TEST_LEFT) {
1534 /* OK, what we are looking for is right of the
1535 * begin of this EntryArray, so let's jump
1536 * straight to previously cached array in the
1542 last_index = ci->last_index;
1547 uint64_t left, right, k, lp;
1549 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1553 k = journal_file_entry_array_n_items(array);
1559 lp = p = le64toh(array->entry_array.items[i]);
1563 r = test_object(f, p, needle);
1567 if (r == TEST_FOUND)
1568 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1570 if (r == TEST_RIGHT) {
1574 if (last_index != (uint64_t) -1) {
1575 assert(last_index <= right);
1577 /* If we cached the last index we
1578 * looked at, let's try to not to jump
1579 * too wildly around and see if we can
1580 * limit the range to look at early to
1581 * the immediate neighbors of the last
1582 * index we looked at. */
1584 if (last_index > 0) {
1585 uint64_t x = last_index - 1;
1587 p = le64toh(array->entry_array.items[x]);
1591 r = test_object(f, p, needle);
1595 if (r == TEST_FOUND)
1596 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1598 if (r == TEST_RIGHT)
1604 if (last_index < right) {
1605 uint64_t y = last_index + 1;
1607 p = le64toh(array->entry_array.items[y]);
1611 r = test_object(f, p, needle);
1615 if (r == TEST_FOUND)
1616 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1618 if (r == TEST_RIGHT)
1626 if (left == right) {
1627 if (direction == DIRECTION_UP)
1628 subtract_one = true;
1634 assert(left < right);
1635 i = (left + right) / 2;
1637 p = le64toh(array->entry_array.items[i]);
1641 r = test_object(f, p, needle);
1645 if (r == TEST_FOUND)
1646 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1648 if (r == TEST_RIGHT)
1656 if (direction == DIRECTION_UP) {
1658 subtract_one = true;
1669 last_index = (uint64_t) -1;
1670 a = le64toh(array->entry_array.next_entry_array_offset);
1676 if (subtract_one && t == 0 && i == 0)
1679 /* Let's cache this item for the next invocation */
1680 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
1682 if (subtract_one && i == 0)
1684 else if (subtract_one)
1685 p = le64toh(array->entry_array.items[i-1]);
1687 p = le64toh(array->entry_array.items[i]);
1689 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1700 *idx = t + i + (subtract_one ? -1 : 0);
/* Variant of generic_array_bisect() for lists that keep one entry inline
 * in 'extra'. The inline entry is classified with test_object first
 * (TEST_FOUND is mapped to TEST_RIGHT for DIRECTION_DOWN, TEST_LEFT
 * otherwise). For DIRECTION_UP the inline entry is remembered as a
 * fallback (step_back). The remaining n-1 entries are bisected through
 * generic_array_bisect(); when that yields no result and step_back is
 * set, the inline entry is mapped instead. */
1706 static int generic_array_bisect_plus_one(
1712 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1713 direction_t direction,
1719 bool step_back = false;
1723 assert(test_object);
1728 /* This bisects the array in object 'first', but first checks
1730 r = test_object(f, extra, needle);
1734 if (r == TEST_FOUND)
1735 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1737 /* if we are looking with DIRECTION_UP then we need to first
1738 see if in the actual array there is a matching entry, and
1739 return the last one of that. But if there isn't any we need
1740 to return this one. Hence remember this, and return it
1743 step_back = direction == DIRECTION_UP;
1745 if (r == TEST_RIGHT) {
1746 if (direction == DIRECTION_DOWN)
1752 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1754 if (r == 0 && step_back)
1763 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
/* Comparison helper with the signature of the test_object callback taken
 * by generic_array_bisect(): compares the entry offset p itself against
 * needle. */
1779 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1785 else if (p < needle)
/* Seeks to an entry by file offset by bisecting the global entry array
 * (header entry_array_offset / n_entries) with generic_array_bisect(). */
1791 int journal_file_move_to_entry_by_offset(
1794 direction_t direction,
1798 return generic_array_bisect(f,
1799 le64toh(f->header->entry_array_offset),
1800 le64toh(f->header->n_entries),
/* Comparison helper with the signature of the test_object callback taken
 * by generic_array_bisect(): maps the ENTRY object at p and compares its
 * seqnum against needle. */
1808 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1815 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1819 if (le64toh(o->entry.seqnum) == needle)
1821 else if (le64toh(o->entry.seqnum) < needle)
/* Seeks to an entry by sequence number by bisecting the global entry
 * array (header entry_array_offset / n_entries) with
 * generic_array_bisect(). */
1827 int journal_file_move_to_entry_by_seqnum(
1830 direction_t direction,
1834 return generic_array_bisect(f,
1835 le64toh(f->header->entry_array_offset),
1836 le64toh(f->header->n_entries),
/* Bisection callback: maps the ENTRY object at p and compares its realtime
 * timestamp against needle. */
1843 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1850 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1854 if (le64toh(o->entry.realtime) == needle)
1856 else if (le64toh(o->entry.realtime) < needle)
/* Seeks to an entry by realtime timestamp by bisecting the global entry
 * array (header entry_array_offset / n_entries) with
 * generic_array_bisect() and test_object_realtime(). */
1862 int journal_file_move_to_entry_by_realtime(
1865 direction_t direction,
1869 return generic_array_bisect(f,
1870 le64toh(f->header->entry_array_offset),
1871 le64toh(f->header->n_entries),
1873 test_object_realtime,
/* Bisection callback: maps the ENTRY object at p and compares its
 * monotonic timestamp against needle. */
1878 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1885 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1889 if (le64toh(o->entry.monotonic) == needle)
1891 else if (le64toh(o->entry.monotonic) < needle)
/* Looks up the DATA object for "_BOOT_ID=<id>". The buffer holds the
 * 9-byte prefix, 32 characters for the formatted ID and a trailing NUL;
 * the ID is written right after the prefix and the lookup excludes the
 * NUL. */
1897 static inline int find_data_object_by_boot_id(
1902 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1904 sd_id128_to_string(boot_id, t + 9);
1905 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
/* Seeks to an entry by monotonic timestamp within one boot. The _BOOT_ID
 * data object for boot_id is looked up first; its entry list (inline
 * entry_offset plus entry_array_offset chain, n_entries long) is then
 * bisected with generic_array_bisect_plus_one() and
 * test_object_monotonic(). */
1908 int journal_file_move_to_entry_by_monotonic(
1912 direction_t direction,
1921 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1927 return generic_array_bisect_plus_one(f,
1928 le64toh(o->data.entry_offset),
1929 le64toh(o->data.entry_array_offset),
1930 le64toh(o->data.n_entries),
1932 test_object_monotonic,
/* Moves to the entry following (DIRECTION_DOWN) or preceding the entry o
 * at offset p in the global entry array. Without a current entry the
 * index starts at 0 for DIRECTION_DOWN and at n - 1 otherwise. With a
 * current entry, its index is located by bisection and then adjusted
 * according to the direction. The target is fetched with
 * generic_array_get(); a returned offset that does not advance strictly
 * in the requested direction relative to p is logged as entry array
 * corruption. */
1937 int journal_file_next_entry(
1939 Object *o, uint64_t p,
1940 direction_t direction,
1941 Object **ret, uint64_t *offset) {
1947 assert(p > 0 || !o);
1949 n = le64toh(f->header->n_entries);
1954 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1956 if (o->object.type != OBJECT_ENTRY)
1959 r = generic_array_bisect(f,
1960 le64toh(f->header->entry_array_offset),
1961 le64toh(f->header->n_entries),
1970 if (direction == DIRECTION_DOWN) {
1983 /* And jump to it */
1984 r = generic_array_get(f,
1985 le64toh(f->header->entry_array_offset),
1992 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
1993 log_debug("%s: entry array corrupted at entry %"PRIu64,
/* Moves a signed number of positions ("skip") away from the entry (o, p) in
 * the file-wide entry array and returns the result of generic_array_get()
 * for the new index. The index of the current entry is found by bisecting
 * the entry array. */
2004 int journal_file_skip_entry(
2006 Object *o, uint64_t p,
2008 Object **ret, uint64_t *offset) {
/* The object we start from has to be an entry */
2017 if (o->object.type != OBJECT_ENTRY)
2020 r = generic_array_bisect(f,
2021 le64toh(f->header->entry_array_offset),
2022 le64toh(f->header->n_entries),
2031 /* Calculate new index */
/* Negative skip: the index is unsigned, so a step that would reach or pass
 * index 0 is detected first instead of letting the subtraction wrap around.
 * NOTE(review): the outcome of that case is not visible here. */
2033 if ((uint64_t) -skip >= i)
2036 i = i - (uint64_t) -skip;
/* Non-negative skip: move towards higher indices */
2038 i += (uint64_t) skip;
/* NOTE(review): n is presumably used to bound i before the lookup below --
 * the check itself is not visible here. */
2040 n = le64toh(f->header->n_entries);
2047 return generic_array_get(f,
2048 le64toh(f->header->entry_array_offset),
/* Like journal_file_next_entry(), but iterates over the entries referenced
 * by one data object (located at data_offset) instead of the file-wide
 * entry array. The current position is found by bisecting that data
 * object's entry list; the result comes from generic_array_get_plus_one().
 * NOTE(review): presumably, when no current entry is passed (o == NULL),
 * iteration starts at index 0 for DIRECTION_DOWN and at the last index
 * otherwise -- the branch structure is not visible here, confirm. */
2053 int journal_file_next_entry_for_data(
2055 Object *o, uint64_t p,
2056 uint64_t data_offset,
2057 direction_t direction,
2058 Object **ret, uint64_t *offset) {
/* If an object is passed, its offset must be passed along with it */
2065 assert(p > 0 || !o);
2067 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
/* Number of entries that reference this data object */
2071 n = le64toh(d->data.n_entries);
2076 i = direction == DIRECTION_DOWN ? 0 : n - 1;
/* The object we start from has to be an entry */
2078 if (o->object.type != OBJECT_ENTRY)
2081 r = generic_array_bisect_plus_one(f,
2082 le64toh(d->data.entry_offset),
2083 le64toh(d->data.entry_array_offset),
2084 le64toh(d->data.n_entries),
2094 if (direction == DIRECTION_DOWN) {
2108 return generic_array_get_plus_one(f,
2109 le64toh(d->data.entry_offset),
2110 le64toh(d->data.entry_array_offset),
/* Loads the data object at data_offset and bisects the list of entries
 * referencing it (first entry via entry_offset, the rest via the entry
 * array); returns the result of that bisection.
 * NOTE(review): going by the name the search key is an entry offset; the
 * key, callback and output arguments are not visible here -- confirm. */
2115 int journal_file_move_to_entry_by_offset_for_data(
2117 uint64_t data_offset,
2119 direction_t direction,
2120 Object **ret, uint64_t *offset) {
2127 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2131 return generic_array_bisect_plus_one(f,
2132 le64toh(d->data.entry_offset),
2133 le64toh(d->data.entry_array_offset),
2134 le64toh(d->data.n_entries),
/* Looks for an entry that references the data object at data_offset and
 * belongs to the boot given by boot_id, seeking by monotonic timestamp.
 * Two entry lists are involved: the one of the "_BOOT_ID=" data object
 * (offset remembered in b) and the one of the data object at data_offset.
 * After an initial seek by time in the first list, both lists are bisected
 * in turn until an entry present in both is found.
 * NOTE(review): the loop construct and its termination conditions are not
 * visible here. */
2141 int journal_file_move_to_entry_by_monotonic_for_data(
2143 uint64_t data_offset,
2146 direction_t direction,
2147 Object **ret, uint64_t *offset) {
2155 /* First, seek by time */
2156 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2162 r = generic_array_bisect_plus_one(f,
2163 le64toh(o->data.entry_offset),
2164 le64toh(o->data.entry_array_offset),
2165 le64toh(o->data.n_entries),
2167 test_object_monotonic,
2173 /* And now, continue seeking until we find an entry that
2174 * exists in both bisection arrays */
/* Bisect the entry list of the data object the caller asked for */
2180 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2184 r = generic_array_bisect_plus_one(f,
2185 le64toh(d->data.entry_offset),
2186 le64toh(d->data.entry_array_offset),
2187 le64toh(d->data.n_entries),
/* Fetch the boot ID data object again through its saved offset b, then
 * bisect its entry list */
2195 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2199 r = generic_array_bisect_plus_one(f,
2200 le64toh(o->data.entry_offset),
2201 le64toh(o->data.entry_array_offset),
2202 le64toh(o->data.n_entries),
/* Loads the data object at data_offset and bisects the list of entries
 * referencing it (first entry via entry_offset, the rest via the entry
 * array); returns the result of that bisection.
 * NOTE(review): going by the name the search key is a sequence number; the
 * key, callback and output arguments are not visible here -- confirm. */
2224 int journal_file_move_to_entry_by_seqnum_for_data(
2226 uint64_t data_offset,
2228 direction_t direction,
2229 Object **ret, uint64_t *offset) {
2236 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2240 return generic_array_bisect_plus_one(f,
2241 le64toh(d->data.entry_offset),
2242 le64toh(d->data.entry_array_offset),
2243 le64toh(d->data.n_entries),
/* Loads the data object at data_offset and bisects the list of entries
 * referencing it by realtime timestamp, using test_object_realtime() as the
 * comparison callback; returns the result of that bisection.
 * NOTE(review): the key and output arguments are not visible here. */
2250 int journal_file_move_to_entry_by_realtime_for_data(
2252 uint64_t data_offset,
2254 direction_t direction,
2255 Object **ret, uint64_t *offset) {
2262 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2266 return generic_array_bisect_plus_one(f,
2267 le64toh(d->data.entry_offset),
2268 le64toh(d->data.entry_array_offset),
2269 le64toh(d->data.n_entries),
2271 test_object_realtime,
/* Debugging aid: prints the file header and then one "Type: ..." line per
 * object to stdout. Objects are visited in file order, starting right after
 * the header and ending at the object the header names as tail object. */
2276 void journal_file_dump(JournalFile *f) {
2283 journal_file_print_header(f);
/* The first object is located directly behind the header */
2285 p = le64toh(f->header->header_size);
/* NOTE(review): -1 as type presumably means "accept any object type" --
 * confirm against journal_file_move_to_object(). */
2287 r = journal_file_move_to_object(f, -1, p, &o);
2291 switch (o->object.type) {
2294 printf("Type: OBJECT_UNUSED\n");
2298 printf("Type: OBJECT_DATA\n");
2302 printf("Type: OBJECT_FIELD\n");
2306 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2307 le64toh(o->entry.seqnum),
2308 le64toh(o->entry.monotonic),
2309 le64toh(o->entry.realtime));
2312 case OBJECT_FIELD_HASH_TABLE:
2313 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2316 case OBJECT_DATA_HASH_TABLE:
2317 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2320 case OBJECT_ENTRY_ARRAY:
2321 printf("Type: OBJECT_ENTRY_ARRAY\n");
2325 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2326 le64toh(o->tag.seqnum),
2327 le64toh(o->tag.epoch));
2331 printf("Type: unknown (%u)\n", o->object.type);
2335 if (o->object.flags & OBJECT_COMPRESSED)
2336 printf("Flags: COMPRESSED\n");
/* The tail object is the last one to visit */
2338 if (p == le64toh(f->header->tail_object_offset))
/* Otherwise advance by the object's size, rounded up with ALIGN64() */
2341 p = p + ALIGN64(le64toh(o->object.size));
/* NOTE(review): presumably reached when loading an object failed -- the
 * jump to this statement is not visible here. */
2346 log_error("File corrupt");
/* Wrapper around format_timestamp() that formats t into buf (l bytes).
 * NOTE(review): how the result x is post-processed is not visible here;
 * since the return value is passed straight to a "%s" conversion by
 * journal_file_print_header(), it presumably substitutes a printable
 * fallback when formatting fails -- confirm. */
2349 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2352 x = format_timestamp(buf, l, t);
/* Prints a human readable summary of the journal file header to stdout:
 * IDs, state, flags, sizes, sequence numbers, timestamps and object
 * counters. All multi-byte header fields are converted from little-endian
 * before printing. Counters that were added to the header format later are
 * only printed if this file's header actually contains them. Finally the
 * disk usage of the file is printed if fstat() succeeds. */
2358 void journal_file_print_header(JournalFile *f) {
/* Scratch buffers for the four 128 bit IDs, the timestamps and the disk
 * usage string */
2359 char a[33], b[33], c[33], d[33];
2360 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2362 char bytes[FORMAT_BYTES_MAX];
2366 printf("File Path: %s\n"
2370 "Sequential Number ID: %s\n"
2372 "Compatible Flags:%s%s\n"
2373 "Incompatible Flags:%s%s\n"
2374 "Header size: %"PRIu64"\n"
2375 "Arena size: %"PRIu64"\n"
2376 "Data Hash Table Size: %"PRIu64"\n"
2377 "Field Hash Table Size: %"PRIu64"\n"
2378 "Rotate Suggested: %s\n"
2379 "Head Sequential Number: %"PRIu64"\n"
2380 "Tail Sequential Number: %"PRIu64"\n"
2381 "Head Realtime Timestamp: %s\n"
2382 "Tail Realtime Timestamp: %s\n"
2383 "Tail Monotonic Timestamp: %s\n"
2384 "Objects: %"PRIu64"\n"
2385 "Entry Objects: %"PRIu64"\n",
2387 sd_id128_to_string(f->header->file_id, a),
2388 sd_id128_to_string(f->header->machine_id, b),
2389 sd_id128_to_string(f->header->boot_id, c),
2390 sd_id128_to_string(f->header->seqnum_id, d),
2391 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2392 f->header->state == STATE_ONLINE ? "ONLINE" :
2393 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2394 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2395 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2396 JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2397 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2398 le64toh(f->header->header_size),
2399 le64toh(f->header->arena_size),
2400 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2401 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2402 yes_no(journal_file_rotate_suggested(f, 0)),
2403 le64toh(f->header->head_entry_seqnum),
2404 le64toh(f->header->tail_entry_seqnum),
2405 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2406 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2407 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2408 le64toh(f->header->n_objects),
2409 le64toh(f->header->n_entries));
/* Fill level: number of objects relative to the number of hash table items
 * (table size in bytes divided by sizeof(HashItem)), in percent */
2411 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2412 printf("Data Objects: %"PRIu64"\n"
2413 "Data Hash Table Fill: %.1f%%\n",
2414 le64toh(f->header->n_data),
2415 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2417 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2418 printf("Field Objects: %"PRIu64"\n"
2419 "Field Hash Table Fill: %.1f%%\n",
2420 le64toh(f->header->n_fields),
2421 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2423 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2424 printf("Tag Objects: %"PRIu64"\n",
2425 le64toh(f->header->n_tags));
2426 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2427 printf("Entry Array Objects: %"PRIu64"\n",
2428 le64toh(f->header->n_entry_arrays));
/* Disk usage is derived from the allocated block count (st_blocks times
 * 512 bytes), not from the apparent file size; silently skipped if fstat()
 * fails */
2430 if (fstat(f->fd, &st) >= 0)
2431 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
/* Opens (and, if the file is empty and writable, initializes) a journal
 * file and hands the new JournalFile object back via ret.
 *
 * Rough sequence, as far as visible here: validate access mode and file
 * name suffix, allocate the JournalFile, take a reference on (or create) an
 * mmap cache, open and fstat() the file, initialize the header of an empty
 * writable file, map the header, verify the header of a pre-existing file,
 * set up the metrics, refresh the header, set up HMAC, and finally set up
 * (new file only) and map the field and data hash tables.
 *
 * metrics may be filled in by journal_default_metrics(); otherwise the
 * metrics of template are used if a template is given.
 * NOTE(review): the error returns of the individual steps and the exact
 * conditions of some branches are not visible here. */
2434 int journal_file_open(
2440 JournalMetrics *metrics,
2441 MMapCache *mmap_cache,
2442 JournalFile *template,
2443 JournalFile **ret) {
2447 bool newly_created = false;
/* Only O_RDONLY and O_RDWR pass this check */
2452 if ((flags & O_ACCMODE) != O_RDONLY &&
2453 (flags & O_ACCMODE) != O_RDWR)
/* Only names ending in ".journal" or ".journal~" pass this check */
2456 if (!endswith(fname, ".journal") &&
2457 !endswith(fname, ".journal~"))
2460 f = new0(JournalFile, 1);
2468 f->prot = prot_from_flags(flags);
2469 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2471 f->compress = compress;
/* Either share the caller's mmap cache or create a private one */
2478 f->mmap = mmap_cache_ref(mmap_cache);
2480 f->mmap = mmap_cache_new();
2487 f->path = strdup(fname);
2493 f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2494 if (!f->chain_cache) {
2499 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2505 if (fstat(f->fd, &f->last_stat) < 0) {
/* A writable file of size zero is treated as a new journal file that still
 * has to be initialized */
2510 if (f->last_stat.st_size == 0 && f->writable) {
2513 /* Let's attach the creation time to the journal file,
2514 * so that the vacuuming code knows the age of this
2515 * file even if the file might end up corrupted one
2516 * day... Ideally we'd just use the creation time many
2517 * file systems maintain for each file, but there is
2518 * currently no usable API to query this, hence let's
2519 * emulate this via extended attributes. If extended
2520 * attributes are not supported we'll just skip this,
2521 * and rely solely on mtime/atime/ctime of the file.*/
/* The timestamp is stored little-endian; the result of fsetxattr() is not
 * checked, in line with the best-effort approach described above */
2523 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2524 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2527 /* Try to load the FSPRG state, and if we can't, then
2528 * just don't do sealing */
2530 r = journal_file_fss_load(f);
2536 r = journal_file_init_header(f, template);
/* Refresh the cached stat data after the header has been initialized */
2540 if (fstat(f->fd, &f->last_stat) < 0) {
2545 newly_created = true;
/* The file must at least hold the smallest header this code understands */
2548 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
/* Map the header as a shared mapping, page aligned in size */
2553 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2554 if (f->header == MAP_FAILED) {
/* Headers of pre-existing files are verified; a header that was just
 * initialized above is not */
2560 if (!newly_created) {
2561 r = journal_file_verify_header(f);
2567 if (!newly_created && f->writable) {
2568 r = journal_file_fss_load(f);
/* Metrics: either complete the caller's metrics with defaults and copy
 * them, or inherit them from the template file */
2576 journal_default_metrics(metrics, f->fd);
2577 f->metrics = *metrics;
2578 } else if (template)
2579 f->metrics = template->metrics;
2581 r = journal_file_refresh_header(f);
2587 r = journal_file_hmac_setup(f);
/* New files need their hash tables (and first tag) created before the
 * tables can be mapped */
2592 if (newly_created) {
2593 r = journal_file_setup_field_hash_table(f);
2597 r = journal_file_setup_data_hash_table(f);
2602 r = journal_file_append_first_tag(f);
2608 r = journal_file_map_field_hash_table(f);
2612 r = journal_file_map_data_hash_table(f);
/* NOTE(review): presumably the common failure path, releasing everything
 * acquired so far -- the label/jumps are not visible here */
2620 journal_file_close(f);
/* Rotates a journal file: the current file is renamed to an archive name,
 * marked as archived, and a fresh file is opened under the original path.
 *
 * The archive name is the original path with its ".journal" suffix replaced
 * by "@<seqnum id>-<head seqnum>-<head realtime>.journal" (both numbers as
 * 16 digit hex). The new file is opened with the old file's flags and mode,
 * shares its mmap cache and uses the old file as template; the old file is
 * closed afterwards.
 * NOTE(review): old_file is presumably *f, and *f is presumably updated to
 * the new file -- neither assignment is visible here. */
2625 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2626 _cleanup_free_ char *p = NULL;
2628 JournalFile *old_file, *new_file = NULL;
/* Only writable files whose name ends in ".journal" pass these checks */
2636 if (!old_file->writable)
2639 if (!endswith(old_file->path, ".journal"))
2642 l = strlen(old_file->path);
/* l - 8 cuts off the 8 characters of ".journal" */
2643 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2644 (int) l - 8, old_file->path,
2645 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2646 le64toh((*f)->header->head_entry_seqnum),
2647 le64toh((*f)->header->head_entry_realtime));
2651 r = rename(old_file->path, p);
/* The renamed file is marked as archived in its header */
2655 old_file->header->state = STATE_ARCHIVED;
/* old_file->path still holds the original name, so this opens a new file
 * in place of the one just renamed away */
2657 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2658 journal_file_close(old_file);
/* Like journal_file_open(), but if opening fails with one of a fixed set of
 * errors, the offending file is renamed out of the way (to a name ending in
 * ".journal~") and the open is attempted a second, final time.
 *
 * The rename-and-retry path is only considered for files that are opened
 * writable, with O_CREAT, and whose name ends in ".journal".
 * NOTE(review): presumably every other result of the first open, success
 * included, is returned unchanged -- those returns are not visible here. */
2664 int journal_file_open_reliably(
2670 JournalMetrics *metrics,
2671 MMapCache *mmap_cache,
2672 JournalFile *template,
2673 JournalFile **ret) {
2677 _cleanup_free_ char *p = NULL;
2679 r = journal_file_open(fname, flags, mode, compress, seal,
2680 metrics, mmap_cache, template, ret);
/* Errors that qualify for the rename-and-retry path */
2681 if (r != -EBADMSG && /* corrupted */
2682 r != -ENODATA && /* truncated */
2683 r != -EHOSTDOWN && /* other machine */
2684 r != -EPROTONOSUPPORT && /* incompatible feature */
2685 r != -EBUSY && /* unclean shutdown */
2686 r != -ESHUTDOWN /* already archived */)
/* Read-only access: no renaming */
2689 if ((flags & O_ACCMODE) == O_RDONLY)
/* Caller did not ask for file creation: no replacement file */
2692 if (!(flags & O_CREAT))
2695 if (!endswith(fname, ".journal"))
2698 /* The file is corrupted. Rotate it away and try it again (but only once) */
/* The new name carries the current realtime clock and a second 64 bit
 * value, both as 16 digit hex, and ends in ".journal~" */
2701 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
2703 (unsigned long long) now(CLOCK_REALTIME),
2707 r = rename(fname, p);
2711 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2713 return journal_file_open(fname, flags, mode, compress, seal,
2714 metrics, mmap_cache, template, ret);
/* Copies the entry o (located at offset p in "from") into the journal file
 * "to". The entry's timestamps are taken over; each data object the entry
 * references is read from "from" (decompressed if necessary) and appended
 * to "to", and the new entry is then appended with items pointing at the
 * appended data objects. seqnum, ret and offset are handed through to
 * journal_file_append_entry_internal(), whose result is returned. */
2717 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
/* xor_hash accumulates the hashes of all data objects of the entry */
2719 uint64_t q, xor_hash = 0;
2732 ts.monotonic = le64toh(o->entry.monotonic);
2733 ts.realtime = le64toh(o->entry.realtime);
2735 n = journal_file_entry_n_items(o);
2736 /* alloca() can't take 0, hence let's allocate at least one */
2737 items = alloca(sizeof(EntryItem) * MAX(1u, n));
2739 for (i = 0; i < n; i++) {
/* Remember offset and hash of the referenced data object before o is
 * pointed at something else below */
2746 q = le64toh(o->entry.items[i].object_offset);
2747 le_hash = o->entry.items[i].hash;
/* From here on o refers to the data object, no longer to the entry */
2749 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
/* The hash stored in the entry item must match the data object's hash;
 * both are compared in their on-disk (little-endian) form */
2753 if (le_hash != o->data.hash)
/* Payload length: object size minus everything in front of the payload */
2756 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2759 /* We hit the limit on 32bit machines */
2760 if ((uint64_t) t != l)
2763 if (o->object.flags & OBJECT_COMPRESSED) {
/* Decompress into the source file's scratch buffer */
2767 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0))
2770 data = from->compress_buffer;
/* NOTE(review): presumably the branch taken when decompression support is
 * not compiled in -- the surrounding conditional is not visible here */
2773 return -EPROTONOSUPPORT;
2776 data = o->data.payload;
2778 r = journal_file_append_data(to, data, l, &u, &h);
/* u is the data object in the target file, h its offset there */
2782 xor_hash ^= le64toh(u->data.hash);
2783 items[i].object_offset = htole64(h);
2784 items[i].hash = u->data.hash;
/* Point o back at the source entry for the next iteration, as it was
 * redirected to the data object above */
2786 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2791 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
/* Completes the metrics in m: every field that is set to (uint64_t) -1 gets
 * a default, derived from the size of the file system fd lives on where
 * possible; all values are then brought into a consistent, page aligned
 * shape. The resulting values are logged at debug level.
 *
 * Defaults as far as visible here:
 *   max_use   10% of the file system size, limited to the range
 *             DEFAULT_MAX_USE_LOWER..DEFAULT_MAX_USE_UPPER
 *   max_size  one eighth of max_use, at most DEFAULT_MAX_SIZE_UPPER
 *   min_size  JOURNAL_FILE_SIZE_MIN
 *   keep_free 15% of the file system size, at most DEFAULT_KEEP_FREE_UPPER
 * NOTE(review): the conditions selecting the fallback assignments
 * (DEFAULT_MAX_USE_LOWER, DEFAULT_KEEP_FREE) are not visible here;
 * presumably they apply when the file system size could not be
 * determined. */
2794 void journal_default_metrics(JournalMetrics *m, int fd) {
/* Stays 0 if fstatvfs() fails */
2795 uint64_t fs_size = 0;
2797 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2802 if (fstatvfs(fd, &ss) >= 0)
2803 fs_size = ss.f_frsize * ss.f_blocks;
2805 if (m->max_use == (uint64_t) -1) {
2808 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2810 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2811 m->max_use = DEFAULT_MAX_USE_UPPER;
2813 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2814 m->max_use = DEFAULT_MAX_USE_LOWER;
2816 m->max_use = DEFAULT_MAX_USE_LOWER;
/* max_use is page aligned and never smaller than two minimum-size files */
2818 m->max_use = PAGE_ALIGN(m->max_use);
2820 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2821 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2824 if (m->max_size == (uint64_t) -1) {
2825 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2827 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2828 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2830 m->max_size = PAGE_ALIGN(m->max_size);
2832 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2833 m->max_size = JOURNAL_FILE_SIZE_MIN;
/* max_use must leave room for at least two files of max_size */
2835 if (m->max_size*2 > m->max_use)
2836 m->max_use = m->max_size*2;
2838 if (m->min_size == (uint64_t) -1)
2839 m->min_size = JOURNAL_FILE_SIZE_MIN;
2841 m->min_size = PAGE_ALIGN(m->min_size);
2843 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2844 m->min_size = JOURNAL_FILE_SIZE_MIN;
/* min_size wins over max_size if the two contradict each other */
2846 if (m->min_size > m->max_size)
2847 m->max_size = m->min_size;
2850 if (m->keep_free == (uint64_t) -1) {
2853 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2855 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2856 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2859 m->keep_free = DEFAULT_KEEP_FREE;
2862 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2863 format_bytes(a, sizeof(a), m->max_use),
2864 format_bytes(b, sizeof(b), m->max_size),
2865 format_bytes(c, sizeof(c), m->min_size),
2866 format_bytes(d, sizeof(d), m->keep_free));
/* Reports the realtime timestamps of the first and the last entry of the
 * file, as recorded in the header: *from receives head_entry_realtime, *to
 * receives tail_entry_realtime (both converted from little-endian).
 * A timestamp of 0 in the header is handled as a special case.
 * NOTE(review): the result for that case, and whether from/to may be NULL,
 * are not visible here. */
2869 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
/* Comparing the raw field with 0 needs no byte order conversion */
2874 if (f->header->head_entry_realtime == 0)
2877 *from = le64toh(f->header->head_entry_realtime);
2881 if (f->header->tail_entry_realtime == 0)
2884 *to = le64toh(f->header->tail_entry_realtime);
/* Reports the monotonic timestamps of the first and the last entry that
 * belong to the boot given by boot_id. The entries are taken from the entry
 * list of the "_BOOT_ID=" data object of that boot: *from comes from the
 * entry at data.entry_offset, *to from the entry at index n_entries - 1.
 * NOTE(review): the results for "boot ID not found" and "no entries", and
 * whether from/to may be NULL, are not visible here. */
2890 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
/* p remembers the offset of the data object, so that it can be loaded again
 * after o has been pointed at an entry */
2898 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
/* The count is unsigned, so this tests for "no entries at all" */
2902 if (le64toh(o->data.n_entries) <= 0)
/* First entry: referenced directly from the data object */
2906 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2910 *from = le64toh(o->entry.monotonic);
/* o may refer to an entry by now, hence load the data object again */
2914 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* Last entry: index n_entries - 1 of the data object's entry list */
2918 r = generic_array_get_plus_one(f,
2919 le64toh(o->data.entry_offset),
2920 le64toh(o->data.entry_array_offset),
2921 le64toh(o->data.n_entries)-1,
2926 *to = le64toh(o->entry.monotonic);
/* Heuristics telling the caller whether this journal file should be
 * rotated. The checks visible here are: the header is smaller than the
 * current Header structure, the data or field hash table is more than 75%
 * full, data objects exist but no field objects do, and -- only if
 * max_file_usec is non-zero -- the first entry is older than max_file_usec.
 * NOTE(review): the return statements are not visible here; presumably each
 * matching check yields true and false is returned otherwise. */
2932 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2935 /* If we gained new header fields we gained new features,
2936 * hence suggest a rotation */
2937 if (le64toh(f->header->header_size) < sizeof(Header)) {
2938 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2942 /* Let's check if the hash tables grew over a certain fill
2943 * level (75%, borrowing this value from Java's hash table
2944 * implementation), and if so suggest a rotation. To calculate
2945 * the fill level we need the n_data field, which only exists
2946 * in newer versions. */
/* "n * 4 > items * 3" is the integer form of "n / items > 75%" */
2948 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2949 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2950 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2952 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2953 le64toh(f->header->n_data),
2954 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2955 (unsigned long long) f->last_stat.st_size,
2956 f->last_stat.st_size / le64toh(f->header->n_data));
2960 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2961 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2962 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2964 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2965 le64toh(f->header->n_fields),
2966 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2970 /* Are the data objects properly indexed by field objects? */
2971 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2972 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2973 le64toh(f->header->n_data) > 0 &&
2974 le64toh(f->header->n_fields) == 0)
/* Age based check, disabled when max_file_usec is 0 */
2977 if (max_file_usec > 0) {
/* h: realtime timestamp of the first entry, t: current wall clock time */
2980 h = le64toh(f->header->head_entry_realtime);
2981 t = now(CLOCK_REALTIME);
/* h == 0 is excluded from the age comparison */
2983 if (h > 0 && t > h + max_file_usec)