1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
31 #include <attr/xattr.h>
34 #include "journal-def.h"
35 #include "journal-file.h"
36 #include "journal-authenticate.h"
41 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
42 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
44 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
46 /* This is the minimum journal file size */
47 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
49 /* These are the lower and upper bounds if we deduce the max_use value
50 * from the file system size */
51 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
52 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
54 /* This is the upper bound if we deduce max_size from max_use */
55 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
57 /* This is the upper bound if we deduce the keep_free value from the file system size */
59 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
61 /* This is the keep_free value when we can't determine the system size */
63 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
65 /* n_data was the first entry we added after the initial file format design */
66 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
68 /* How many entries to keep in the entry array chain cache at max */
69 #define CHAIN_CACHE_MAX 20
/* Mark the journal file as online. Requires a valid fd and a mapped header,
 * dispatches on the current header state and stores STATE_ONLINE in the
 * header. */
71 int journal_file_set_online(JournalFile *f) {
77 if (!(f->fd >= 0 && f->header))
80 switch(f->header->state) {
85 f->header->state = STATE_ONLINE;
/* Mark the journal file as offline. Requires a valid fd and a mapped header;
 * the current state is compared against STATE_ONLINE before STATE_OFFLINE is
 * stored in the header. */
94 int journal_file_set_offline(JournalFile *f) {
100 if (!(f->fd >= 0 && f->header))
103 if (f->header->state != STATE_ONLINE)
108 f->header->state = STATE_OFFLINE;
/* Tear down a JournalFile. In order: appends a final tag if sealing is enabled
 * on a writable file, closes the fd in the mmap cache, marks the file offline,
 * unmaps the header, closes the fd, and releases the mmap cache reference, the
 * chain cache, the compression buffer, the FSS file mapping or FSPRG state,
 * and the HMAC handle. */
115 void journal_file_close(JournalFile *f) {
119 /* Write the final tag */
120 if (f->seal && f->writable)
121 journal_file_append_tag(f);
124 /* Sync everything to disk, before we mark the file offline */
125 if (f->mmap && f->fd >= 0)
126 mmap_cache_close_fd(f->mmap, f->fd);
128 journal_file_set_offline(f);
131 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
134 close_nointr_nofail(f->fd);
139 mmap_cache_unref(f->mmap);
141 hashmap_free_free(f->chain_cache);
144 free(f->compress_buffer);
149 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
150 else if (f->fsprg_state)
151 free(f->fsprg_state);
156 gcry_md_close(f->hmac);
/* Build a fresh Header in a local struct and write it to the start of the file
 * with pwrite(). The signature, the 64-bit aligned header size, the
 * compressed flag (from f->compress), the sealed flag (from f->seal) and a
 * random file_id are filled in. seqnum_id and tail_entry_seqnum are either
 * copied from the header of 'template', or seqnum_id is set to the new
 * file_id. */
162 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
170 memcpy(h.signature, HEADER_SIGNATURE, 8);
171 h.header_size = htole64(ALIGN64(sizeof(h)));
173 h.incompatible_flags =
174 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
177 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
179 r = sd_id128_randomize(&h.file_id);
184 h.seqnum_id = template->header->seqnum_id;
185 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
187 h.seqnum_id = h.file_id;
189 k = pwrite(f->fd, &h, sizeof(h), 0);
/* Stamp the mapped header with the current machine ID and boot ID, switch the
 * file online and msync() the header page. If the boot ID already stored in
 * the header equals the current one, tail_entry_monotonic_valid is set. */
199 static int journal_file_refresh_header(JournalFile *f) {
205 r = sd_id128_get_machine(&f->header->machine_id);
209 r = sd_id128_get_boot(&boot_id);
213 if (sd_id128_equal(boot_id, f->header->boot_id))
214 f->tail_entry_monotonic_valid = true;
216 f->header->boot_id = boot_id;
218 journal_file_set_online(f);
220 /* Sync the online state to disk */
221 msync(f->header, PAGE_ALIGN(sizeof(Header)), MS_SYNC);
/* Sanity-check the mapped header before the file is used. Checks performed:
 *  - the signature matches HEADER_SIGNATURE
 *  - no unknown incompatible flags (and, for writing, no unknown compatible
 *    flags); violations yield -EPROTONOSUPPORT
 *  - state is below _STATE_MAX
 *  - header_size is at least HEADER_SIZE_MIN
 *  - sealed files carry the n_entry_arrays field
 *  - header_size + arena_size does not exceed the size in f->last_stat
 *  - tail_object_offset lies within header + arena
 *  - the hash table, tail object and entry array offsets are 64-bit valid and
 *    not below header_size
 *  - the machine ID in the header is compared with the local machine ID
 *  - the state is inspected; an ONLINE or unknown state is logged
 * Finally f->compress and f->seal are derived from the header flags.
 *
 * NOTE(review): two alternative checks appear for each flag word (one masking
 * the known flag, one requiring zero) - presumably selected by build-time
 * feature macros; confirm. */
227 static int journal_file_verify_header(JournalFile *f) {
230 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
233 /* In both read and write mode we refuse to open files with
234 * incompatible flags we don't know */
236 if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
237 return -EPROTONOSUPPORT;
239 if (f->header->incompatible_flags != 0)
240 return -EPROTONOSUPPORT;
243 /* When open for writing we refuse to open files with
244 * compatible flags, too */
247 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
248 return -EPROTONOSUPPORT;
250 if (f->header->compatible_flags != 0)
251 return -EPROTONOSUPPORT;
255 if (f->header->state >= _STATE_MAX)
258 /* The first addition was n_data, so check that we are at least this large */
259 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
262 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
265 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
268 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
271 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
272 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
273 !VALID64(le64toh(f->header->tail_object_offset)) ||
274 !VALID64(le64toh(f->header->entry_array_offset)))
277 if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
278 le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
279 le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
280 le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
285 sd_id128_t machine_id;
288 r = sd_id128_get_machine(&machine_id);
292 if (!sd_id128_equal(machine_id, f->header->machine_id))
295 state = f->header->state;
297 if (state == STATE_ONLINE) {
298 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
300 } else if (state == STATE_ARCHIVED)
302 else if (state != STATE_OFFLINE) {
303 log_debug("Journal file %s has unknown state %u.", f->path, state);
308 f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
310 f->seal = JOURNAL_HEADER_SEALED(f->header);
/* Grow the file so that the range [offset, offset + size) is backed by disk
 * space. The current size is taken as header_size + arena_size; the new size
 * is offset + size rounded up to a page and is at least header_size. Nothing
 * is done if the file is already large enough. The request is checked
 * against metrics.max_size and, above metrics.min_size, against the free
 * space reported by fstatvfs() minus metrics.keep_free. The extra range is
 * then reserved with posix_fallocate(), f->last_stat is refreshed and
 * arena_size in the header is updated.
 *
 * NOTE(review): free space is computed from f_bfree, which also counts blocks
 * reserved for privileged users; f_bavail may be what is intended - confirm.
 * NOTE(review): offset + size is not checked for uint64_t wrap-around here;
 * verify that callers bound both values. */
315 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
316 uint64_t old_size, new_size;
321 /* We assume that this file is not sparse, and we know that
322 * for sure, since we always call posix_fallocate()
326 le64toh(f->header->header_size) +
327 le64toh(f->header->arena_size);
329 new_size = PAGE_ALIGN(offset + size);
330 if (new_size < le64toh(f->header->header_size))
331 new_size = le64toh(f->header->header_size);
333 if (new_size <= old_size)
336 if (f->metrics.max_size > 0 &&
337 new_size > f->metrics.max_size)
340 if (new_size > f->metrics.min_size &&
341 f->metrics.keep_free > 0) {
344 if (fstatvfs(f->fd, &svfs) >= 0) {
347 available = svfs.f_bfree * svfs.f_bsize;
349 if (available >= f->metrics.keep_free)
350 available -= f->metrics.keep_free;
354 if (new_size - old_size > available)
359 /* Note that the glibc fallocate() fallback is very
360 inefficient, hence we try to minimize the allocation area
362 r = posix_fallocate(f->fd, old_size, new_size - old_size);
366 if (fstat(f->fd, &f->last_stat) < 0)
369 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
/* Map 'size' bytes at file offset 'offset' through the mmap cache and hand the
 * resulting pointer back via 'ret'. To avoid SIGBUS the range is first
 * checked against the cached file size in f->last_stat; if it appears out of
 * range the stat data is refreshed once with fstat(), and -EADDRNOTAVAIL is
 * returned if the range is still beyond the end of the file (or fstat()
 * fails). */
374 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
381 /* Avoid SIGBUS on invalid accesses */
382 if (offset + size > (uint64_t) f->last_stat.st_size) {
383 /* Hmm, out of range? Let's refresh the fstat() data
384 * first, before we trust that check. */
386 if (fstat(f->fd, &f->last_stat) < 0 ||
387 offset + size > (uint64_t) f->last_stat.st_size)
388 return -EADDRNOTAVAIL;
391 return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
/* Return the smallest valid size for an object of o's type. The value comes
 * from a per-type table of struct sizes; types outside the table, or with no
 * table entry, fall back to sizeof(ObjectHeader). */
394 static uint64_t minimum_header_size(Object *o) {
396 static uint64_t table[] = {
397 [OBJECT_DATA] = sizeof(DataObject),
398 [OBJECT_FIELD] = sizeof(FieldObject),
399 [OBJECT_ENTRY] = sizeof(EntryObject),
400 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
401 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
402 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
403 [OBJECT_TAG] = sizeof(TagObject),
406 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
407 return sizeof(ObjectHeader);
409 return table[o->object.type];
/* Map the object located at 'offset' and validate its header. The offset must
 * be 64-bit valid. Only the ObjectHeader is mapped at first; the object size
 * must be at least sizeof(ObjectHeader) and at least the minimum for its
 * type, the type must be greater than OBJECT_UNUSED, and, if 'type' > 0, the
 * object type must equal 'type'. Objects larger than the bare header are
 * then mapped again with their full size.
 *
 * NOTE(review): the first mapping uses 'context', which is clamped to 0 for
 * types outside (0, _OBJECT_TYPE_MAX), while the second mapping passes
 * o->object.type unclamped as the context - confirm this is intended. */
412 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
422 /* Objects may only be located at multiple of 64 bit */
423 if (!VALID64(offset))
426 /* One context for each type, plus one catch-all for the rest */
427 context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
429 r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
434 s = le64toh(o->object.size);
436 if (s < sizeof(ObjectHeader))
439 if (o->object.type <= OBJECT_UNUSED)
442 if (s < minimum_header_size(o))
445 if (type > 0 && o->object.type != type)
448 if (s > sizeof(ObjectHeader)) {
449 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
/* Compute the sequence number for the next entry: one more than the header's
 * tail_entry_seqnum, reconciled with the optional external counter 'seqnum'
 * as described in the comment below. The result is stored as the new
 * tail_entry_seqnum and, if head_entry_seqnum is still 0, as the
 * head_entry_seqnum too. */
460 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
465 r = le64toh(f->header->tail_entry_seqnum) + 1;
468 /* If an external seqnum counter was passed, we update
469 * both the local and the external one, and set it to
470 * the maximum of both */
478 f->header->tail_entry_seqnum = htole64(r);
480 if (f->header->head_entry_seqnum == 0)
481 f->header->head_entry_seqnum = htole64(r);
/* Append a new object of the given type and size at the end of the arena.
 * The file is switched online first. The new object's offset is either
 * header_size, or the offset just past the current tail object (tail offset
 * plus its 64-bit aligned size). Space is reserved with
 * journal_file_allocate(), the range is mapped, the object's type and size
 * are written, and tail_object_offset and n_objects in the header are
 * updated.
 *
 * 'type' must lie in (0, _OBJECT_TYPE_MAX) and 'size' must be at least
 * sizeof(ObjectHeader); both are enforced with assert(). */
486 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
493 assert(type > 0 && type < _OBJECT_TYPE_MAX);
494 assert(size >= sizeof(ObjectHeader));
498 r = journal_file_set_online(f);
502 p = le64toh(f->header->tail_object_offset);
504 p = le64toh(f->header->header_size);
506 r = journal_file_move_to_object(f, -1, p, &tail);
510 p += ALIGN64(le64toh(tail->object.size));
513 r = journal_file_allocate(f, p, size);
517 r = journal_file_move_to(f, type, false, p, size, &t);
524 o->object.type = type;
525 o->object.size = htole64(size);
527 f->header->tail_object_offset = htole64(p);
528 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
/* Create the data hash table object. Its size in bytes is derived from
 * metrics.max_size (max_size * 4 / 768 / 3 items) and is never smaller than
 * DEFAULT_DATA_HASH_TABLE_SIZE. The table is appended as an
 * OBJECT_DATA_HASH_TABLE object, its items are zeroed, and the offset of the
 * items array and the table size are recorded in the header. */
536 static int journal_file_setup_data_hash_table(JournalFile *f) {
543 /* We estimate that we need 1 hash table entry per 768 of
544 journal file and we want to make sure we never get beyond
545 75% fill level. Calculate the hash table size for the
546 maximum file size based on these metrics. */
548 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
549 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
550 s = DEFAULT_DATA_HASH_TABLE_SIZE;
552 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
554 r = journal_file_append_object(f,
555 OBJECT_DATA_HASH_TABLE,
556 offsetof(Object, hash_table.items) + s,
561 memset(o->hash_table.items, 0, s);
563 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
564 f->header->data_hash_table_size = htole64(s);
/* Create the field hash table object with the fixed size
 * DEFAULT_FIELD_HASH_TABLE_SIZE. The table is appended as an
 * OBJECT_FIELD_HASH_TABLE object, its items are zeroed, and the offset of the
 * items array and the table size are recorded in the header. */
569 static int journal_file_setup_field_hash_table(JournalFile *f) {
576 /* We use a fixed size hash table for the fields as this
577 * number should grow very slowly only */
579 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
580 r = journal_file_append_object(f,
581 OBJECT_FIELD_HASH_TABLE,
582 offsetof(Object, hash_table.items) + s,
587 memset(o->hash_table.items, 0, s);
589 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
590 f->header->field_hash_table_size = htole64(s);
/* Map the data hash table, whose offset and size are read from the header,
 * using the OBJECT_DATA_HASH_TABLE context and store the resulting pointer
 * in f->data_hash_table. */
595 static int journal_file_map_data_hash_table(JournalFile *f) {
602 p = le64toh(f->header->data_hash_table_offset);
603 s = le64toh(f->header->data_hash_table_size);
605 r = journal_file_move_to(f,
606 OBJECT_DATA_HASH_TABLE,
613 f->data_hash_table = t;
/* Map the field hash table, whose offset and size are read from the header,
 * using the OBJECT_FIELD_HASH_TABLE context and store the resulting pointer
 * in f->field_hash_table. */
617 static int journal_file_map_field_hash_table(JournalFile *f) {
624 p = le64toh(f->header->field_hash_table_offset);
625 s = le64toh(f->header->field_hash_table_size);
627 r = journal_file_move_to(f,
628 OBJECT_FIELD_HASH_TABLE,
635 f->field_hash_table = t;
/* Hook the field object 'o' located at 'offset' into the field hash table.
 * The object must be of type OBJECT_FIELD. Its next_hash_offset and
 * head_data_offset are reset, the bucket is chosen as hash modulo the number
 * of HashItems in the table, and the object is either made the bucket head
 * or chained behind the current tail via that object's next_hash_offset.
 * The bucket tail is then pointed at the new object and, if the header has
 * an n_fields counter, it is incremented. */
639 static int journal_file_link_field(
652 if (o->object.type != OBJECT_FIELD)
655 /* This might alter the window we are looking at */
657 o->field.next_hash_offset = o->field.head_data_offset = 0;
659 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
660 p = le64toh(f->field_hash_table[h].tail_hash_offset);
662 f->field_hash_table[h].head_hash_offset = htole64(offset);
664 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
668 o->field.next_hash_offset = htole64(offset);
671 f->field_hash_table[h].tail_hash_offset = htole64(offset);
673 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
674 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
/* Hook the data object 'o' located at 'offset' into the data hash table.
 * The object must be of type OBJECT_DATA. Its chain pointers, entry
 * references and n_entries are reset, the bucket is chosen as hash modulo
 * the number of HashItems in the table, and the object is either made the
 * bucket head or chained behind the current tail via that object's
 * next_hash_offset. The bucket tail is then pointed at the new object and,
 * if the header has an n_data counter, it is incremented. */
679 static int journal_file_link_data(
692 if (o->object.type != OBJECT_DATA)
695 /* This might alter the window we are looking at */
697 o->data.next_hash_offset = o->data.next_field_offset = 0;
698 o->data.entry_offset = o->data.entry_array_offset = 0;
699 o->data.n_entries = 0;
701 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
702 p = le64toh(f->data_hash_table[h].tail_hash_offset);
704 /* Only entry in the hash table is easy */
705 f->data_hash_table[h].head_hash_offset = htole64(offset);
707 /* Move back to the previous data object, to patch in
710 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
714 o->data.next_hash_offset = htole64(offset);
717 f->data_hash_table[h].tail_hash_offset = htole64(offset);
719 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
720 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
/* Look up the field object whose payload equals 'field' ('size' bytes), given
 * its precomputed hash. An empty field hash table is handled up front.
 * Otherwise the bucket chain selected by hash modulo the number of
 * HashItems is walked via next_hash_offset; an object matches when its
 * stored hash, its total object size and its payload bytes all agree.
 * 'field' must be non-NULL and 'size' greater than 0 (asserted). */
725 int journal_file_find_field_object_with_hash(
727 const void *field, uint64_t size, uint64_t hash,
728 Object **ret, uint64_t *offset) {
730 uint64_t p, osize, h;
734 assert(field && size > 0);
736 osize = offsetof(Object, field.payload) + size;
738 if (f->header->field_hash_table_size == 0)
741 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
742 p = le64toh(f->field_hash_table[h].head_hash_offset);
747 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
751 if (le64toh(o->field.hash) == hash &&
752 le64toh(o->object.size) == osize &&
753 memcmp(o->field.payload, field, size) == 0) {
763 p = le64toh(o->field.next_hash_offset);
/* Convenience wrapper: hash 'field' with hash64() and delegate to
 * journal_file_find_field_object_with_hash(). 'field' must be non-NULL and
 * 'size' greater than 0 (asserted). */
769 int journal_file_find_field_object(
771 const void *field, uint64_t size,
772 Object **ret, uint64_t *offset) {
777 assert(field && size > 0);
779 hash = hash64(field, size);
781 return journal_file_find_field_object_with_hash(f,
/* Look up the data object whose payload equals 'data' ('size' bytes), given
 * its precomputed hash. An empty data hash table is handled up front.
 * Otherwise the bucket chain selected by hash modulo the number of
 * HashItems is walked via next_hash_offset, and objects whose stored hash
 * differs are passed over. Objects flagged OBJECT_COMPRESSED are
 * decompressed into f->compress_buffer with uncompress_blob() before the
 * comparison; uncompressed objects are compared by total object size and
 * payload bytes.
 *
 * NOTE(review): the -EPROTONOSUPPORT return presumably belongs to a build
 * without compression support - confirm. */
786 int journal_file_find_data_object_with_hash(
788 const void *data, uint64_t size, uint64_t hash,
789 Object **ret, uint64_t *offset) {
791 uint64_t p, osize, h;
795 assert(data || size == 0);
797 osize = offsetof(Object, data.payload) + size;
799 if (f->header->data_hash_table_size == 0)
802 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
803 p = le64toh(f->data_hash_table[h].head_hash_offset);
808 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
812 if (le64toh(o->data.hash) != hash)
815 if (o->object.flags & OBJECT_COMPRESSED) {
819 l = le64toh(o->object.size);
820 if (l <= offsetof(Object, data.payload))
823 l -= offsetof(Object, data.payload);
825 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0))
829 memcmp(f->compress_buffer, data, size) == 0) {
840 return -EPROTONOSUPPORT;
843 } else if (le64toh(o->object.size) == osize &&
844 memcmp(o->data.payload, data, size) == 0) {
856 p = le64toh(o->data.next_hash_offset);
/* Convenience wrapper: hash 'data' with hash64() and delegate to
 * journal_file_find_data_object_with_hash(). 'data' may only be NULL when
 * 'size' is 0 (asserted). */
862 int journal_file_find_data_object(
864 const void *data, uint64_t size,
865 Object **ret, uint64_t *offset) {
870 assert(data || size == 0);
872 hash = hash64(data, size);
874 return journal_file_find_data_object_with_hash(f,
/* Find or create the field object for 'field' ('size' bytes). The field is
 * hashed and looked up first. For a new field an OBJECT_FIELD object is
 * appended, its hash and payload are filled in, it is linked into the field
 * hash table, the mapping is refreshed (linking may have moved the window),
 * and the object is fed to journal_file_hmac_put_object(). 'field' must be
 * non-NULL and 'size' greater than 0 (asserted). */
879 static int journal_file_append_field(
881 const void *field, uint64_t size,
882 Object **ret, uint64_t *offset) {
890 assert(field && size > 0);
892 hash = hash64(field, size);
894 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
908 osize = offsetof(Object, field.payload) + size;
909 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
911 o->field.hash = htole64(hash);
912 memcpy(o->field.payload, field, size);
914 r = journal_file_link_field(f, o, p, hash);
918 /* The linking might have altered the window, so let's
919 * refresh our pointer */
920 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
925 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
/* Find or create the data object for 'data' ('size' bytes). The data is
 * hashed and looked up first. For new data an OBJECT_DATA object is
 * appended and its hash stored. Payloads of at least
 * COMPRESSION_SIZE_THRESHOLD bytes may be stored compressed via
 * compress_blob(), in which case the object size is shrunk and
 * OBJECT_COMPRESSED is set; otherwise the payload is copied verbatim. The
 * object is linked into the data hash table and the mapping refreshed. If
 * the payload contains '=' after its first byte, the part before it is
 * registered as a field object and the data object is pushed onto the
 * front of that field's data list. Finally the object is fed to
 * journal_file_hmac_put_object().
 *
 * NOTE(review): fo->field.head_data_offset is assigned with le64toh(p),
 * while every other store of a host-order value into an on-disk field in
 * this file uses htole64(); the byte transformation is the same but the
 * spelling is inconsistent - confirm and align.
 * NOTE(review): memchr() is reached with data == NULL when size == 0, which
 * the assert permits - confirm this is acceptable. */
939 static int journal_file_append_data(
941 const void *data, uint64_t size,
942 Object **ret, uint64_t *offset) {
948 bool compressed = false;
952 assert(data || size == 0);
954 hash = hash64(data, size);
956 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
970 osize = offsetof(Object, data.payload) + size;
971 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
975 o->data.hash = htole64(hash);
979 size >= COMPRESSION_SIZE_THRESHOLD) {
982 compressed = compress_blob(data, size, o->data.payload, &rsize);
985 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
986 o->object.flags |= OBJECT_COMPRESSED;
988 log_debug("Compressed data object %"PRIu64" -> %"PRIu64, size, rsize);
993 if (!compressed && size > 0)
994 memcpy(o->data.payload, data, size);
996 r = journal_file_link_data(f, o, p, hash);
1000 /* The linking might have altered the window, so let's
1001 * refresh our pointer */
1002 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1006 eq = memchr(data, '=', size);
1007 if (eq && eq > data) {
1011 /* Create field object ... */
1012 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1016 /* ... and link it in. */
1017 o->data.next_field_offset = fo->field.head_data_offset;
1018 fo->field.head_data_offset = le64toh(p);
1022 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
/* Number of EntryItems held by an entry object, derived from the object size
 * minus the offset of the items array. The object type is checked against
 * OBJECT_ENTRY first. */
1036 uint64_t journal_file_entry_n_items(Object *o) {
1039 if (o->object.type != OBJECT_ENTRY)
1042 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
/* Number of 64-bit slots in an entry array object, derived from the object
 * size minus the offset of the items array. The object type is checked
 * against OBJECT_ENTRY_ARRAY first. */
1045 uint64_t journal_file_entry_array_n_items(Object *o) {
1048 if (o->object.type != OBJECT_ENTRY_ARRAY)
1051 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
/* Number of HashItems in a hash table object, derived from the object size
 * minus the offset of the items array. The object type is checked against
 * OBJECT_DATA_HASH_TABLE and OBJECT_FIELD_HASH_TABLE first. */
1054 uint64_t journal_file_hash_table_n_items(Object *o) {
1057 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1058 o->object.type != OBJECT_FIELD_HASH_TABLE)
1061 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
/* Append the entry offset 'p' to the chain of entry array objects that starts
 * at '*first'; '*idx' is the number of entries already in the chain. Both
 * are little-endian values that are updated in place. The chain is walked
 * via next_entry_array_offset; if the target slot falls inside an existing
 * array the offset is stored there and '*idx' is incremented. Otherwise a
 * new OBJECT_ENTRY_ARRAY object is appended, fed to the HMAC, given 'p' in
 * its slot, and linked in either as '*first' or as the successor of the
 * last array; n_entry_arrays is incremented if the header has that field,
 * and '*idx' is incremented. */
1064 static int link_entry_into_array(JournalFile *f,
1069 uint64_t n = 0, ap = 0, q, i, a, hidx;
1077 a = le64toh(*first);
1078 i = hidx = le64toh(*idx);
1081 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1085 n = journal_file_entry_array_n_items(o);
1087 o->entry_array.items[i] = htole64(p);
1088 *idx = htole64(hidx + 1);
1094 a = le64toh(o->entry_array.next_entry_array_offset);
1105 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1106 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1112 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1117 o->entry_array.items[i] = htole64(p);
1120 *first = htole64(q);
1122 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1126 o->entry_array.next_entry_array_offset = htole64(q);
1129 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1130 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1132 *idx = htole64(hidx + 1);
/* Variant of link_entry_into_array() for lists that keep one entry inline:
 * the offset is either stored directly in '*extra', or appended to the
 * array chain using an index that is one less than '*idx'. '*idx' is then
 * incremented. */
1137 static int link_entry_into_array_plus_one(JournalFile *f,
1152 *extra = htole64(p);
1156 i = htole64(le64toh(*idx) - 1);
1157 r = link_entry_into_array(f, first, &i, p);
1162 *idx = htole64(le64toh(*idx) + 1);
/* Register the entry at 'offset' with the data object referenced by item 'i'
 * of entry 'o': the data object is mapped and the entry is linked into its
 * entry list (entry_offset plus entry_array_offset chain) via
 * link_entry_into_array_plus_one(). */
1166 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1173 p = le64toh(o->entry.items[i].object_offset);
1177 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1181 return link_entry_into_array_plus_one(f,
1182 &o->data.entry_offset,
1183 &o->data.entry_array_offset,
/* Make the entry object 'o' at 'offset' reachable. The object must be of type
 * OBJECT_ENTRY. After a full memory barrier the entry is appended to the
 * global entry array chain (entry_array_offset / n_entries in the header).
 * head_entry_realtime is set if still 0, the tail realtime and monotonic
 * timestamps are updated, tail_entry_monotonic_valid is set, and the entry
 * is linked to each of its items via journal_file_link_entry_item(). */
1188 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1196 if (o->object.type != OBJECT_ENTRY)
1199 __sync_synchronize();
1201 /* Link up the entry itself */
1202 r = link_entry_into_array(f,
1203 &f->header->entry_array_offset,
1204 &f->header->n_entries,
1209 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1211 if (f->header->head_entry_realtime == 0)
1212 f->header->head_entry_realtime = o->entry.realtime;
1214 f->header->tail_entry_realtime = o->entry.realtime;
1215 f->header->tail_entry_monotonic = o->entry.monotonic;
1217 f->tail_entry_monotonic_valid = true;
1219 /* Link up the items */
1220 n = journal_file_entry_n_items(o);
1221 for (i = 0; i < n; i++) {
1222 r = journal_file_link_entry_item(f, o, offset, i);
/* Write an entry object for the already-appended 'items'. An OBJECT_ENTRY
 * object sized for n_items EntryItems is appended; its sequence number
 * (from journal_file_entry_seqnum()), items, realtime and monotonic
 * timestamps, xor_hash and the header's boot_id are filled in. The object
 * is fed to journal_file_hmac_put_object() and then linked with
 * journal_file_link_entry(). 'items' may only be NULL when n_items is 0
 * (asserted). */
1230 static int journal_file_append_entry_internal(
1232 const dual_timestamp *ts,
1234 const EntryItem items[], unsigned n_items,
1236 Object **ret, uint64_t *offset) {
1243 assert(items || n_items == 0);
1246 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1248 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1252 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1253 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1254 o->entry.realtime = htole64(ts->realtime);
1255 o->entry.monotonic = htole64(ts->monotonic);
1256 o->entry.xor_hash = htole64(xor_hash);
1257 o->entry.boot_id = f->header->boot_id;
1260 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1265 r = journal_file_link_entry(f, o, np);
/* Notify inotify watchers that the file changed. After a full memory barrier
 * the file is ftruncate()d to the size recorded in f->last_stat; a failure
 * is logged and otherwise ignored. */
1278 void journal_file_post_change(JournalFile *f) {
1281 /* inotify() does not receive IN_MODIFY events from file
1282 * accesses done via mmap(). After each access we hence
1283 * trigger IN_MODIFY by truncating the journal file to its
1284 * current size which triggers IN_MODIFY. */
1286 __sync_synchronize();
1288 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1289 log_error("Failed to truncate file to its own size: %m");
/* qsort() comparator for EntryItems: orders by the host-order value of
 * object_offset. */
1292 static int entry_item_cmp(const void *_a, const void *_b) {
1293 const EntryItem *a = _a, *b = _b;
1295 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1297 if (le64toh(a->object_offset) > le64toh(b->object_offset))
/* Append one log entry consisting of the 'n_iovec' fields in 'iovec'. A local
 * timestamp can be taken with dual_timestamp_get() (presumably when 'ts' is
 * NULL - confirm). If the stored tail monotonic timestamp is valid,
 * ts->monotonic is compared against it. journal_file_maybe_append_tag() is
 * called, each field is stored with journal_file_append_data() while its
 * hash is XORed into xor_hash and its offset and hash are recorded in an
 * EntryItem, the items are sorted by on-disk offset, the entry object is
 * written with journal_file_append_entry_internal(), and
 * journal_file_post_change() is invoked. 'iovec' may only be NULL when
 * n_iovec is 0 (asserted).
 *
 * NOTE(review): the items array is allocated with alloca() and sized by
 * n_iovec; verify that callers bound n_iovec so the stack cannot be
 * exhausted. */
1302 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1306 uint64_t xor_hash = 0;
1307 struct dual_timestamp _ts;
1310 assert(iovec || n_iovec == 0);
1313 dual_timestamp_get(&_ts);
1317 if (f->tail_entry_monotonic_valid &&
1318 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1322 r = journal_file_maybe_append_tag(f, ts->realtime);
1327 /* alloca() can't take 0, hence let's allocate at least one */
1328 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1330 for (i = 0; i < n_iovec; i++) {
1334 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1338 xor_hash ^= le64toh(o->data.hash);
1339 items[i].object_offset = htole64(p);
1340 items[i].hash = o->data.hash;
1343 /* Order by the position on disk, in order to improve seek
1344 * times for rotating media. */
1345 qsort(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1347 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1349 journal_file_post_change(f);
/* One remembered position inside an entry array chain. Items are stored in a
 * hashmap keyed by the 'first' member (see chain_cache_put()). */
1354 typedef struct ChainCacheItem {
1355 uint64_t first; /* the array at the begin of the chain */
1356 uint64_t array; /* the cached array */
1357 uint64_t begin; /* the first item in the cached array */
1358 uint64_t total; /* the total number of items in all arrays before this one in the chain */
/* Remember a position in the entry array chain identified by 'first' in the
 * chain cache 'h'. Nothing is cached when the array to remember is the
 * first one of the chain. A cache item is obtained either by stealing the
 * first item of the hashmap once it holds CHAIN_CACHE_MAX or more items, or
 * by allocating a new one, and is inserted keyed by its 'first' member; an
 * item that already exists must belong to the same chain (asserted). */
1361 static void chain_cache_put(
1370 /* If the chain item to cache for this chain is the
1371 * first one it's not worth caching anything */
1375 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1376 ci = hashmap_steal_first(h);
1378 ci = new(ChainCacheItem, 1);
1385 if (hashmap_put(h, &ci->first, ci) < 0) {
1390 assert(ci->first == first);
/* Fetch the i-th entry of the entry array chain starting at 'first'. The
 * chain cache is consulted first and used when 'i' lies beyond the cached
 * item's 'total'. The chain is then walked via next_entry_array_offset
 * until the array containing index 'i' is found; that position is stored
 * in the chain cache and the referenced OBJECT_ENTRY object is mapped.
 *
 * NOTE(review): o->entry_array.items[0] is passed to chain_cache_put()
 * without le64toh(), while items[i] is converted when read just above; the
 * cached 'begin' is later handed to test_object() as an offset. On
 * big-endian hosts this would be byte-swapped - confirm and convert. */
1397 static int generic_array_get(JournalFile *f,
1400 Object **ret, uint64_t *offset) {
1403 uint64_t p = 0, a, t = 0;
1411 /* Try the chain cache first */
1412 ci = hashmap_get(f->chain_cache, &first);
1413 if (ci && i > ci->total) {
1422 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1426 k = journal_file_entry_array_n_items(o);
1428 p = le64toh(o->entry_array.items[i]);
1434 a = le64toh(o->entry_array.next_entry_array_offset);
1440 /* Let's cache this item for the next invocation */
1441 chain_cache_put(f->chain_cache, ci, first, a, o->entry_array.items[0], t);
1443 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Like generic_array_get(), for lists that keep one entry inline: the entry
 * at offset 'extra' is mapped directly, while other indices are looked up
 * in the array chain at index i-1. */
1456 static int generic_array_get_plus_one(JournalFile *f,
1460 Object **ret, uint64_t *offset) {
1469 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1482 return generic_array_get(f, first, i-1, ret, offset);
/* Bisect the entry array chain starting at 'first' for 'needle'. Candidates
 * are classified by the test_object() callback (TEST_FOUND, TEST_LEFT or
 * TEST_RIGHT); a TEST_FOUND result is remapped to TEST_RIGHT for
 * DIRECTION_DOWN and to TEST_LEFT otherwise. The chain cache is consulted
 * to skip ahead to a previously visited array when the cached begin item
 * tests TEST_LEFT. Within each array a probe of one item is followed by a
 * binary search between 'left' and 'right'; for DIRECTION_UP the result may
 * be stepped back by one (subtract_one). The final position is stored in
 * the chain cache, the selected OBJECT_ENTRY object is mapped, and the
 * index within the whole chain is reported through 'idx'.
 *
 * NOTE(review): array->entry_array.items[0] is passed to chain_cache_put()
 * without le64toh(), unlike the other reads of items[] in this function;
 * on big-endian hosts the cached 'begin' would be byte-swapped - confirm
 * and convert. */
1491 static int generic_array_bisect(JournalFile *f,
1495 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1496 direction_t direction,
1501 uint64_t a, p, t = 0, i = 0, last_p = 0;
1502 bool subtract_one = false;
1503 Object *o, *array = NULL;
1508 assert(test_object);
1510 /* Start with the first array in the chain */
1513 ci = hashmap_get(f->chain_cache, &first);
1514 if (ci && n > ci->total) {
1515 /* Ah, we have iterated this bisection array chain
1516 * previously! Let's see if we can skip ahead in the
1517 * chain, as far as the last time. But we can't jump
1518 * backwards in the chain, so let's check that
1521 r = test_object(f, ci->begin, needle);
1525 if (r == TEST_LEFT) {
1526 /* OK, what we are looking for is right of th
1527 * begin of this EntryArray, so let's jump
1528 * straight to previously cached array in the
1538 uint64_t left, right, k, lp;
1540 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1544 k = journal_file_entry_array_n_items(array);
1550 lp = p = le64toh(array->entry_array.items[i]);
1554 r = test_object(f, p, needle);
1558 if (r == TEST_FOUND)
1559 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1561 if (r == TEST_RIGHT) {
1565 if (left == right) {
1566 if (direction == DIRECTION_UP)
1567 subtract_one = true;
1573 assert(left < right);
1575 i = (left + right) / 2;
1576 p = le64toh(array->entry_array.items[i]);
1580 r = test_object(f, p, needle);
1584 if (r == TEST_FOUND)
1585 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1587 if (r == TEST_RIGHT)
1595 if (direction == DIRECTION_UP) {
1597 subtract_one = true;
1608 a = le64toh(array->entry_array.next_entry_array_offset);
1614 if (subtract_one && t == 0 && i == 0)
1617 /* Let's cache this item for the next invocation */
1618 chain_cache_put(f->chain_cache, ci, first, a, array->entry_array.items[0], t);
1620 if (subtract_one && i == 0)
1622 else if (subtract_one)
1623 p = le64toh(array->entry_array.items[i-1]);
1625 p = le64toh(array->entry_array.items[i]);
1627 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1638 *idx = t + i + (subtract_one ? -1 : 0);
/* Like generic_array_bisect(), for lists that keep one entry inline at offset
 * 'extra'. The inline entry is tested first (TEST_FOUND is remapped
 * according to 'direction' as in generic_array_bisect()). For DIRECTION_UP
 * the inline entry is remembered as a fallback (step_back) in case the
 * array chain yields nothing. The remaining n-1 entries are bisected with
 * generic_array_bisect(); otherwise the inline OBJECT_ENTRY object itself
 * is mapped. */
1643 static int generic_array_bisect_plus_one(JournalFile *f,
1648 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1649 direction_t direction,
1655 bool step_back = false;
1659 assert(test_object);
1664 /* This bisects the array in object 'first', but first checks
1666 r = test_object(f, extra, needle);
1670 if (r == TEST_FOUND)
1671 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1673 /* if we are looking with DIRECTION_UP then we need to first
1674 see if in the actual array there is a matching entry, and
1675 return the last one of that. But if there isn't any we need
1676 to return this one. Hence remember this, and return it
1679 step_back = direction == DIRECTION_UP;
1681 if (r == TEST_RIGHT) {
1682 if (direction == DIRECTION_DOWN)
1688 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1690 if (r == 0 && step_back)
1699 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
/* Bisection callback that compares the candidate offset 'p' itself against
 * 'needle'. */
1715 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1721 else if (p < needle)
/* Seek within the global entry array (entry_array_offset / n_entries from the
 * header) using generic_array_bisect() in the given direction. */
1727 int journal_file_move_to_entry_by_offset(
1730 direction_t direction,
1734 return generic_array_bisect(f,
1735 le64toh(f->header->entry_array_offset),
1736 le64toh(f->header->n_entries),
/* Bisection callback: maps the entry object at 'p' and compares its sequence
 * number against 'needle'. */
1744 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1751 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1755 if (le64toh(o->entry.seqnum) == needle)
1757 else if (le64toh(o->entry.seqnum) < needle)
/* Seek within the global entry array (entry_array_offset / n_entries from the
 * header) using generic_array_bisect() in the given direction. */
1763 int journal_file_move_to_entry_by_seqnum(
1766 direction_t direction,
1770 return generic_array_bisect(f,
1771 le64toh(f->header->entry_array_offset),
1772 le64toh(f->header->n_entries),
/* Bisection callback: maps the entry object at 'p' and compares its realtime
 * timestamp against 'needle'. */
1779 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1786 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1790 if (le64toh(o->entry.realtime) == needle)
1792 else if (le64toh(o->entry.realtime) < needle)
/* Seek within the global entry array (entry_array_offset / n_entries from the
 * header) by realtime timestamp, using generic_array_bisect() with
 * test_object_realtime() in the given direction. */
1798 int journal_file_move_to_entry_by_realtime(
1801 direction_t direction,
1805 return generic_array_bisect(f,
1806 le64toh(f->header->entry_array_offset),
1807 le64toh(f->header->n_entries),
1809 test_object_realtime,
/* Bisection callback: maps the entry object at 'p' and compares its monotonic
 * timestamp against 'needle'. */
1814 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1821 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1825 if (le64toh(o->entry.monotonic) == needle)
1827 else if (le64toh(o->entry.monotonic) < needle)
/* Look up the data object for a boot ID. The lookup key is built in a local
 * buffer as the prefix "_BOOT_ID=" followed by the string form of 'boot_id'
 * (the buffer reserves 32 characters plus a terminator for it), and is
 * passed without the terminator to journal_file_find_data_object(). */
1833 static inline int find_data_object_by_boot_id(
1838 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1840 sd_id128_to_string(boot_id, t + 9);
1841 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
/* Seek by monotonic timestamp within one boot. The data object for 'boot_id'
 * is located with find_data_object_by_boot_id(), and its entry list
 * (entry_offset, entry_array_offset, n_entries) is bisected with
 * generic_array_bisect_plus_one() and test_object_monotonic(). */
1844 int journal_file_move_to_entry_by_monotonic(
1848 direction_t direction,
1857 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1863 return generic_array_bisect_plus_one(f,
1864 le64toh(o->data.entry_offset),
1865 le64toh(o->data.entry_array_offset),
1866 le64toh(o->data.n_entries),
1868 test_object_monotonic,
/* Step to a neighbouring entry in the global entry array. With no current
 * entry (presumably o == NULL - confirm) the index starts at 0 for
 * DIRECTION_DOWN and at n-1 otherwise. With a current entry 'o' at offset
 * 'p', which must be of type OBJECT_ENTRY, its index is located with
 * generic_array_bisect() and then adjusted according to 'direction'. The
 * resulting entry is fetched with generic_array_get(). 'p' must be greater
 * than 0 whenever 'o' is given (asserted). */
1873 int journal_file_next_entry(
1875 Object *o, uint64_t p,
1876 direction_t direction,
1877 Object **ret, uint64_t *offset) {
1883 assert(p > 0 || !o);
1885 n = le64toh(f->header->n_entries);
1890 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1892 if (o->object.type != OBJECT_ENTRY)
1895 r = generic_array_bisect(f,
1896 le64toh(f->header->entry_array_offset),
1897 le64toh(f->header->n_entries),
1906 if (direction == DIRECTION_DOWN) {
1919 /* And jump to it */
1920 return generic_array_get(f,
1921 le64toh(f->header->entry_array_offset),
/* Move 'skip' entries away from the entry 'o' at offset 'p' in the global
 * entry array. 'o' must be of type OBJECT_ENTRY. Its index is located with
 * generic_array_bisect(); a new index is computed by adding 'skip', with
 * backward skips that reach or pass the start of the array handled
 * separately, and the index is checked against n_entries before the target
 * entry is fetched with generic_array_get(). */
1926 int journal_file_skip_entry(
1928 Object *o, uint64_t p,
1930 Object **ret, uint64_t *offset) {
1939 if (o->object.type != OBJECT_ENTRY)
1942 r = generic_array_bisect(f,
1943 le64toh(f->header->entry_array_offset),
1944 le64toh(f->header->n_entries),
1953 /* Calculate new index */
1955 if ((uint64_t) -skip >= i)
1958 i = i - (uint64_t) -skip;
1960 i += (uint64_t) skip;
1962 n = le64toh(f->header->n_entries);
1969 return generic_array_get(f,
1970 le64toh(f->header->entry_array_offset),
/* Step to a neighbouring entry within the entry list of the data object at
 * 'data_offset'. With no current entry (presumably o == NULL - confirm)
 * the index starts at 0 for DIRECTION_DOWN and at n-1 otherwise. With a
 * current entry 'o', which must be of type OBJECT_ENTRY, its index is
 * located with generic_array_bisect_plus_one() and then adjusted according
 * to 'direction'. The resulting entry is fetched with
 * generic_array_get_plus_one(). 'p' must be greater than 0 whenever 'o' is
 * given (asserted). */
1975 int journal_file_next_entry_for_data(
1977 Object *o, uint64_t p,
1978 uint64_t data_offset,
1979 direction_t direction,
1980 Object **ret, uint64_t *offset) {
1987 assert(p > 0 || !o);
1989 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1993 n = le64toh(d->data.n_entries);
1998 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2000 if (o->object.type != OBJECT_ENTRY)
2003 r = generic_array_bisect_plus_one(f,
2004 le64toh(d->data.entry_offset),
2005 le64toh(d->data.entry_array_offset),
2006 le64toh(d->data.n_entries),
2016 if (direction == DIRECTION_DOWN) {
2030 return generic_array_get_plus_one(f,
2031 le64toh(d->data.entry_offset),
2032 le64toh(d->data.entry_array_offset),
/* Seek within the entry list of the data object at 'data_offset': the data
 * object is mapped and its list (entry_offset, entry_array_offset,
 * n_entries) is bisected with generic_array_bisect_plus_one(). */
2037 int journal_file_move_to_entry_by_offset_for_data(
2039 uint64_t data_offset,
2041 direction_t direction,
2042 Object **ret, uint64_t *offset) {
2049 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2053 return generic_array_bisect_plus_one(f,
2054 le64toh(d->data.entry_offset),
2055 le64toh(d->data.entry_array_offset),
2056 le64toh(d->data.n_entries),
/* Seeks, within the entries that reference the data object at
 * `data_offset`, by monotonic timestamp.
 *
 * Two entry lists are involved: the one of the boot ID data object found by
 * find_data_object_by_boot_id() (searched with test_object_monotonic) and
 * the one of the data object at `data_offset`. After the initial seek by
 * time, both lists are bisected in turn until an entry present in both is
 * found. `direction`, `ret` and `offset` come from the caller. */
2063 int journal_file_move_to_entry_by_monotonic_for_data(
2065 uint64_t data_offset,
2068 direction_t direction,
2069 Object **ret, uint64_t *offset) {
2077 /* First, seek by time */
/* `o` receives the boot ID data object, `b` its file offset; the offset is
 * kept so the object can be resolved again further down. */
2078 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2084 r = generic_array_bisect_plus_one(f,
2085 le64toh(o->data.entry_offset),
2086 le64toh(o->data.entry_array_offset),
2087 le64toh(o->data.n_entries),
2089 test_object_monotonic,
2095 /* And now, continue seeking until we find an entry that
2096 * exists in both bisection arrays */
/* Search the entry list of the requested data object. */
2102 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2106 r = generic_array_bisect_plus_one(f,
2107 le64toh(d->data.entry_offset),
2108 le64toh(d->data.entry_array_offset),
2109 le64toh(d->data.n_entries),
/* Resolve the boot ID data object again from its saved offset before
 * searching its entry list.
 * NOTE(review): presumably the earlier `o` pointer cannot be relied upon
 * after moving to other objects - confirm. */
2117 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2121 r = generic_array_bisect_plus_one(f,
2122 le64toh(o->data.entry_offset),
2123 le64toh(o->data.entry_array_offset),
2124 le64toh(o->data.n_entries),
/* Seeks within the entry list of the data object stored at `data_offset`.
 * The data object is loaded first and its entry_offset, entry_array_offset
 * and n_entries fields are passed to generic_array_bisect_plus_one(), whose
 * result is returned directly. `direction`, `ret` and `offset` come from the
 * caller.
 * NOTE(review): going by the function name the bisection key is a sequence
 * number; the corresponding arguments are not visible here. */
2148 int journal_file_move_to_entry_by_seqnum_for_data(
2150 uint64_t data_offset,
2152 direction_t direction,
2153 Object **ret, uint64_t *offset) {
/* Resolve the data object whose entry list is searched. */
2160 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2164 return generic_array_bisect_plus_one(f,
2165 le64toh(d->data.entry_offset),
2166 le64toh(d->data.entry_array_offset),
2167 le64toh(d->data.n_entries),
/* Seeks within the entry list of the data object stored at `data_offset`,
 * using test_object_realtime as the comparison callback. The data object is
 * loaded first and its entry_offset, entry_array_offset and n_entries fields
 * are passed to generic_array_bisect_plus_one(), whose result is returned
 * directly. `direction`, `ret` and `offset` come from the caller. */
2174 int journal_file_move_to_entry_by_realtime_for_data(
2176 uint64_t data_offset,
2178 direction_t direction,
2179 Object **ret, uint64_t *offset) {
/* Resolve the data object whose entry list is searched. */
2186 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2190 return generic_array_bisect_plus_one(f,
2191 le64toh(d->data.entry_offset),
2192 le64toh(d->data.entry_array_offset),
2193 le64toh(d->data.n_entries),
2195 test_object_realtime,
/* Debugging aid: prints the file header followed by one line per object to
 * stdout. Objects are visited in file order, starting right behind the
 * header and advancing by each object's 64-bit aligned size until the
 * header's tail object offset has been handled. */
2200 void journal_file_dump(JournalFile *f) {
2207 journal_file_print_header(f);
/* The first object starts where the header ends. */
2209 p = le64toh(f->header->header_size);
/* NOTE(review): the type argument -1 presumably means "accept any object
 * type" - confirm against journal_file_move_to_object(). */
2211 r = journal_file_move_to_object(f, -1, p, &o);
/* One line per object; entry and tag objects also get their key fields. */
2215 switch (o->object.type) {
2218 printf("Type: OBJECT_UNUSED\n");
2222 printf("Type: OBJECT_DATA\n");
2226 printf("Type: OBJECT_FIELD\n");
2230 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2231 le64toh(o->entry.seqnum),
2232 le64toh(o->entry.monotonic),
2233 le64toh(o->entry.realtime));
2236 case OBJECT_FIELD_HASH_TABLE:
2237 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2240 case OBJECT_DATA_HASH_TABLE:
2241 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2244 case OBJECT_ENTRY_ARRAY:
2245 printf("Type: OBJECT_ENTRY_ARRAY\n");
2249 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2250 le64toh(o->tag.seqnum),
2251 le64toh(o->tag.epoch));
/* Any type value not handled above is printed numerically. */
2255 printf("Type: unknown (%u)\n", o->object.type);
2259 if (o->object.flags & OBJECT_COMPRESSED)
2260 printf("Flags: COMPRESSED\n");
/* The object at the header's tail offset is the last one to visit. */
2262 if (p == le64toh(f->header->tail_object_offset))
/* Advance to the next object: sizes are rounded up to a 64-bit boundary. */
2265 p = p + ALIGN64(le64toh(o->object.size));
/* NOTE(review): presumably the failure path of the object walk - confirm. */
2270 log_error("File corrupt");
/* File-local wrapper around format_timestamp(): formats `t` into `buf`
 * (capacity `l` bytes) and returns a string for printing.
 * NOTE(review): the handling of the format_timestamp() result `x` - in
 * particular what is returned when it is not usable - is decided after the
 * call; confirm the fallback before relying on it. */
2273 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2276 x = format_timestamp(buf, l, t);
/* Prints a human-readable summary of the journal file header to stdout:
 * IDs, state, feature flags, sizes, sequence numbers, timestamps and object
 * counts, followed by the counters that only exist in newer header versions
 * and finally the on-disk usage of the file. */
2282 void journal_file_print_header(JournalFile *f) {
/* Scratch buffers for the four 128-bit IDs rendered by sd_id128_to_string(). */
2283 char a[33], b[33], c[33], d[33];
/* Scratch buffers for the two realtime timestamps and the monotonic span. */
2284 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2286 char bytes[FORMAT_BYTES_MAX];
2290 printf("File Path: %s\n"
2294 "Sequential Number ID: %s\n"
2296 "Compatible Flags:%s%s\n"
2297 "Incompatible Flags:%s%s\n"
2298 "Header size: %"PRIu64"\n"
2299 "Arena size: %"PRIu64"\n"
2300 "Data Hash Table Size: %"PRIu64"\n"
2301 "Field Hash Table Size: %"PRIu64"\n"
2302 "Rotate Suggested: %s\n"
2303 "Head Sequential Number: %"PRIu64"\n"
2304 "Tail Sequential Number: %"PRIu64"\n"
2305 "Head Realtime Timestamp: %s\n"
2306 "Tail Realtime Timestamp: %s\n"
2307 "Tail Monotonic Timestamp: %s\n"
2308 "Objects: %"PRIu64"\n"
2309 "Entry Objects: %"PRIu64"\n",
2311 sd_id128_to_string(f->header->file_id, a),
2312 sd_id128_to_string(f->header->machine_id, b),
2313 sd_id128_to_string(f->header->boot_id, c),
2314 sd_id128_to_string(f->header->seqnum_id, d),
2315 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2316 f->header->state == STATE_ONLINE ? "ONLINE" :
2317 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2318 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2319 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2320 JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2321 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2322 le64toh(f->header->header_size),
2323 le64toh(f->header->arena_size),
2324 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2325 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2326 yes_no(journal_file_rotate_suggested(f, 0)),
2327 le64toh(f->header->head_entry_seqnum),
2328 le64toh(f->header->tail_entry_seqnum),
2329 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2330 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2331 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2332 le64toh(f->header->n_objects),
2333 le64toh(f->header->n_entries));
/* In the call above, " ???" marks flag bits other than the single known one
 * in each flag word, and the hash table sizes (stored in bytes) are divided
 * by sizeof(HashItem) to print them as item counts. */
/* The counters below are only printed if this file's header contains them.
 * The fill level is the object count relative to the hash table's item
 * count, as a percentage. */
2335 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2336 printf("Data Objects: %"PRIu64"\n"
2337 "Data Hash Table Fill: %.1f%%\n",
2338 le64toh(f->header->n_data),
2339 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2341 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2342 printf("Field Objects: %"PRIu64"\n"
2343 "Field Hash Table Fill: %.1f%%\n",
2344 le64toh(f->header->n_fields),
2345 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2347 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2348 printf("Tag Objects: %"PRIu64"\n",
2349 le64toh(f->header->n_tags));
2350 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2351 printf("Entry Array Objects: %"PRIu64"\n",
2352 le64toh(f->header->n_entry_arrays));
/* Disk usage is derived from the allocated block count (st_blocks counts
 * 512-byte units); nothing is printed if fstat() fails. */
2354 if (fstat(f->fd, &st) >= 0)
2355 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
/* Opens, or creates if the file turns out to be empty and writable, the
 * journal file `fname` and sets up a JournalFile object for it.
 *
 * `flags` must request O_RDONLY or O_RDWR access and `fname` must end in
 * ".journal" or ".journal~". `compress` is stored in the new object.
 * `metrics`, if used, is normalised by journal_default_metrics() and copied;
 * otherwise the metrics of `template` are taken over when a template is
 * given. `mmap_cache` is referenced if supplied, else a new cache is
 * created. `template` is also handed to journal_file_init_header() for new
 * files. `ret` is the caller's output slot for the new object.
 * journal_file_close() is called on the object on the failure path. */
2358 int journal_file_open(
2364 JournalMetrics *metrics,
2365 MMapCache *mmap_cache,
2366 JournalFile *template,
2367 JournalFile **ret) {
2371 bool newly_created = false;
/* Only plain read-only or read-write access modes are supported. */
2376 if ((flags & O_ACCMODE) != O_RDONLY &&
2377 (flags & O_ACCMODE) != O_RDWR)
/* Refuse file names that do not carry one of the journal suffixes. */
2380 if (!endswith(fname, ".journal") &&
2381 !endswith(fname, ".journal~"))
2384 f = new0(JournalFile, 1);
2392 f->prot = prot_from_flags(flags);
2393 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2395 f->compress = compress;
/* Share the caller's mmap cache if there is one, otherwise create our own. */
2402 f->mmap = mmap_cache_ref(mmap_cache);
2404 f->mmap = mmap_cache_new();
2411 f->path = strdup(fname);
2417 f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2418 if (!f->chain_cache) {
2423 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2429 if (fstat(f->fd, &f->last_stat) < 0) {
/* An empty file opened for writing is treated as new and gets initialised. */
2434 if (f->last_stat.st_size == 0 && f->writable) {
2438 /* Let's attach the creation time to the journal file,
2439 * so that the vacuuming code knows the age of this
2440 * file even if the file might end up corrupted one
2441 * day... Ideally we'd just use the creation time many
2442 * file systems maintain for each file, but there is
2443 * currently no usable API to query this, hence let's
2444 * emulate this via extended attributes. If extended
2445 * attributes are not supported we'll just skip this,
2446 * and rely solely on mtime/atime/ctime of the file.*/
/* Stored little-endian; the return value is deliberately not checked, in
 * line with the best-effort policy described above. */
2448 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2449 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2453 /* Try to load the FSPRG state, and if we can't, then
2454 * just don't do sealing */
2456 r = journal_file_fss_load(f);
2462 r = journal_file_init_header(f, template);
/* Re-read the file status after the header has been initialised. */
2466 if (fstat(f->fd, &f->last_stat) < 0) {
2471 newly_created = true;
/* A file shorter than the minimal header cannot be a usable journal file. */
2474 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
/* Map the header as a shared mapping at file offset 0. */
2479 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2480 if (f->header == MAP_FAILED) {
/* Existing files get their header verified; new ones were just written. */
2486 if (!newly_created) {
2487 r = journal_file_verify_header(f);
/* Existing writable files load their sealing state at this point. */
2493 if (!newly_created && f->writable) {
2494 r = journal_file_fss_load(f);
2502 journal_default_metrics(metrics, f->fd);
2503 f->metrics = *metrics;
2504 } else if (template)
2505 f->metrics = template->metrics;
2507 r = journal_file_refresh_header(f);
2513 r = journal_file_hmac_setup(f);
/* New files need their hash tables and the first tag created before the
 * tables are mapped below. */
2518 if (newly_created) {
2519 r = journal_file_setup_field_hash_table(f);
2523 r = journal_file_setup_data_hash_table(f);
2528 r = journal_file_append_first_tag(f);
2534 r = journal_file_map_field_hash_table(f);
2538 r = journal_file_map_data_hash_table(f);
/* Failure path: release everything acquired so far. */
2546 journal_file_close(f);
/* Rotates the journal file `*f`: the current file is renamed to an archive
 * name, its header state is set to STATE_ARCHIVED, a fresh file is opened
 * under the original path (using the old file as template and sharing its
 * mmap cache) and the old file object is closed.
 *
 * Only writable files whose path ends in ".journal" are handled. `compress`
 * and `seal` are passed through to journal_file_open() for the new file. */
2551 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2554 JournalFile *old_file, *new_file = NULL;
2562 if (!old_file->writable)
2565 if (!endswith(old_file->path, ".journal"))
2568 l = strlen(old_file->path);
/* Room for the original path plus one separator character, the 32-character
 * seqnum ID, "-" and 16 hex digits twice, and the terminating NUL. */
2570 p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
/* Copy the path without its 8-character ".journal" suffix. */
2574 memcpy(p, old_file->path, l - 8);
/* The seqnum ID goes one character behind the copied prefix.
 * NOTE(review): that one character is presumably the separator - the
 * statement writing it is not visible here. */
2576 sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
/* Append "-<head seqnum>-<head realtime>.journal", both as 16 hex digits. */
2577 snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2578 "-%016"PRIx64"-%016"PRIx64".journal",
2579 le64toh((*f)->header->head_entry_seqnum),
2580 le64toh((*f)->header->head_entry_realtime));
2582 r = rename(old_file->path, p);
/* Mark the renamed file as archived in its header. */
2588 old_file->header->state = STATE_ARCHIVED;
/* Open the replacement under the old path, then drop the old file object. */
2590 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2591 journal_file_close(old_file);
/* Wrapper around journal_file_open() that gets an unusable journal file out
 * of the way and retries once.
 *
 * All parameters are passed through unchanged to journal_file_open(). If
 * the first attempt returns one of the error codes listed below, and the
 * remaining checks on `flags` and `fname` pass, the file is renamed to a
 * "...@<hex>-<hex>.journal~" name and journal_file_open() is called a second
 * time with the same arguments. */
2597 int journal_file_open_reliably(
2603 JournalMetrics *metrics,
2604 MMapCache *mmap_cache,
2605 JournalFile *template,
2606 JournalFile **ret) {
/* Freed automatically when the function returns. */
2610 _cleanup_free_ char *p = NULL;
2612 r = journal_file_open(fname, flags, mode, compress, seal,
2613 metrics, mmap_cache, template, ret);
/* Only these results qualify for the rename-and-retry treatment. */
2614 if (r != -EBADMSG && /* corrupted */
2615 r != -ENODATA && /* truncated */
2616 r != -EHOSTDOWN && /* other machine */
2617 r != -EPROTONOSUPPORT && /* incompatible feature */
2618 r != -EBUSY && /* unclean shutdown */
2619 r != -ESHUTDOWN /* already archived */)
/* Further conditions checked before the retry: access mode, O_CREAT and the
 * ".journal" suffix. */
2622 if ((flags & O_ACCMODE) == O_RDONLY)
2625 if (!(flags & O_CREAT))
2628 if (!endswith(fname, ".journal"))
2631 /* The file is corrupted. Rotate it away and try it again (but only once) */
/* The new name is built from a prefix of fname, the current realtime clock
 * and one more 64-bit value, all rendered as 16 hex digits. */
2634 if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2636 (unsigned long long) now(CLOCK_REALTIME),
2640 r = rename(fname, p);
2644 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2646 return journal_file_open(fname, flags, mode, compress, seal,
2647 metrics, mmap_cache, template, ret);
/* Copies the entry object `o` (located at offset `p` in `from`) into the
 * journal file `to`.
 *
 * Each data object referenced by the entry is loaded from `from`,
 * decompressed if needed and appended to `to` through
 * journal_file_append_data(). The new entry is then written with
 * journal_file_append_entry_internal(), keeping the source entry's
 * monotonic and realtime timestamps. `seqnum`, `ret` and `offset` are
 * passed through to that call. */
2650 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2652 uint64_t q, xor_hash = 0;
/* The copy keeps the timestamps of the source entry. */
2665 ts.monotonic = le64toh(o->entry.monotonic);
2666 ts.realtime = le64toh(o->entry.realtime);
/* Checks whether the entry is older (monotonic clock) than the tail entry
 * of the destination, if the destination tracks a valid tail timestamp. */
2668 if (to->tail_entry_monotonic_valid &&
2669 ts.monotonic < le64toh(to->header->tail_entry_monotonic))
/* Temporary item array for the new entry, one slot per source item.
 * NOTE(review): alloca() with a size derived from the object's item count
 * is unbounded stack usage - worth confirming an upper limit exists. */
2672 n = journal_file_entry_n_items(o);
2673 items = alloca(sizeof(EntryItem) * n);
2675 for (i = 0; i < n; i++) {
/* Offset and (still little-endian) hash of the i-th referenced data object. */
2682 q = le64toh(o->entry.items[i].object_offset);
2683 le_hash = o->entry.items[i].hash;
/* From here on `o` points at the data object, no longer at the entry. */
2685 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
/* The hash stored in the entry item has to match the data object's hash. */
2689 if (le_hash != o->data.hash)
/* Payload length: object size minus everything in front of the payload. */
2692 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2695 /* We hit the limit on 32bit machines */
2696 if ((uint64_t) t != l)
2699 if (o->object.flags & OBJECT_COMPRESSED) {
/* Decompress into the source file's scratch buffer and copy from there. */
2703 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0))
2706 data = from->compress_buffer;
/* NOTE(review): presumably the branch taken when decompression support is
 * not compiled in - confirm. */
2709 return -EPROTONOSUPPORT;
/* Uncompressed payloads are copied straight from the mapped object. */
2712 data = o->data.payload;
/* Append the payload to the destination: `u` is the resulting data object,
 * `h` its offset in `to`. */
2714 r = journal_file_append_data(to, data, l, &u, &h);
/* Accumulate the XOR of all data hashes and record the new item. */
2718 xor_hash ^= le64toh(u->data.hash);
2719 items[i].object_offset = htole64(h);
2720 items[i].hash = u->data.hash;
/* `o` was repointed to a data object above, so resolve the source entry at
 * `p` again. */
2722 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2727 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
/* Normalises the journal size limits in `m` in place. A value of
 * (uint64_t) -1 means "not configured" and is replaced by a default, partly
 * derived from the size of the file system that `fd` lives on. All values
 * are page-aligned and raised to the JOURNAL_FILE_SIZE_MIN-based floors.
 * The result is logged at debug level. */
2730 void journal_default_metrics(JournalMetrics *m, int fd) {
/* Stays 0 if the file system size cannot be determined. */
2731 uint64_t fs_size = 0;
2733 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
/* File system size in bytes: fragment size times number of blocks. */
2738 if (fstatvfs(fd, &ss) >= 0)
2739 fs_size = ss.f_frsize * ss.f_blocks;
/* max_use: when unset, a tenth of the file system size clamped to
 * [DEFAULT_MAX_USE_LOWER, DEFAULT_MAX_USE_UPPER]; DEFAULT_MAX_USE_LOWER is
 * also used as a fallback value. */
2741 if (m->max_use == (uint64_t) -1) {
2744 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2746 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2747 m->max_use = DEFAULT_MAX_USE_UPPER;
2749 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2750 m->max_use = DEFAULT_MAX_USE_LOWER;
2752 m->max_use = DEFAULT_MAX_USE_LOWER;
/* NOTE(review): the alignment and the two-minimum-sized-files floor below
 * presumably apply to an explicitly configured max_use - confirm. */
2754 m->max_use = PAGE_ALIGN(m->max_use);
2756 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2757 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
/* max_size: when unset, an eighth of max_use, capped at
 * DEFAULT_MAX_SIZE_UPPER. */
2760 if (m->max_size == (uint64_t) -1) {
2761 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2763 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2764 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2766 m->max_size = PAGE_ALIGN(m->max_size);
2768 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2769 m->max_size = JOURNAL_FILE_SIZE_MIN;
/* max_use is raised so that it is at least twice max_size. */
2771 if (m->max_size*2 > m->max_use)
2772 m->max_use = m->max_size*2;
/* min_size: defaults to the minimum journal file size and never drops
 * below it. */
2774 if (m->min_size == (uint64_t) -1)
2775 m->min_size = JOURNAL_FILE_SIZE_MIN;
2777 m->min_size = PAGE_ALIGN(m->min_size);
2779 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2780 m->min_size = JOURNAL_FILE_SIZE_MIN;
/* If min_size ends up above max_size, max_size is raised to match. */
2782 if (m->min_size > m->max_size)
2783 m->max_size = m->min_size;
/* keep_free: when unset, 15% of the file system size capped at
 * DEFAULT_KEEP_FREE_UPPER; DEFAULT_KEEP_FREE is the fallback value. */
2786 if (m->keep_free == (uint64_t) -1) {
2789 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2791 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2792 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2795 m->keep_free = DEFAULT_KEEP_FREE;
2798 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2799 format_bytes(a, sizeof(a), m->max_use),
2800 format_bytes(b, sizeof(b), m->max_size),
2801 format_bytes(c, sizeof(c), m->min_size),
2802 format_bytes(d, sizeof(d), m->keep_free));
/* Reports the realtime range covered by the file: `*from` receives the
 * header's head entry timestamp and `*to` the tail entry timestamp, both
 * converted from little-endian. A stored value of 0 is checked for before
 * each assignment. */
2805 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
/* Comparing the raw field with 0 needs no byte-order conversion. */
2810 if (f->header->head_entry_realtime == 0)
2813 *from = le64toh(f->header->head_entry_realtime);
2817 if (f->header->tail_entry_realtime == 0)
2820 *to = le64toh(f->header->tail_entry_realtime);
/* Reports the monotonic timestamp range that the file covers for the boot
 * identified by `boot_id`: `*from` receives the monotonic timestamp of the
 * first entry in the boot ID data object's entry list, `*to` that of the
 * last one. */
2826 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
/* `o` receives the boot ID data object and `p` its file offset. */
2834 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
/* A data object without any entries is checked for before going on. */
2838 if (le64toh(o->data.n_entries) <= 0)
/* The first entry is referenced directly from the data object; `o` points
 * at that entry afterwards. */
2842 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2846 *from = le64toh(o->entry.monotonic);
/* `o` was reused for the entry, so resolve the data object at `p` again. */
2850 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* Fetch the last entry of the list (index n_entries - 1). */
2854 r = generic_array_get_plus_one(f,
2855 le64toh(o->data.entry_offset),
2856 le64toh(o->data.entry_array_offset),
2857 le64toh(o->data.n_entries)-1,
2862 *to = le64toh(o->entry.monotonic);
2868 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2871 /* If we gained new header fields we gained new features,
2872 * hence suggest a rotation */
2873 if (le64toh(f->header->header_size) < sizeof(Header)) {
2874 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2878 /* Let's check if the hash tables grew over a certain fill
2879 * level (75%, borrowing this value from Java's hash table
2880 * implementation), and if so suggest a rotation. To calculate
2881 * the fill level we need the n_data field, which only exists
2882 * in newer versions. */
2884 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2885 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2886 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2888 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2889 le64toh(f->header->n_data),
2890 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2891 (unsigned long long) f->last_stat.st_size,
2892 f->last_stat.st_size / le64toh(f->header->n_data));
2896 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2897 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2898 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2900 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2901 le64toh(f->header->n_fields),
2902 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2906 /* Are the data objects properly indexed by field objects? */
2907 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2908 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2909 le64toh(f->header->n_data) > 0 &&
2910 le64toh(f->header->n_fields) == 0)
2913 if (max_file_usec > 0) {
2916 h = le64toh(f->header->head_entry_realtime);
2917 t = now(CLOCK_REALTIME);
2919 if (h > 0 && t > h + max_file_usec)