1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
31 #include <attr/xattr.h>
34 #include "journal-def.h"
35 #include "journal-file.h"
36 #include "journal-authenticate.h"
/* Tunables: default hash table sizes, the compression threshold, and the
 * bounds applied when size limits are derived rather than configured. */
41 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
42 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
44 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
46 /* This is the minimum journal file size */
47 #define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
49 /* These are the lower and upper bounds if we deduce the max_use value
50 * from the file system size */
51 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
52 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
54 /* This is the upper bound if we deduce max_size from max_use */
55 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
57 /* This is the upper bound if we deduce the keep_free value from the
59 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
61 /* This is the keep_free value when we can't determine the system
63 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MiB */
65 /* n_data was the first entry we added after the initial file format design */
66 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
68 /* How many entries to keep in the entry array chain cache at max */
69 #define CHAIN_CACHE_MAX 20
/* Switches the journal file to the online state. The visible code requires a
 * valid fd and a header, dispatches on the current header state and stores
 * STATE_ONLINE in the header. */
71 static int journal_file_set_online(JournalFile *f) {
77 if (!(f->fd >= 0 && f->header))
80 switch(f->header->state) {
85 f->header->state = STATE_ONLINE;
/* Switches the journal file to the offline state. Requires a valid fd and a
 * header, tests whether the header state is STATE_ONLINE, and stores
 * STATE_OFFLINE in the header. */
94 int journal_file_set_offline(JournalFile *f) {
100 if (!(f->fd >= 0 && f->header))
103 if (f->header->state != STATE_ONLINE)
108 f->header->state = STATE_OFFLINE;
/* Tears down a JournalFile. Visible steps, in order: append a final tag for
 * sealed writable files, release the fd from the mmap cache, mark the file
 * offline, unmap the header, close the fd, drop the mmap cache reference,
 * free the chain cache and compression buffer, release the FSS file mapping
 * or the FSPRG state, and close the HMAC handle. */
115 void journal_file_close(JournalFile *f) {
119 /* Write the final tag */
120 if (f->seal && f->writable)
121 journal_file_append_tag(f);
124 /* Sync everything to disk, before we mark the file offline */
125 if (f->mmap && f->fd >= 0)
126 mmap_cache_close_fd(f->mmap, f->fd);
128 journal_file_set_offline(f);
131 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
134 close_nointr_nofail(f->fd);
139 mmap_cache_unref(f->mmap);
141 hashmap_free_free(f->chain_cache);
144 free(f->compress_buffer);
149 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
150 else if (f->fsprg_state)
151 free(f->fsprg_state);
156 gcry_md_close(f->hmac);
/* Writes a fresh header for a new journal file. A local Header `h` gets the
 * signature, the 64-bit aligned header size, the compression and sealing
 * flags derived from f->compress / f->seal, and a random file_id. seqnum_id
 * and tail_entry_seqnum are copied from `template` on one path (presumably
 * when template is non-NULL; the condition is not visible here), otherwise
 * seqnum_id is set to file_id. The header is written at offset 0 with
 * pwrite(). */
162 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
170 memcpy(h.signature, HEADER_SIGNATURE, 8);
171 h.header_size = htole64(ALIGN64(sizeof(h)));
173 h.incompatible_flags =
174 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
177 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
179 r = sd_id128_randomize(&h.file_id);
184 h.seqnum_id = template->header->seqnum_id;
185 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
187 h.seqnum_id = h.file_id;
189 k = pwrite(f->fd, &h, sizeof(h), 0);
/* Updates the header of a file opened for writing: stores the machine ID,
 * fetches the current boot ID and, when it equals the one already in the
 * header, marks the tail monotonic timestamp as valid. It then records the
 * boot ID, sets the file online and synchronously flushes the header
 * mapping with msync(). */
199 static int journal_file_refresh_header(JournalFile *f) {
205 r = sd_id128_get_machine(&f->header->machine_id);
209 r = sd_id128_get_boot(&boot_id);
213 if (sd_id128_equal(boot_id, f->header->boot_id))
214 f->tail_entry_monotonic_valid = true;
216 f->header->boot_id = boot_id;
218 journal_file_set_online(f);
220 /* Sync the online state to disk */
221 msync(f->header, PAGE_ALIGN(sizeof(Header)), MS_SYNC);
/* Sanity-checks the header of an opened journal file. Visible checks:
 * signature; unknown incompatible flags (and, per the existing comment,
 * unknown compatible flags when writing) yield -EPROTONOSUPPORT; state below
 * _STATE_MAX; header_size at least HEADER_SIZE_MIN; sealed files must contain
 * n_entry_arrays; header_size + arena_size must fit into the size recorded in
 * last_stat; tail_object_offset must lie within header + arena; the hash
 * table, tail object and entry array offsets must pass VALID64 and must not
 * point into the header. Further down the machine ID is compared and the
 * state is inspected (with debug logging), and f->compress / f->seal are
 * taken from the header flags.
 * NOTE(review): each flag test appears twice (mask test and "!= 0" test);
 * presumably these are build-time alternatives whose preprocessor lines are
 * not visible here -- confirm. */
227 static int journal_file_verify_header(JournalFile *f) {
230 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
233 /* In both read and write mode we refuse to open files with
234 * incompatible flags we don't know */
236 if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
237 return -EPROTONOSUPPORT;
239 if (f->header->incompatible_flags != 0)
240 return -EPROTONOSUPPORT;
243 /* When open for writing we refuse to open files with
244 * compatible flags, too */
247 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
248 return -EPROTONOSUPPORT;
250 if (f->header->compatible_flags != 0)
251 return -EPROTONOSUPPORT;
255 if (f->header->state >= _STATE_MAX)
258 /* The first addition was n_data, so check that we are at least this large */
259 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
262 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
265 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
268 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
271 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
272 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
273 !VALID64(le64toh(f->header->tail_object_offset)) ||
274 !VALID64(le64toh(f->header->entry_array_offset)))
277 if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
278 le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
279 le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
280 le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
285 sd_id128_t machine_id;
288 r = sd_id128_get_machine(&machine_id);
292 if (!sd_id128_equal(machine_id, f->header->machine_id))
295 state = f->header->state;
297 if (state == STATE_ONLINE) {
298 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
300 } else if (state == STATE_ARCHIVED)
302 else if (state != STATE_OFFLINE) {
303 log_debug("Journal file %s has unknown state %u.", f->path, state);
308 f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
310 f->seal = JOURNAL_HEADER_SEALED(f->header);
/* Ensures the file is large enough to hold `size` bytes at `offset`. The new
 * size is PAGE_ALIGN(offset + size), raised to at least the header size, and
 * is compared against the current size, against metrics.max_size (when
 * non-zero) and, above metrics.min_size with keep_free set, against the free
 * space reported by fstatvfs() minus keep_free. The missing range is then
 * allocated with posix_fallocate(), last_stat is refreshed with fstat() and
 * arena_size in the header is updated.
 * NOTE(review): free space is computed from f_bfree, which also counts
 * blocks reserved for privileged users; f_bavail may be the intended field
 * -- confirm. */
315 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
316 uint64_t old_size, new_size;
321 /* We assume that this file is not sparse, and we know that
322 * for sure, since we always call posix_fallocate()
326 le64toh(f->header->header_size) +
327 le64toh(f->header->arena_size);
329 new_size = PAGE_ALIGN(offset + size);
330 if (new_size < le64toh(f->header->header_size))
331 new_size = le64toh(f->header->header_size);
333 if (new_size <= old_size)
336 if (f->metrics.max_size > 0 &&
337 new_size > f->metrics.max_size)
340 if (new_size > f->metrics.min_size &&
341 f->metrics.keep_free > 0) {
344 if (fstatvfs(f->fd, &svfs) >= 0) {
347 available = svfs.f_bfree * svfs.f_bsize;
349 if (available >= f->metrics.keep_free)
350 available -= f->metrics.keep_free;
354 if (new_size - old_size > available)
359 /* Note that the glibc fallocate() fallback is very
360 inefficient, hence we try to minimize the allocation area
362 r = posix_fallocate(f->fd, old_size, new_size - old_size);
366 if (fstat(f->fd, &f->last_stat) < 0)
369 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
/* Maps `size` bytes at `offset` of the file through the mmap cache and
 * returns the pointer via `ret`. The range is first checked against the
 * cached file size in last_stat; if it appears out of range the stat data is
 * refreshed once with fstat() before failing with -EADDRNOTAVAIL.
 * NOTE(review): offset + size is unsigned 64-bit arithmetic and could wrap
 * for very large values taken from a corrupted file -- confirm callers bound
 * these. */
374 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
381 /* Avoid SIGBUS on invalid accesses */
382 if (offset + size > (uint64_t) f->last_stat.st_size) {
383 /* Hmm, out of range? Let's refresh the fstat() data
384 * first, before we trust that check. */
386 if (fstat(f->fd, &f->last_stat) < 0 ||
387 offset + size > (uint64_t) f->last_stat.st_size)
388 return -EADDRNOTAVAIL;
391 return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
/* Returns the minimum valid size of object `o` according to its type, using
 * a static per-type table. Types outside the table, or with no table entry,
 * fall back to sizeof(ObjectHeader). */
394 static uint64_t minimum_header_size(Object *o) {
396 static uint64_t table[] = {
397 [OBJECT_DATA] = sizeof(DataObject),
398 [OBJECT_FIELD] = sizeof(FieldObject),
399 [OBJECT_ENTRY] = sizeof(EntryObject),
400 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
401 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
402 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
403 [OBJECT_TAG] = sizeof(TagObject),
406 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
407 return sizeof(ObjectHeader);
409 return table[o->object.type];
/* Maps the object located at `offset` and validates it. The offset must pass
 * VALID64. The object header is mapped first to read the object size, which
 * is checked against sizeof(ObjectHeader) and the per-type minimum; the
 * object type must be greater than OBJECT_UNUSED and, when `type` > 0, equal
 * to `type`. If the object is larger than its header, the whole object is
 * mapped again using the object type as mapping context. */
412 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
422 /* Objects may only be located at multiple of 64 bit */
423 if (!VALID64(offset))
426 /* One context for each type, plus one catch-all for the rest */
427 context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
429 r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
434 s = le64toh(o->object.size);
436 if (s < sizeof(ObjectHeader))
439 if (o->object.type <= OBJECT_UNUSED)
442 if (s < minimum_header_size(o))
445 if (type > 0 && o->object.type != type)
448 if (s > sizeof(ObjectHeader)) {
449 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
/* Picks the sequence number for the next entry, starting from
 * tail_entry_seqnum + 1. The result is stored as the new tail_entry_seqnum
 * and, if head_entry_seqnum is still zero, as head_entry_seqnum too. */
460 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
465 r = le64toh(f->header->tail_entry_seqnum) + 1;
468 /* If an external seqnum counter was passed, we update
469 * both the local and the external one, and set it to
470 * the maximum of both */
478 f->header->tail_entry_seqnum = htole64(r);
480 if (f->header->head_entry_seqnum == 0)
481 f->header->head_entry_seqnum = htole64(r);
/* Appends a new object of the given type and size at the end of the file.
 * The file is set online, the append position is derived from
 * tail_object_offset (either the header size or the 64-bit aligned end of the
 * current tail object; the selecting condition is not visible here), space is
 * allocated and mapped, the object type and size are filled in, and
 * tail_object_offset and n_objects in the header are updated. */
486 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
493 assert(type > 0 && type < _OBJECT_TYPE_MAX);
494 assert(size >= sizeof(ObjectHeader));
498 r = journal_file_set_online(f);
502 p = le64toh(f->header->tail_object_offset);
504 p = le64toh(f->header->header_size);
506 r = journal_file_move_to_object(f, -1, p, &tail);
510 p += ALIGN64(le64toh(tail->object.size));
513 r = journal_file_allocate(f, p, size);
517 r = journal_file_move_to(f, type, false, p, size, &t);
524 o->object.type = type;
525 o->object.size = htole64(size);
527 f->header->tail_object_offset = htole64(p);
528 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
/* Creates the data hash table object. Its size is derived from
 * metrics.max_size, but never below DEFAULT_DATA_HASH_TABLE_SIZE. The items
 * are zeroed and the offset of the items array and the table size are
 * recorded in the header. */
536 static int journal_file_setup_data_hash_table(JournalFile *f) {
543 /* We estimate that we need 1 hash table entry per 768 of
544 journal file and we want to make sure we never get beyond
545 75% fill level. Calculate the hash table size for the
546 maximum file size based on these metrics. */
548 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
549 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
550 s = DEFAULT_DATA_HASH_TABLE_SIZE;
552 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
554 r = journal_file_append_object(f,
555 OBJECT_DATA_HASH_TABLE,
556 offsetof(Object, hash_table.items) + s,
561 memset(o->hash_table.items, 0, s);
563 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
564 f->header->data_hash_table_size = htole64(s);
/* Creates the field hash table object with the fixed size
 * DEFAULT_FIELD_HASH_TABLE_SIZE, zeroes its items and records the offset of
 * the items array and the table size in the header. */
569 static int journal_file_setup_field_hash_table(JournalFile *f) {
576 /* We use a fixed size hash table for the fields as this
577 * number should grow very slowly only */
579 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
580 r = journal_file_append_object(f,
581 OBJECT_FIELD_HASH_TABLE,
582 offsetof(Object, hash_table.items) + s,
587 memset(o->hash_table.items, 0, s);
589 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
590 f->header->field_hash_table_size = htole64(s);
/* Maps the data hash table described by data_hash_table_offset/size in the
 * header and stores the resulting pointer in f->data_hash_table. */
595 static int journal_file_map_data_hash_table(JournalFile *f) {
602 p = le64toh(f->header->data_hash_table_offset);
603 s = le64toh(f->header->data_hash_table_size);
605 r = journal_file_move_to(f,
606 OBJECT_DATA_HASH_TABLE,
613 f->data_hash_table = t;
/* Maps the field hash table described by field_hash_table_offset/size in the
 * header and stores the resulting pointer in f->field_hash_table. */
617 static int journal_file_map_field_hash_table(JournalFile *f) {
624 p = le64toh(f->header->field_hash_table_offset);
625 s = le64toh(f->header->field_hash_table_size);
627 r = journal_file_move_to(f,
628 OBJECT_FIELD_HASH_TABLE,
635 f->field_hash_table = t;
/* Links a FIELD object into the field hash table. The object must be of
 * type OBJECT_FIELD; its chain pointers are reset, the bucket is chosen as
 * hash modulo the number of HashItems, and the object is either written as
 * the bucket head or chained behind the current tail object (the selecting
 * condition is not visible here). The bucket tail is then updated and
 * n_fields is incremented when the header contains that field. */
639 static int journal_file_link_field(
652 if (o->object.type != OBJECT_FIELD)
655 /* This might alter the window we are looking at */
657 o->field.next_hash_offset = o->field.head_data_offset = 0;
659 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
660 p = le64toh(f->field_hash_table[h].tail_hash_offset);
662 f->field_hash_table[h].head_hash_offset = htole64(offset);
664 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
668 o->field.next_hash_offset = htole64(offset);
671 f->field_hash_table[h].tail_hash_offset = htole64(offset);
673 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
674 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
/* Links a DATA object into the data hash table. The object must be of type
 * OBJECT_DATA; its chain pointers, entry references and n_entries are reset,
 * the bucket is chosen as hash modulo the number of HashItems, and the object
 * is either written as the bucket head or chained behind the current tail
 * object. The bucket tail is then updated and n_data is incremented when the
 * header contains that field. */
679 static int journal_file_link_data(
692 if (o->object.type != OBJECT_DATA)
695 /* This might alter the window we are looking at */
697 o->data.next_hash_offset = o->data.next_field_offset = 0;
698 o->data.entry_offset = o->data.entry_array_offset = 0;
699 o->data.n_entries = 0;
701 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
702 p = le64toh(f->data_hash_table[h].tail_hash_offset);
704 /* Only entry in the hash table is easy */
705 f->data_hash_table[h].head_hash_offset = htole64(offset);
707 /* Move back to the previous data object, to patch in
710 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
714 o->data.next_hash_offset = htole64(offset);
717 f->data_hash_table[h].tail_hash_offset = htole64(offset);
719 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
720 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
/* Looks up the FIELD object whose payload equals `field` (`size` bytes),
 * given its precomputed hash. Walks the chain of the bucket selected by hash
 * modulo the number of HashItems; an object matches when its stored hash, its
 * object size and its payload bytes all agree. A zero-sized hash table is
 * tested for up front. */
725 int journal_file_find_field_object_with_hash(
727 const void *field, uint64_t size, uint64_t hash,
728 Object **ret, uint64_t *offset) {
730 uint64_t p, osize, h;
734 assert(field && size > 0);
736 osize = offsetof(Object, field.payload) + size;
738 if (f->header->field_hash_table_size == 0)
741 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
742 p = le64toh(f->field_hash_table[h].head_hash_offset);
747 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
751 if (le64toh(o->field.hash) == hash &&
752 le64toh(o->object.size) == osize &&
753 memcmp(o->field.payload, field, size) == 0) {
763 p = le64toh(o->field.next_hash_offset);
/* Convenience wrapper: hashes `field` with hash64() and delegates to
 * journal_file_find_field_object_with_hash(). */
769 int journal_file_find_field_object(
771 const void *field, uint64_t size,
772 Object **ret, uint64_t *offset) {
777 assert(field && size > 0);
779 hash = hash64(field, size);
781 return journal_file_find_field_object_with_hash(f,
/* Looks up the DATA object whose payload equals `data` (`size` bytes), given
 * its precomputed hash. Walks the chain of the bucket selected by hash modulo
 * the number of HashItems and tests the stored hash of each object. Objects
 * flagged OBJECT_COMPRESSED are decompressed into f->compress_buffer before
 * the byte comparison; uncompressed ones are compared by object size and
 * payload. A -EPROTONOSUPPORT return is visible on the compressed path
 * (presumably for builds without compression support -- the preprocessor
 * lines are not visible here). */
786 int journal_file_find_data_object_with_hash(
788 const void *data, uint64_t size, uint64_t hash,
789 Object **ret, uint64_t *offset) {
791 uint64_t p, osize, h;
795 assert(data || size == 0);
797 osize = offsetof(Object, data.payload) + size;
799 if (f->header->data_hash_table_size == 0)
802 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
803 p = le64toh(f->data_hash_table[h].head_hash_offset);
808 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
812 if (le64toh(o->data.hash) != hash)
815 if (o->object.flags & OBJECT_COMPRESSED) {
819 l = le64toh(o->object.size);
820 if (l <= offsetof(Object, data.payload))
823 l -= offsetof(Object, data.payload);
825 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0))
829 memcmp(f->compress_buffer, data, size) == 0) {
840 return -EPROTONOSUPPORT;
843 } else if (le64toh(o->object.size) == osize &&
844 memcmp(o->data.payload, data, size) == 0) {
856 p = le64toh(o->data.next_hash_offset);
/* Convenience wrapper: hashes `data` with hash64() and delegates to
 * journal_file_find_data_object_with_hash(). */
862 int journal_file_find_data_object(
864 const void *data, uint64_t size,
865 Object **ret, uint64_t *offset) {
870 assert(data || size == 0);
872 hash = hash64(data, size);
874 return journal_file_find_data_object_with_hash(f,
/* Finds or creates the FIELD object for `field`. The field is hashed and
 * looked up first; the visible creation path appends a new OBJECT_FIELD,
 * stores the hash and payload, links it into the field hash table, re-maps
 * the object (linking may have moved the mapping window) and feeds it to the
 * HMAC. */
879 static int journal_file_append_field(
881 const void *field, uint64_t size,
882 Object **ret, uint64_t *offset) {
890 assert(field && size > 0);
892 hash = hash64(field, size);
894 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
908 osize = offsetof(Object, field.payload) + size;
909 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
913 o->field.hash = htole64(hash);
914 memcpy(o->field.payload, field, size);
916 r = journal_file_link_field(f, o, p, hash);
920 /* The linking might have altered the window, so let's
921 * refresh our pointer */
922 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
927 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
/* Finds or creates the DATA object for `data`. The payload is hashed and
 * looked up first; the visible creation path appends a new OBJECT_DATA and
 * stores the hash. Payloads of at least COMPRESSION_SIZE_THRESHOLD bytes may
 * be compressed in place (object size and OBJECT_COMPRESSED flag are then
 * adjusted); otherwise the bytes are copied. The object is linked into the
 * data hash table and re-mapped. If the payload contains '=' after at least
 * one byte, the part before it is appended as a FIELD object and the data
 * object is pushed onto the head of that field's data list. Finally the
 * object is fed to the HMAC.
 * NOTE(review): head_data_offset is stored via le64toh(p) while other stores
 * in this file use htole64(); htole64() would state the intent -- confirm.
 * NOTE(review): memchr() is reached with data == NULL when size == 0 is
 * permitted by the assert -- confirm this is acceptable. */
941 static int journal_file_append_data(
943 const void *data, uint64_t size,
944 Object **ret, uint64_t *offset) {
950 bool compressed = false;
954 assert(data || size == 0);
956 hash = hash64(data, size);
958 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
972 osize = offsetof(Object, data.payload) + size;
973 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
977 o->data.hash = htole64(hash);
981 size >= COMPRESSION_SIZE_THRESHOLD) {
984 compressed = compress_blob(data, size, o->data.payload, &rsize);
987 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
988 o->object.flags |= OBJECT_COMPRESSED;
990 log_debug("Compressed data object %"PRIu64" -> %"PRIu64, size, rsize);
995 if (!compressed && size > 0)
996 memcpy(o->data.payload, data, size);
998 r = journal_file_link_data(f, o, p, hash);
1002 /* The linking might have altered the window, so let's
1003 * refresh our pointer */
1004 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1008 eq = memchr(data, '=', size);
1009 if (eq && eq > data) {
1013 /* Create field object ... */
1014 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1018 /* ... and link it in. */
1019 o->data.next_field_offset = fo->field.head_data_offset;
1020 fo->field.head_data_offset = le64toh(p);
1024 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
/* Returns the number of EntryItems in an ENTRY object, computed from the
 * object size minus the offset of entry.items. The object type is tested
 * against OBJECT_ENTRY first. */
1038 uint64_t journal_file_entry_n_items(Object *o) {
1041 if (o->object.type != OBJECT_ENTRY)
1044 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
/* Returns the number of 64-bit slots in an ENTRY_ARRAY object, computed from
 * the object size minus the offset of entry_array.items. The object type is
 * tested against OBJECT_ENTRY_ARRAY first. */
1047 uint64_t journal_file_entry_array_n_items(Object *o) {
1050 if (o->object.type != OBJECT_ENTRY_ARRAY)
1053 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
/* Returns the number of HashItems in a data or field hash table object,
 * computed from the object size minus the offset of hash_table.items. The
 * object type is tested against both hash table types first. */
1056 uint64_t journal_file_hash_table_n_items(Object *o) {
1059 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1060 o->object.type != OBJECT_FIELD_HASH_TABLE)
1063 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
/* Stores entry offset `p` at index *idx of the entry array chain starting at
 * *first (both kept little-endian in place). The chain is walked array by
 * array; on one visible path the slot is written into an existing array and
 * *idx is incremented. Otherwise a new ENTRY_ARRAY object with room for n
 * items is appended, fed to the HMAC, given `p`, hooked up either as *first
 * or as the successor of the previous array `ap`, n_entry_arrays is
 * incremented when the header contains it, and *idx is incremented. */
1066 static int link_entry_into_array(JournalFile *f,
1071 uint64_t n = 0, ap = 0, q, i, a, hidx;
1079 a = le64toh(*first);
1080 i = hidx = le64toh(*idx);
1083 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1087 n = journal_file_entry_array_n_items(o);
1089 o->entry_array.items[i] = htole64(p);
1090 *idx = htole64(hidx + 1);
1096 a = le64toh(o->entry_array.next_entry_array_offset);
1107 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1108 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1114 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1119 o->entry_array.items[i] = htole64(p);
1122 *first = htole64(q);
1124 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1128 o->entry_array.next_entry_array_offset = htole64(q);
1131 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1132 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1134 *idx = htole64(hidx + 1);
/* Variant of link_entry_into_array() for lists that keep one entry inline:
 * on one visible path `p` is stored directly in *extra, on the other it is
 * linked into the array chain at index *idx - 1. *idx is incremented
 * afterwards. */
1139 static int link_entry_into_array_plus_one(JournalFile *f,
1154 *extra = htole64(p);
1158 i = htole64(le64toh(*idx) - 1);
1159 r = link_entry_into_array(f, first, &i, p);
1164 *idx = htole64(le64toh(*idx) + 1);
/* Links the entry into the entry list of the DATA object referenced by item
 * `i` of entry `o`: the data object is mapped and its entry_offset /
 * entry_array_offset fields are passed to link_entry_into_array_plus_one(). */
1168 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1175 p = le64toh(o->entry.items[i].object_offset);
1179 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1183 return link_entry_into_array_plus_one(f,
1184 &o->data.entry_offset,
1185 &o->data.entry_array_offset,
/* Makes a freshly written ENTRY object reachable: after a memory barrier it
 * is linked into the global entry array (header entry_array_offset /
 * n_entries), the head/tail realtime and tail monotonic timestamps in the
 * header are updated, the tail monotonic timestamp is marked valid, and each
 * item of the entry is linked into its data object. */
1190 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1198 if (o->object.type != OBJECT_ENTRY)
1201 __sync_synchronize();
1203 /* Link up the entry itself */
1204 r = link_entry_into_array(f,
1205 &f->header->entry_array_offset,
1206 &f->header->n_entries,
1211 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
1213 if (f->header->head_entry_realtime == 0)
1214 f->header->head_entry_realtime = o->entry.realtime;
1216 f->header->tail_entry_realtime = o->entry.realtime;
1217 f->header->tail_entry_monotonic = o->entry.monotonic;
1219 f->tail_entry_monotonic_valid = true;
1221 /* Link up the items */
1222 n = journal_file_entry_n_items(o);
1223 for (i = 0; i < n; i++) {
1224 r = journal_file_link_entry_item(f, o, offset, i);
/* Writes an ENTRY object for already-appended data items: allocates an object
 * sized for n_items EntryItems, assigns the next sequence number, copies the
 * items, stores the timestamps from `ts`, the xor_hash and the boot ID from
 * the header, feeds the object to the HMAC and links it via
 * journal_file_link_entry(). */
1232 static int journal_file_append_entry_internal(
1234 const dual_timestamp *ts,
1236 const EntryItem items[], unsigned n_items,
1238 Object **ret, uint64_t *offset) {
1245 assert(items || n_items == 0);
1248 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1250 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1254 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1255 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1256 o->entry.realtime = htole64(ts->realtime);
1257 o->entry.monotonic = htole64(ts->monotonic);
1258 o->entry.xor_hash = htole64(xor_hash);
1259 o->entry.boot_id = f->header->boot_id;
1262 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1267 r = journal_file_link_entry(f, o, np);
/* Signals watchers that the file changed: issues a memory barrier and then
 * truncates the file to the size recorded in last_stat, logging an error if
 * ftruncate() fails. */
1280 void journal_file_post_change(JournalFile *f) {
1283 /* inotify() does not receive IN_MODIFY events from file
1284 * accesses done via mmap(). After each access we hence
1285 * trigger IN_MODIFY by truncating the journal file to its
1286 * current size which triggers IN_MODIFY. */
1288 __sync_synchronize();
1290 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1291 log_error("Failed to truncate file to its own size: %m");
/* Comparator for sorting EntryItems by their (little-endian stored)
 * object_offset. */
1294 static int entry_item_cmp(const void *_a, const void *_b) {
1295 const EntryItem *a = _a, *b = _b;
1297 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1299 if (le64toh(a->object_offset) > le64toh(b->object_offset))
/* Appends one log entry made of `n_iovec` fields. A timestamp is taken with
 * dual_timestamp_get() on one path (presumably when `ts` is NULL); the
 * monotonic time is compared against the tail entry when that is known to be
 * valid; a tag may be appended for sealing. Each iovec is stored as a data
 * object, its hash is folded into xor_hash and recorded in a stack-allocated
 * items array, which is sorted by on-disk offset before the entry object is
 * written. journal_file_post_change() is called afterwards. */
1304 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1308 uint64_t xor_hash = 0;
1309 struct dual_timestamp _ts;
1312 assert(iovec || n_iovec == 0);
1315 dual_timestamp_get(&_ts);
1319 if (f->tail_entry_monotonic_valid &&
1320 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1324 r = journal_file_maybe_append_tag(f, ts->realtime);
1329 /* alloca() can't take 0, hence let's allocate at least one */
1330 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
1332 for (i = 0; i < n_iovec; i++) {
1336 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1340 xor_hash ^= le64toh(o->data.hash);
1341 items[i].object_offset = htole64(p);
1342 items[i].hash = o->data.hash;
1345 /* Order by the position on disk, in order to improve seek
1346 * times for rotating media. */
1347 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1349 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1351 journal_file_post_change(f);
/* One cached position inside an entry array chain, so that later lookups in
 * the same chain can skip ahead instead of walking from the first array. */
1356 typedef struct ChainCacheItem {
1357 uint64_t first; /* the array at the beginning of the chain */
1358 uint64_t array; /* the cached array */
1359 uint64_t begin; /* the first item in the cached array */
1360 uint64_t total; /* the total number of items in all arrays before this one in the chain */
/* Records a chain position in the chain cache hashmap `h`, keyed by the
 * address of the item's `first` member. When the map already holds
 * CHAIN_CACHE_MAX items an existing item is stolen for reuse, otherwise a new
 * ChainCacheItem is allocated; insertion failure is handled on a visible
 * branch. */
1363 static void chain_cache_put(
1372 /* If the chain item to cache for this chain is the
1373 * first one it's not worth caching anything */
1377 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1378 ci = hashmap_steal_first(h);
1380 ci = new(ChainCacheItem, 1);
1387 if (hashmap_put(h, &ci->first, ci) < 0) {
1392 assert(ci->first == first);
/* Fetches the entry at index `i` of the entry array chain starting at
 * `first`. The chain cache is consulted first and used when the wanted index
 * lies beyond the cached total. The chain is then walked array by array
 * until the index falls inside one; that position is cached and the
 * referenced ENTRY object is mapped. */
1399 static int generic_array_get(JournalFile *f,
1402 Object **ret, uint64_t *offset) {
1405 uint64_t p = 0, a, t = 0;
1413 /* Try the chain cache first */
1414 ci = hashmap_get(f->chain_cache, &first);
1415 if (ci && i > ci->total) {
1424 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1428 k = journal_file_entry_array_n_items(o);
1430 p = le64toh(o->entry_array.items[i]);
1436 a = le64toh(o->entry_array.next_entry_array_offset);
1442 /* Let's cache this item for the next invocation */
1443 chain_cache_put(f->chain_cache, ci, first, a, o->entry_array.items[0], t);
1445 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Like generic_array_get(), for lists that keep one entry inline in `extra`:
 * one visible path maps the entry at `extra` directly, the other delegates to
 * generic_array_get() with index i-1. */
1458 static int generic_array_get_plus_one(JournalFile *f,
1462 Object **ret, uint64_t *offset) {
1471 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1484 return generic_array_get(f, first, i-1, ret, offset);
/* Bisects the entry array chain starting at `first` for `needle`, using the
 * caller-supplied test_object() predicate, which reports TEST_FOUND,
 * TEST_LEFT or TEST_RIGHT for an entry offset. TEST_FOUND is mapped to
 * TEST_RIGHT for DIRECTION_DOWN and to TEST_LEFT for DIRECTION_UP so the
 * search keeps narrowing. The chain cache may be used to skip ahead to a
 * previously visited array. For each array the last candidate item is tested
 * before bisecting inside it; for DIRECTION_UP the result may be the item
 * preceding the bisection point (subtract_one). The final position is cached,
 * the selected ENTRY object is mapped, and its index within the whole chain
 * is reported through `idx`. */
1493 static int generic_array_bisect(JournalFile *f,
1497 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1498 direction_t direction,
1503 uint64_t a, p, t = 0, i = 0, last_p = 0;
1504 bool subtract_one = false;
1505 Object *o, *array = NULL;
1510 assert(test_object);
1512 /* Start with the first array in the chain */
1515 ci = hashmap_get(f->chain_cache, &first);
1516 if (ci && n > ci->total) {
1517 /* Ah, we have iterated this bisection array chain
1518 * previously! Let's see if we can skip ahead in the
1519 * chain, as far as the last time. But we can't jump
1520 * backwards in the chain, so let's check that
1523 r = test_object(f, ci->begin, needle);
1527 if (r == TEST_LEFT) {
1528 /* OK, what we are looking for is right of the
1529 * begin of this EntryArray, so let's jump
1530 * straight to previously cached array in the
1540 uint64_t left, right, k, lp;
1542 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1546 k = journal_file_entry_array_n_items(array);
1552 lp = p = le64toh(array->entry_array.items[i]);
1556 r = test_object(f, p, needle);
1560 if (r == TEST_FOUND)
1561 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1563 if (r == TEST_RIGHT) {
1567 if (left == right) {
1568 if (direction == DIRECTION_UP)
1569 subtract_one = true;
1575 assert(left < right);
1577 i = (left + right) / 2;
1578 p = le64toh(array->entry_array.items[i]);
1582 r = test_object(f, p, needle);
1586 if (r == TEST_FOUND)
1587 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1589 if (r == TEST_RIGHT)
1597 if (direction == DIRECTION_UP) {
1599 subtract_one = true;
1610 a = le64toh(array->entry_array.next_entry_array_offset);
1616 if (subtract_one && t == 0 && i == 0)
1619 /* Let's cache this item for the next invocation */
1620 chain_cache_put(f->chain_cache, ci, first, a, array->entry_array.items[0], t);
1622 if (subtract_one && i == 0)
1624 else if (subtract_one)
1625 p = le64toh(array->entry_array.items[i-1]);
1627 p = le64toh(array->entry_array.items[i]);
1629 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1640 *idx = t + i + (subtract_one ? -1 : 0);
/* Bisection for lists that keep one entry inline in `extra` ahead of the
 * array chain. `extra` is tested first with test_object(); TEST_FOUND is
 * mapped by direction as in generic_array_bisect(). The remaining n-1 entries
 * are then bisected in the array chain. For DIRECTION_UP the inline entry is
 * remembered (step_back) and used when the array search finds nothing, in
 * which case the entry at `extra` is mapped as the result. */
1645 static int generic_array_bisect_plus_one(JournalFile *f,
1650 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1651 direction_t direction,
1657 bool step_back = false;
1661 assert(test_object);
1666 /* This bisects the array in object 'first', but first checks
1668 r = test_object(f, extra, needle);
1672 if (r == TEST_FOUND)
1673 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1675 /* if we are looking with DIRECTION_UP then we need to first
1676 see if in the actual array there is a matching entry, and
1677 return the last one of that. But if there isn't any we need
1678 to return this one. Hence remember this, and return it
1681 step_back = direction == DIRECTION_UP;
1683 if (r == TEST_RIGHT) {
1684 if (direction == DIRECTION_DOWN)
1690 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1692 if (r == 0 && step_back)
1701 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
/* Bisection predicate that compares the entry offset `p` itself against
 * `needle`. */
1717 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1723 else if (p < needle)
/* Seeks in the global entry array (header entry_array_offset / n_entries)
 * via generic_array_bisect(). */
1729 int journal_file_move_to_entry_by_offset(
1732 direction_t direction,
1736 return generic_array_bisect(f,
1737 le64toh(f->header->entry_array_offset),
1738 le64toh(f->header->n_entries),
/* Bisection predicate: maps the ENTRY object at `p` and compares its sequence
 * number against `needle`. */
1746 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1753 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1757 if (le64toh(o->entry.seqnum) == needle)
1759 else if (le64toh(o->entry.seqnum) < needle)
/* Seeks in the global entry array (header entry_array_offset / n_entries)
 * via generic_array_bisect(). */
1765 int journal_file_move_to_entry_by_seqnum(
1768 direction_t direction,
1772 return generic_array_bisect(f,
1773 le64toh(f->header->entry_array_offset),
1774 le64toh(f->header->n_entries),
/* Bisection predicate: maps the ENTRY object at `p` and compares its realtime
 * timestamp against `needle`. */
1781 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1788 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1792 if (le64toh(o->entry.realtime) == needle)
1794 else if (le64toh(o->entry.realtime) < needle)
/* Seeks in the global entry array by realtime timestamp, bisecting with
 * test_object_realtime(). */
1800 int journal_file_move_to_entry_by_realtime(
1803 direction_t direction,
1807 return generic_array_bisect(f,
1808 le64toh(f->header->entry_array_offset),
1809 le64toh(f->header->n_entries),
1811 test_object_realtime,
/* Bisection predicate: maps the ENTRY object at `p` and compares its
 * monotonic timestamp against `needle`. */
1816 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1823 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1827 if (le64toh(o->entry.monotonic) == needle)
1829 else if (le64toh(o->entry.monotonic) < needle)
/* Looks up the data object for a boot ID. The match string is built in a
 * stack buffer sized for the "_BOOT_ID=" prefix, a 32-character ID and a
 * terminating NUL, with the ID formatted right after the 9-byte prefix. */
1835 static inline int find_data_object_by_boot_id(
1840 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1842 sd_id128_to_string(boot_id, t + 9);
1843 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
/* Seeks by monotonic timestamp within one boot: finds the data object for the
 * boot ID and bisects its entry list (inline entry plus array chain) with
 * test_object_monotonic(). */
1846 int journal_file_move_to_entry_by_monotonic(
1850 direction_t direction,
1859 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
1865 return generic_array_bisect_plus_one(f,
1866 le64toh(o->data.entry_offset),
1867 le64toh(o->data.entry_array_offset),
1868 le64toh(o->data.n_entries),
1870 test_object_monotonic,
/* Steps to the neighbouring entry in the global entry array. On one visible
 * path the index starts at 0 for DIRECTION_DOWN or n-1 for DIRECTION_UP
 * (presumably when no current entry `o` is given); otherwise the current
 * entry, which must be an ENTRY object, is located by bisection and the index
 * is adjusted by direction. The resulting index is fetched with
 * generic_array_get(). */
1875 int journal_file_next_entry(
1877 Object *o, uint64_t p,
1878 direction_t direction,
1879 Object **ret, uint64_t *offset) {
1885 assert(p > 0 || !o);
1887 n = le64toh(f->header->n_entries);
1892 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1894 if (o->object.type != OBJECT_ENTRY)
1897 r = generic_array_bisect(f,
1898 le64toh(f->header->entry_array_offset),
1899 le64toh(f->header->n_entries),
1908 if (direction == DIRECTION_DOWN) {
1921 /* And jump to it */
1922 return generic_array_get(f,
1923 le64toh(f->header->entry_array_offset),
/* Moves `skip` entries away from the current entry in the global entry array.
 * The current entry, which must be an ENTRY object, is located by bisection;
 * the new index is computed by subtracting -skip or adding skip, with a
 * visible check for stepping back further than the current index; the result
 * is fetched with generic_array_get(). */
1928 int journal_file_skip_entry(
1930 Object *o, uint64_t p,
1932 Object **ret, uint64_t *offset) {
1941 if (o->object.type != OBJECT_ENTRY)
1944 r = generic_array_bisect(f,
1945 le64toh(f->header->entry_array_offset),
1946 le64toh(f->header->n_entries),
1955 /* Calculate new index */
1957 if ((uint64_t) -skip >= i)
1960 i = i - (uint64_t) -skip;
1962 i += (uint64_t) skip;
1964 n = le64toh(f->header->n_entries);
1971 return generic_array_get(f,
1972 le64toh(f->header->entry_array_offset),
/* Steps to the neighbouring entry within the entry list of the DATA object at
 * `data_offset`. Mirrors journal_file_next_entry(), but uses the data
 * object's n_entries, inline entry_offset and entry_array_offset, and the
 * plus-one variants of bisect and get. */
1977 int journal_file_next_entry_for_data(
1979 Object *o, uint64_t p,
1980 uint64_t data_offset,
1981 direction_t direction,
1982 Object **ret, uint64_t *offset) {
1989 assert(p > 0 || !o);
1991 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1995 n = le64toh(d->data.n_entries);
2000 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2002 if (o->object.type != OBJECT_ENTRY)
2005 r = generic_array_bisect_plus_one(f,
2006 le64toh(d->data.entry_offset),
2007 le64toh(d->data.entry_array_offset),
2008 le64toh(d->data.n_entries),
2018 if (direction == DIRECTION_DOWN) {
2032 return generic_array_get_plus_one(f,
2033 le64toh(d->data.entry_offset),
2034 le64toh(d->data.entry_array_offset),
/* Seeks within the entry list of the DATA object at `data_offset`: maps the
 * data object and bisects its inline entry plus array chain. */
2039 int journal_file_move_to_entry_by_offset_for_data(
2041 uint64_t data_offset,
2043 direction_t direction,
2044 Object **ret, uint64_t *offset) {
2051 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2055 return generic_array_bisect_plus_one(f,
2056 le64toh(d->data.entry_offset),
2057 le64toh(d->data.entry_array_offset),
2058 le64toh(d->data.n_entries),
/* Seek, by monotonic timestamp, to an entry that is referenced both by the
 * data object of a given boot ID and by the data object at data_offset.
 *
 * Two entry lists are involved:
 *   - the list of the boot-ID data object (found via
 *     find_data_object_by_boot_id(), its offset kept in b), searched with
 *     test_object_monotonic;
 *   - the list of the data object at data_offset.
 * After the initial time-based seek, the two lists are bisected alternately
 * until an entry present in both is found (see the comment further down).
 *
 * NOTE(review): the loop construct, the termination conditions and the
 * return statements are not visible in this listing. */
2065 int journal_file_move_to_entry_by_monotonic_for_data(
2067 uint64_t data_offset,
2070 direction_t direction,
2071 Object **ret, uint64_t *offset) {
2079 /* First, seek by time */
2080 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
2086 r = generic_array_bisect_plus_one(f,
2087 le64toh(o->data.entry_offset),
2088 le64toh(o->data.entry_array_offset),
2089 le64toh(o->data.n_entries),
2091 test_object_monotonic,
2097 /* And now, continue seeking until we find an entry that
2098 * exists in both bisection arrays */
/* Bisect the entry list of the requested data object ... */
2104 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2108 r = generic_array_bisect_plus_one(f,
2109 le64toh(d->data.entry_offset),
2110 le64toh(d->data.entry_array_offset),
2111 le64toh(d->data.n_entries),
/* ... then re-load the boot-ID data object from its saved offset b and
 * bisect its entry list again. */
2119 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2123 r = generic_array_bisect_plus_one(f,
2124 le64toh(o->data.entry_offset),
2125 le64toh(o->data.entry_array_offset),
2126 le64toh(o->data.n_entries),
/* Bisect the entry list of the data object at data_offset and return the
 * result of generic_array_bisect_plus_one().
 *
 * NOTE(review): the needle and comparison callback passed to the bisection
 * are not visible in this listing; judging by the function name the search
 * key is an entry sequence number - confirm. */
2150 int journal_file_move_to_entry_by_seqnum_for_data(
2152 uint64_t data_offset,
2154 direction_t direction,
2155 Object **ret, uint64_t *offset) {
/* Load the data object that owns the entry list. */
2162 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2166 return generic_array_bisect_plus_one(f,
2167 le64toh(d->data.entry_offset),
2168 le64toh(d->data.entry_array_offset),
2169 le64toh(d->data.n_entries),
/* Bisect the entry list of the data object at data_offset using the
 * test_object_realtime comparison callback, and return the result of
 * generic_array_bisect_plus_one().
 *
 * NOTE(review): the needle argument and the remaining parameters of the
 * bisection call are not visible in this listing. */
2176 int journal_file_move_to_entry_by_realtime_for_data(
2178 uint64_t data_offset,
2180 direction_t direction,
2181 Object **ret, uint64_t *offset) {
/* Load the data object that owns the entry list. */
2188 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2192 return generic_array_bisect_plus_one(f,
2193 le64toh(d->data.entry_offset),
2194 le64toh(d->data.entry_array_offset),
2195 le64toh(d->data.n_entries),
2197 test_object_realtime,
/* Print the header of journal file f followed by one line per object,
 * walking the objects sequentially from the end of the header.
 *
 * For every object the type is printed (entries and tags additionally show
 * their sequence number and timestamps/epoch), plus a "Flags: COMPRESSED"
 * line when OBJECT_COMPRESSED is set. Output goes to stdout via printf().
 *
 * NOTE(review): the loop construct, the case labels of some branches and
 * the jump to the error message are not visible in this listing. */
2202 void journal_file_dump(JournalFile *f) {
2209 journal_file_print_header(f);
/* The first object is looked up at offset header_size. */
2211 p = le64toh(f->header->header_size);
/* Type -1: no specific object type is requested for this lookup. */
2213 r = journal_file_move_to_object(f, -1, p, &o);
2217 switch (o->object.type) {
2220 printf("Type: OBJECT_UNUSED\n");
2224 printf("Type: OBJECT_DATA\n");
2228 printf("Type: OBJECT_FIELD\n");
2232 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2233 le64toh(o->entry.seqnum),
2234 le64toh(o->entry.monotonic),
2235 le64toh(o->entry.realtime));
2238 case OBJECT_FIELD_HASH_TABLE:
2239 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2242 case OBJECT_DATA_HASH_TABLE:
2243 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2246 case OBJECT_ENTRY_ARRAY:
2247 printf("Type: OBJECT_ENTRY_ARRAY\n");
2251 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2252 le64toh(o->tag.seqnum),
2253 le64toh(o->tag.epoch));
2257 printf("Type: unknown (%u)\n", o->object.type);
2261 if (o->object.flags & OBJECT_COMPRESSED)
2262 printf("Flags: COMPRESSED\n");
/* The object at tail_object_offset is the last one to be visited. */
2264 if (p == le64toh(f->header->tail_object_offset))
/* Advance by the object's size rounded up with ALIGN64(). */
2267 p = p + ALIGN64(le64toh(o->object.size));
2272 log_error("File corrupt");
/* Format timestamp t into buf (of size l) using format_timestamp().
 *
 * NOTE(review): the handling of the result x is not visible in this
 * listing; presumably a fallback string is returned when formatting
 * fails - confirm. */
2275 static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2278 x = format_timestamp(buf, l, t);
/* Print a human-readable summary of the header of journal file f to stdout.
 *
 * All multi-byte header fields are converted from little endian with
 * le32toh()/le64toh() before printing. Header fields that were added to the
 * format later (n_data, n_fields, n_tags, n_entry_arrays) are only printed
 * if JOURNAL_HEADER_CONTAINS() reports them as present. Finally the on-disk
 * usage is printed if fstat() on the file descriptor succeeds.
 *
 * NOTE(review): a few lines of the format string and argument list are not
 * visible in this listing. */
2284 void journal_file_print_header(JournalFile *f) {
/* 33 bytes: presumably 32 hex digits of a 128-bit ID plus the NUL. */
2285 char a[33], b[33], c[33], d[33];
2286 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
2288 char bytes[FORMAT_BYTES_MAX];
2292 printf("File Path: %s\n"
2296 "Sequential Number ID: %s\n"
2298 "Compatible Flags:%s%s\n"
2299 "Incompatible Flags:%s%s\n"
2300 "Header size: %"PRIu64"\n"
2301 "Arena size: %"PRIu64"\n"
2302 "Data Hash Table Size: %"PRIu64"\n"
2303 "Field Hash Table Size: %"PRIu64"\n"
2304 "Rotate Suggested: %s\n"
2305 "Head Sequential Number: %"PRIu64"\n"
2306 "Tail Sequential Number: %"PRIu64"\n"
2307 "Head Realtime Timestamp: %s\n"
2308 "Tail Realtime Timestamp: %s\n"
2309 "Tail Monotonic Timestamp: %s\n"
2310 "Objects: %"PRIu64"\n"
2311 "Entry Objects: %"PRIu64"\n",
2313 sd_id128_to_string(f->header->file_id, a),
2314 sd_id128_to_string(f->header->machine_id, b),
2315 sd_id128_to_string(f->header->boot_id, c),
2316 sd_id128_to_string(f->header->seqnum_id, d),
2317 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2318 f->header->state == STATE_ONLINE ? "ONLINE" :
2319 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2320 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2321 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2322 JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2323 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2324 le64toh(f->header->header_size),
2325 le64toh(f->header->arena_size),
2326 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2327 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2328 yes_no(journal_file_rotate_suggested(f, 0)),
2329 le64toh(f->header->head_entry_seqnum),
2330 le64toh(f->header->tail_entry_seqnum),
2331 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2332 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2333 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
2334 le64toh(f->header->n_objects),
2335 le64toh(f->header->n_entries));
/* Fill level = number of objects / number of hash table slots, in percent.
 * NOTE(review): the divisor is the table size in items; if a header reports
 * a table size smaller than one HashItem the floating-point division is by
 * zero - confirm this cannot happen for files that reach this point. */
2337 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2338 printf("Data Objects: %"PRIu64"\n"
2339 "Data Hash Table Fill: %.1f%%\n",
2340 le64toh(f->header->n_data),
2341 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2343 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2344 printf("Field Objects: %"PRIu64"\n"
2345 "Field Hash Table Fill: %.1f%%\n",
2346 le64toh(f->header->n_fields),
2347 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2349 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2350 printf("Tag Objects: %"PRIu64"\n",
2351 le64toh(f->header->n_tags));
2352 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2353 printf("Entry Array Objects: %"PRIu64"\n",
2354 le64toh(f->header->n_entry_arrays));
/* Disk usage is derived from the allocated block count (st_blocks * 512),
 * not from the apparent file size. */
2356 if (fstat(f->fd, &st) >= 0)
2357 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
/* Open (and, for an empty writable file, initialise) a journal file.
 *
 * metrics:    size limits for the file; when given they are normalised with
 *             journal_default_metrics() and copied into the file object.
 * mmap_cache: mmap cache to share (a reference is taken); a new cache is
 *             created by the alternative branch.
 * template:   existing journal file whose metrics are reused when no
 *             metrics are passed, and which is handed to
 *             journal_file_init_header() for newly created files.
 * ret:        presumably receives the new JournalFile on success.
 *
 * Visible sequence: validate access mode and file name suffix, allocate the
 * JournalFile, open and stat the file, initialise the header if the file is
 * empty and writable, map the header, verify it for pre-existing files,
 * set up metrics/HMAC, create the hash tables for new files, and map both
 * hash tables.
 *
 * NOTE(review): this listing omits the leading parameters, most error
 * checks, the conditions guarding several calls and all return
 * statements, so error codes are not described here. */
2360 int journal_file_open(
2366 JournalMetrics *metrics,
2367 MMapCache *mmap_cache,
2368 JournalFile *template,
2369 JournalFile **ret) {
2373 bool newly_created = false;
/* Only the access modes O_RDONLY and O_RDWR pass this check. */
2378 if ((flags & O_ACCMODE) != O_RDONLY &&
2379 (flags & O_ACCMODE) != O_RDWR)
/* The file name has to end in ".journal" or ".journal~". */
2382 if (!endswith(fname, ".journal") &&
2383 !endswith(fname, ".journal~"))
2386 f = new0(JournalFile, 1);
2394 f->prot = prot_from_flags(flags);
2395 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2397 f->compress = compress;
/* Either share the caller's mmap cache or create a private one. */
2404 f->mmap = mmap_cache_ref(mmap_cache);
2406 f->mmap = mmap_cache_new();
2413 f->path = strdup(fname);
2419 f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2420 if (!f->chain_cache) {
2425 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2431 if (fstat(f->fd, &f->last_stat) < 0) {
/* A zero-sized writable file is treated as a new journal that still needs
 * its header and hash tables. */
2436 if (f->last_stat.st_size == 0 && f->writable) {
2440 /* Let's attach the creation time to the journal file,
2441 * so that the vacuuming code knows the age of this
2442 * file even if the file might end up corrupted one
2443 * day... Ideally we'd just use the creation time many
2444 * file systems maintain for each file, but there is
2445 * currently no usable API to query this, hence let's
2446 * emulate this via extended attributes. If extended
2447 * attributes are not supported we'll just skip this,
2448 * and rely solely on mtime/atime/ctime of the file.*/
/* The creation time is stored little endian; the result of fsetxattr() is
 * deliberately not checked (see the comment above). */
2450 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2451 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2455 /* Try to load the FSPRG state, and if we can't, then
2456 * just don't do sealing */
2458 r = journal_file_fss_load(f);
2464 r = journal_file_init_header(f, template);
/* Re-stat after the header has been initialised. */
2468 if (fstat(f->fd, &f->last_stat) < 0) {
2473 newly_created = true;
/* Files smaller than the minimal header size are not accepted as-is. */
2476 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
/* Map the header, rounded up to whole pages, shared with the file. */
2481 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2482 if (f->header == MAP_FAILED) {
/* Pre-existing files get their header verified ... */
2488 if (!newly_created) {
2489 r = journal_file_verify_header(f);
/* ... and, if writable, their FSPRG state loaded. */
2495 if (!newly_created && f->writable) {
2496 r = journal_file_fss_load(f);
/* Metrics: fill in defaults for the caller's metrics and copy them, or
 * fall back to the template's metrics. */
2504 journal_default_metrics(metrics, f->fd);
2505 f->metrics = *metrics;
2506 } else if (template)
2507 f->metrics = template->metrics;
2509 r = journal_file_refresh_header(f);
2515 r = journal_file_hmac_setup(f);
/* New files need their hash tables created before they can be mapped. */
2520 if (newly_created) {
2521 r = journal_file_setup_field_hash_table(f);
2525 r = journal_file_setup_data_hash_table(f);
2530 r = journal_file_append_first_tag(f);
2536 r = journal_file_map_field_hash_table(f);
2540 r = journal_file_map_data_hash_table(f);
/* Presumably the failure path: release everything acquired so far. */
2548 journal_file_close(f);
/* Rotate a writable journal file: rename the current file to an archive
 * name, mark it archived, and open a fresh file at the original path.
 *
 * f:        in/out pointer to the journal file being rotated.
 * compress: compression setting passed on to the new file.
 * seal:     sealing setting passed on to the new file.
 *
 * The archive name is the original path with its ".journal" suffix
 * replaced by "@<seqnum_id>-<head seqnum>-<head realtime>.journal".
 *
 * NOTE(review): error checks, the assignment of old_file and the final
 * update of *f / return value are not visible in this listing. */
2553 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2554 _cleanup_free_ char *p = NULL;
2556 JournalFile *old_file, *new_file = NULL;
/* Read-only files are not rotated. */
2564 if (!old_file->writable)
/* Only files named "*.journal" are rotated. */
2567 if (!endswith(old_file->path, ".journal"))
2570 l = strlen(old_file->path);
/* "l - 8" drops the 8 characters of ".journal" from the path. */
2571 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2572 (int) l - 8, old_file->path,
2573 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2574 le64toh((*f)->header->head_entry_seqnum),
2575 le64toh((*f)->header->head_entry_realtime));
2579 r = rename(old_file->path, p);
2583 old_file->header->state = STATE_ARCHIVED;
/* The new file reuses the old path, flags, mode and mmap cache and takes
 * the old file as template; the old file is closed afterwards. */
2585 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2586 journal_file_close(old_file);
/* Open a journal file like journal_file_open(), but if that fails with one
 * of a fixed set of errors (see the list below), rename the damaged file
 * out of the way and try once more.
 *
 * The parameters are passed unchanged to journal_file_open(), both for the
 * first attempt and for the retry.
 *
 * The retry is only attempted for writable opens that include O_CREAT and
 * whose file name ends in ".journal". The damaged file is renamed to
 * "<name without .journal>@<hex value>-<hex value>.journal~".
 *
 * NOTE(review): the statements executed when one of the checks fails, and
 * some arguments of the asprintf() call, are not visible in this listing. */
2592 int journal_file_open_reliably(
2598 JournalMetrics *metrics,
2599 MMapCache *mmap_cache,
2600 JournalFile *template,
2601 JournalFile **ret) {
2605 _cleanup_free_ char *p = NULL;
2607 r = journal_file_open(fname, flags, mode, compress, seal,
2608 metrics, mmap_cache, template, ret);
/* Any result other than the errors listed here is not handled by the
 * rename-and-retry logic below. */
2609 if (r != -EBADMSG && /* corrupted */
2610 r != -ENODATA && /* truncated */
2611 r != -EHOSTDOWN && /* other machine */
2612 r != -EPROTONOSUPPORT && /* incompatible feature */
2613 r != -EBUSY && /* unclean shutdown */
2614 r != -ESHUTDOWN /* already archived */)
/* Read-only opens cannot replace the file. */
2617 if ((flags & O_ACCMODE) == O_RDONLY)
/* Without O_CREAT the retry could not create a replacement. */
2620 if (!(flags & O_CREAT))
2623 if (!endswith(fname, ".journal"))
2626 /* The file is corrupted. Rotate it away and try it again (but only once) */
2629 if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2631 (unsigned long long) now(CLOCK_REALTIME),
2635 r = rename(fname, p);
2639 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
2641 return journal_file_open(fname, flags, mode, compress, seal,
2642 metrics, mmap_cache, template, ret);
/* Copy one entry, including all data objects it references, from journal
 * file "from" into journal file "to".
 *
 * from:    source file; o is an entry object in it, located at offset p.
 * to:      destination file the data and the new entry are appended to.
 * seqnum, ret, offset: passed through to
 *          journal_file_append_entry_internal().
 *
 * For every item of the entry, the referenced data object is loaded from
 * the source, its hash is compared with the hash stored in the item, the
 * payload is (if flagged OBJECT_COMPRESSED) decompressed, and the payload
 * is appended to the destination. The XOR of the appended data hashes and
 * the list of new items are then used to append the entry itself.
 *
 * NOTE(review): error checks and some conditional-compilation lines are
 * not visible in this listing, so return codes are only partly shown. */
2645 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2647 uint64_t q, xor_hash = 0;
/* The copied entry keeps the timestamps of the original. */
2660 ts.monotonic = le64toh(o->entry.monotonic);
2661 ts.realtime = le64toh(o->entry.realtime);
/* The entry's monotonic time is compared against the destination's tail
 * (only when that tail timestamp is flagged valid). */
2663 if (to->tail_entry_monotonic_valid &&
2664 ts.monotonic < le64toh(to->header->tail_entry_monotonic))
/* NOTE(review): the item array is allocated on the stack with a size taken
 * from the entry object in the file; no upper bound is visible here -
 * confirm that n is limited before this point. */
2667 n = journal_file_entry_n_items(o);
2668 items = alloca(sizeof(EntryItem) * n);
2670 for (i = 0; i < n; i++) {
/* Remember offset and hash of the item before o is repointed below. */
2677 q = le64toh(o->entry.items[i].object_offset);
2678 le_hash = o->entry.items[i].hash;
/* From here on o refers to the data object, not to the entry. */
2680 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
/* Both hashes are little endian, so they are compared without conversion. */
2684 if (le_hash != o->data.hash)
/* Payload length = object size minus the fixed part of a data object. */
2687 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2690 /* We hit the limit on 32bit machines */
2691 if ((uint64_t) t != l)
2694 if (o->object.flags & OBJECT_COMPRESSED) {
/* Decompress into the source file's reusable buffer. */
2698 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0))
2701 data = from->compress_buffer;
/* Presumably the branch used when decompression support is compiled out. */
2704 return -EPROTONOSUPPORT;
2707 data = o->data.payload;
2709 r = journal_file_append_data(to, data, l, &u, &h);
/* Accumulate the XOR of all data hashes and record the new item. */
2713 xor_hash ^= le64toh(u->data.hash);
2714 items[i].object_offset = htole64(h);
2715 items[i].hash = u->data.hash;
/* Point o back at the entry object at offset p. */
2717 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2722 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
/* Fill in and sanitise the size limits in m, using the file system that
 * holds file descriptor fd to derive defaults.
 *
 * m:  metrics to fix up in place. A field set to (uint64_t) -1 means "not
 *     configured" and is replaced by a derived default; configured values
 *     are page-aligned and clamped.
 * fd: file descriptor passed to fstatvfs() to learn the file system size.
 *
 * Rules visible below:
 *   max_use   default: 10% of the file system, clamped to
 *             [DEFAULT_MAX_USE_LOWER, DEFAULT_MAX_USE_UPPER]; a configured
 *             value is raised to at least 2 * JOURNAL_FILE_SIZE_MIN.
 *   max_size  default: max_use / 8, capped at DEFAULT_MAX_SIZE_UPPER; at
 *             least JOURNAL_FILE_SIZE_MIN; max_use is raised so that it
 *             holds at least two files of max_size.
 *   min_size  default and lower bound: JOURNAL_FILE_SIZE_MIN; max_size is
 *             raised to min_size if it is smaller.
 *   keep_free default: 15% of the file system, capped at
 *             DEFAULT_KEEP_FREE_UPPER, or DEFAULT_KEEP_FREE.
 * The resulting values are logged at debug level.
 *
 * NOTE(review): the if/else lines selecting between several of these
 * branches (e.g. on fs_size) are not visible in this listing. */
2725 void journal_default_metrics(JournalMetrics *m, int fd) {
2726 uint64_t fs_size = 0;
2728 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
/* fs_size stays 0 when fstatvfs() fails.
 * NOTE(review): the product is evaluated in the types of f_frsize and
 * f_blocks before being stored in the 64-bit fs_size; if both are 32 bits
 * wide on some target it can wrap - confirm the build always uses 64-bit
 * block counts. */
2733 if (fstatvfs(fd, &ss) >= 0)
2734 fs_size = ss.f_frsize * ss.f_blocks;
2736 if (m->max_use == (uint64_t) -1) {
2739 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2741 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2742 m->max_use = DEFAULT_MAX_USE_UPPER;
2744 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2745 m->max_use = DEFAULT_MAX_USE_LOWER;
2747 m->max_use = DEFAULT_MAX_USE_LOWER;
/* Explicitly configured max_use: align and enforce room for two files. */
2749 m->max_use = PAGE_ALIGN(m->max_use);
2751 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2752 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2755 if (m->max_size == (uint64_t) -1) {
2756 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2758 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2759 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2761 m->max_size = PAGE_ALIGN(m->max_size);
2763 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2764 m->max_size = JOURNAL_FILE_SIZE_MIN;
/* max_use must be able to hold at least two files of maximum size. */
2766 if (m->max_size*2 > m->max_use)
2767 m->max_use = m->max_size*2;
2769 if (m->min_size == (uint64_t) -1)
2770 m->min_size = JOURNAL_FILE_SIZE_MIN;
2772 m->min_size = PAGE_ALIGN(m->min_size);
2774 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2775 m->min_size = JOURNAL_FILE_SIZE_MIN;
/* Keep the invariant min_size <= max_size by raising max_size. */
2777 if (m->min_size > m->max_size)
2778 m->max_size = m->min_size;
2781 if (m->keep_free == (uint64_t) -1) {
2784 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
2786 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2787 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2790 m->keep_free = DEFAULT_KEEP_FREE;
2793 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2794 format_bytes(a, sizeof(a), m->max_use),
2795 format_bytes(b, sizeof(b), m->max_size),
2796 format_bytes(c, sizeof(c), m->min_size),
2797 format_bytes(d, sizeof(d), m->keep_free));
/* Report the realtime timestamps of the first and last entry of journal
 * file f, as recorded in its header.
 *
 * from: receives head_entry_realtime (converted from little endian).
 * to:   receives tail_entry_realtime (converted from little endian).
 *
 * A header timestamp of 0 is handled separately before each assignment.
 *
 * NOTE(review): the statements taken for a zero timestamp, any NULL checks
 * on from/to and the return values are not visible in this listing. */
2800 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2805 if (f->header->head_entry_realtime == 0)
2808 *from = le64toh(f->header->head_entry_realtime);
2812 if (f->header->tail_entry_realtime == 0)
2815 *to = le64toh(f->header->tail_entry_realtime);
/* Report the monotonic timestamps of the first and last entry that journal
 * file f holds for the boot identified by boot_id.
 *
 * boot_id: boot whose data object is looked up with
 *          find_data_object_by_boot_id().
 * from:    receives the monotonic timestamp of the first entry referenced
 *          by that data object (the one at data.entry_offset).
 * to:      receives the monotonic timestamp of the entry at index
 *          n_entries - 1 of the data object's entry list.
 *
 * NOTE(review): error checks, any NULL checks on from/to and the return
 * values are not visible in this listing. */
2821 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
/* o is the boot-ID data object, p its file offset. */
2829 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
/* A data object without entries is handled separately. */
2833 if (le64toh(o->data.n_entries) <= 0)
/* o is repointed to the first entry of the data object ... */
2837 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2841 *from = le64toh(o->entry.monotonic);
/* ... so the data object is loaded again from its saved offset p. */
2845 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* Fetch the last entry of the data object's entry list. */
2849 r = generic_array_get_plus_one(f,
2850 le64toh(o->data.entry_offset),
2851 le64toh(o->data.entry_array_offset),
2852 le64toh(o->data.n_entries)-1,
2857 *to = le64toh(o->entry.monotonic);
2863 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2866 /* If we gained new header fields we gained new features,
2867 * hence suggest a rotation */
2868 if (le64toh(f->header->header_size) < sizeof(Header)) {
2869 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2873 /* Let's check if the hash tables grew over a certain fill
2874 * level (75%, borrowing this value from Java's hash table
2875 * implementation), and if so suggest a rotation. To calculate
2876 * the fill level we need the n_data field, which only exists
2877 * in newer versions. */
2879 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2880 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2881 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
2883 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2884 le64toh(f->header->n_data),
2885 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2886 (unsigned long long) f->last_stat.st_size,
2887 f->last_stat.st_size / le64toh(f->header->n_data));
2891 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2892 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2893 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
2895 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2896 le64toh(f->header->n_fields),
2897 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
2901 /* Are the data objects properly indexed by field objects? */
2902 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2903 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2904 le64toh(f->header->n_data) > 0 &&
2905 le64toh(f->header->n_fields) == 0)
2908 if (max_file_usec > 0) {
2911 h = le64toh(f->header->head_entry_realtime);
2912 t = now(CLOCK_REALTIME);
2914 if (h > 0 && t > h + max_file_usec)