1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
31 #include <attr/xattr.h>
34 #include "journal-def.h"
35 #include "journal-file.h"
36 #include "journal-authenticate.h"
/* Default hash table sizes, expressed in bytes as a number of HashItem slots.
 * The data table size is used as a lower bound when the table is sized from
 * max_size; the field table always uses its default. */
41 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
42 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
/* Data payloads of at least this many bytes are candidates for compression */
44 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
46 /* This is the minimum journal file size */
47 #define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL) /* 64 KiB */
49 /* These are the lower and upper bounds if we deduce the max_use value
50 * from the file system size */
51 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
52 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
54 /* This is the upper bound if we deduce max_size from max_use */
55 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
57 /* This is the upper bound if we deduce the keep_free value from the file system size */
59 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
61 /* This is the keep_free value when we can't determine the file system size */
63 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
65 /* n_data was the first entry we added after the initial file format design */
66 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
68 /* How many entries to keep in the entry array chain cache at max */
69 #define CHAIN_CACHE_MAX 20
/* Shuts down a journal file object. The visible steps are, in order: append a
 * final tag if the file is sealed and writable, hand the fd to
 * mmap_cache_close_fd(), flip an online header to offline, unmap the header,
 * close the fd, and release the mmap cache reference, the chain cache, the
 * compression buffer, the FSS/FSPRG state and the HMAC handle. */
71 void journal_file_close(JournalFile *f) {
75 /* Write the final tag */
76 if (f->seal && f->writable)
77 journal_file_append_tag(f);
80 /* Sync everything to disk, before we mark the file offline */
81 if (f->mmap && f->fd >= 0)
82 mmap_cache_close_fd(f->mmap, f->fd);
84 if (f->writable && f->fd >= 0)
88 /* Mark the file offline. Don't override the archived state if it already is set */
89 if (f->writable && f->header->state == STATE_ONLINE)
90 f->header->state = STATE_OFFLINE;
/* The header mapping covers sizeof(Header) rounded up to a whole page */
92 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
96 close_nointr_nofail(f->fd);
101 mmap_cache_unref(f->mmap);
/* hashmap_free_free() is used for the chain cache; presumably it frees the
 * cached items as well as the map itself -- confirm against hashmap.h */
103 hashmap_free_free(f->chain_cache);
106 free(f->compress_buffer);
/* Sealing state is either a mapped file (munmap) or a heap buffer (free) */
111 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
112 else if (f->fsprg_state)
113 free(f->fsprg_state);
118 gcry_md_close(f->hmac);
/* Builds a fresh Header in the local 'h' and writes it to file offset 0 with
 * pwrite(). The header carries the signature, the 64-bit aligned header size,
 * the compression and sealing flag bits derived from f->compress / f->seal,
 * and a randomly generated file ID. The sequence number ID and the tail
 * sequence number are copied from 'template' (presumably only when a template
 * is passed -- confirm); otherwise the file ID doubles as the seqnum ID. */
124 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
132 memcpy(h.signature, HEADER_SIGNATURE, 8);
133 h.header_size = htole64(ALIGN64(sizeof(h)));
135 h.incompatible_flags =
136 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
139 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
141 r = sd_id128_randomize(&h.file_id);
/* Continue the template's sequence number space */
146 h.seqnum_id = template->header->seqnum_id;
147 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
/* Start a sequence number space of our own */
149 h.seqnum_id = h.file_id;
151 k = pwrite(f->fd, &h, sizeof(h), 0);
/* Updates the per-open header fields: stores the local machine ID, compares
 * the current boot ID with the one in the header, records the current boot ID,
 * sets the state to STATE_ONLINE and synchronously flushes the header page. */
161 static int journal_file_refresh_header(JournalFile *f) {
167 r = sd_id128_get_machine(&f->header->machine_id);
171 r = sd_id128_get_boot(&boot_id);
/* Same boot as the last writer: the stored tail monotonic timestamp is
 * still comparable with new ones */
175 if (sd_id128_equal(boot_id, f->header->boot_id))
176 f->tail_entry_monotonic_valid = true;
178 f->header->boot_id = boot_id;
180 f->header->state = STATE_ONLINE;
182 /* Sync the online state to disk */
183 msync(f->header, PAGE_ALIGN(sizeof(Header)), MS_SYNC);
/* Sanity-checks the mapped header before the file is used: signature, known
 * flag bits, state range, minimum header size, consistency of header/arena
 * size with the file size, and validity of the stored object offsets. The
 * machine ID and state are examined as well (presumably only when opening for
 * writing -- confirm). Finally f->compress and f->seal are set from the header
 * flags. Unknown flags yield -EPROTONOSUPPORT. */
189 static int journal_file_verify_header(JournalFile *f) {
192 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
195 /* In both read and write mode we refuse to open files with
196 * incompatible flags we don't know */
198 if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
199 return -EPROTONOSUPPORT;
201 if (f->header->incompatible_flags != 0)
202 return -EPROTONOSUPPORT;
205 /* When open for writing we refuse to open files with
206 * compatible flags, too */
209 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
210 return -EPROTONOSUPPORT;
212 if (f->header->compatible_flags != 0)
213 return -EPROTONOSUPPORT;
217 if (f->header->state >= _STATE_MAX)
220 /* The first addition was n_data, so check that we are at least this large */
221 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
/* A sealed file must have a header large enough to hold n_entry_arrays */
224 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
/* Header plus arena must fit into the file as last seen by fstat() */
227 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
/* The tail object must lie inside header plus arena */
230 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
/* All stored offsets must pass VALID64() ... */
233 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
234 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
235 !VALID64(le64toh(f->header->tail_object_offset)) ||
236 !VALID64(le64toh(f->header->entry_array_offset)))
/* ... and must not point into the header itself */
239 if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
240 le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
241 le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
242 le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
247 sd_id128_t machine_id;
250 r = sd_id128_get_machine(&machine_id);
/* The file must have been written by this machine */
254 if (!sd_id128_equal(machine_id, f->header->machine_id))
257 state = f->header->state;
259 if (state == STATE_ONLINE) {
260 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
262 } else if (state == STATE_ARCHIVED)
264 else if (state != STATE_OFFLINE) {
265 log_debug("Journal file %s has unknown state %u.", f->path, state);
/* Remember the feature flags of the file in the in-memory object */
270 f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
272 f->seal = JOURNAL_HEADER_SEALED(f->header);
/* Makes sure the file is large enough to hold 'size' bytes at 'offset'. The
 * required size is page-aligned and never smaller than the header; nothing
 * needs to happen if it does not exceed the current header+arena size.
 * Otherwise the new size is checked against metrics.max_size and, using
 * fstatvfs(), against the free space remaining after the keep_free reserve,
 * before the gap is allocated with posix_fallocate(). Afterwards the cached
 * stat data is refreshed and header->arena_size is updated.
 * NOTE(review): 'offset + size' and 'f_bfree * f_bsize' are computed without
 * an overflow check -- confirm the operand widths on 32-bit builds. */
277 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
278 uint64_t old_size, new_size;
283 /* We assume that this file is not sparse, and we know that
284 * for sure, since we always call posix_fallocate()
288 le64toh(f->header->header_size) +
289 le64toh(f->header->arena_size);
291 new_size = PAGE_ALIGN(offset + size);
292 if (new_size < le64toh(f->header->header_size))
293 new_size = le64toh(f->header->header_size);
295 if (new_size <= old_size)
298 if (f->metrics.max_size > 0 &&
299 new_size > f->metrics.max_size)
302 if (new_size > f->metrics.min_size &&
303 f->metrics.keep_free > 0) {
306 if (fstatvfs(f->fd, &svfs) >= 0) {
309 available = svfs.f_bfree * svfs.f_bsize;
311 if (available >= f->metrics.keep_free)
312 available -= f->metrics.keep_free;
316 if (new_size - old_size > available)
321 /* Note that the glibc fallocate() fallback is very
322 inefficient, hence we try to minimize the allocation area
324 r = posix_fallocate(f->fd, old_size, new_size - old_size);
328 if (fstat(f->fd, &f->last_stat) < 0)
331 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
/* Obtains a mapping of 'size' bytes at file 'offset' from the mmap cache,
 * forwarding 'context', 'keep_always' and 'ret' to mmap_cache_get(). A range
 * that ends beyond the cached file size triggers one fstat() refresh; if it is
 * still out of range -EADDRNOTAVAIL is returned instead of mapping it.
 * NOTE(review): 'offset + size' may wrap for very large arguments -- confirm
 * that callers bound both values. */
336 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
343 /* Avoid SIGBUS on invalid accesses */
344 if (offset + size > (uint64_t) f->last_stat.st_size) {
345 /* Hmm, out of range? Let's refresh the fstat() data
346 * first, before we trust that check. */
348 if (fstat(f->fd, &f->last_stat) < 0 ||
349 offset + size > (uint64_t) f->last_stat.st_size)
350 return -EADDRNOTAVAIL;
353 return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
/* Returns the smallest size an object of o's type may have: the size of the
 * matching per-type structure, or sizeof(ObjectHeader) for types the table
 * does not cover. */
356 static uint64_t minimum_header_size(Object *o) {
/* Indexed by object type; types without an initializer read as zero */
358 static uint64_t table[] = {
359 [OBJECT_DATA] = sizeof(DataObject),
360 [OBJECT_FIELD] = sizeof(FieldObject),
361 [OBJECT_ENTRY] = sizeof(EntryObject),
362 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
363 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
364 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
365 [OBJECT_TAG] = sizeof(TagObject),
/* Out-of-table or unlisted types fall back to the bare object header */
368 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
369 return sizeof(ObjectHeader);
371 return table[o->object.type];
/* Maps the object stored at 'offset' and validates it. First only the object
 * header is mapped; the stored size must cover at least the header and the
 * per-type minimum, the type must be above OBJECT_UNUSED and, when 'type' is
 * positive, must equal it. Objects larger than the bare header are then mapped
 * again in full, using the object's own type as the mapping context. */
374 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
384 /* Objects may only be located at multiple of 64 bit */
385 if (!VALID64(offset))
388 /* One context for each type, plus one catch-all for the rest */
389 context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
391 r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
396 s = le64toh(o->object.size);
398 if (s < sizeof(ObjectHeader))
401 if (o->object.type <= OBJECT_UNUSED)
404 if (s < minimum_header_size(o))
/* A non-positive 'type' accepts any object type */
407 if (type > 0 && o->object.type != type)
410 if (s > sizeof(ObjectHeader)) {
411 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
/* Picks the sequence number for the next entry: one past the header's tail
 * sequence number, optionally reconciled with the caller's external counter
 * '*seqnum'. The result is stored as the new tail sequence number and also as
 * the head sequence number if none was recorded yet. */
422 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
427 r = le64toh(f->header->tail_entry_seqnum) + 1;
430 /* If an external seqnum counter was passed, we update
431 * both the local and the external one, and set it to
432 * the maximum of both */
440 f->header->tail_entry_seqnum = htole64(r);
/* Zero means that no head sequence number has been recorded so far */
442 if (f->header->head_entry_seqnum == 0)
443 f->header->head_entry_seqnum = htole64(r);
/* Appends a new object of the given 'type' and 'size' at the end of the file.
 * The position is either the header size or the 64-bit aligned end of the
 * current tail object (which one is chosen presumably depends on whether a
 * tail object exists -- confirm). Space is allocated, the object is mapped,
 * its type and size are filled in, and the header's tail_object_offset and
 * n_objects are updated. */
448 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
455 assert(type > 0 && type < _OBJECT_TYPE_MAX);
456 assert(size >= sizeof(ObjectHeader));
460 p = le64toh(f->header->tail_object_offset);
462 p = le64toh(f->header->header_size);
/* Any object type is accepted for the current tail object */
464 r = journal_file_move_to_object(f, -1, p, &tail);
468 p += ALIGN64(le64toh(tail->object.size));
471 r = journal_file_allocate(f, p, size);
475 r = journal_file_move_to(f, type, false, p, size, &t);
482 o->object.type = type;
483 o->object.size = htole64(size);
485 f->header->tail_object_offset = htole64(p);
486 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
/* Creates the data hash table object. Its size is derived from
 * metrics.max_size, but never drops below DEFAULT_DATA_HASH_TABLE_SIZE. The
 * items are zeroed and the header records the offset of the item array (not of
 * the object header) together with the table size in bytes. */
494 static int journal_file_setup_data_hash_table(JournalFile *f) {
501 /* We estimate that we need 1 hash table entry per 768 of
502 journal file and we want to make sure we never get beyond
503 75% fill level. Calculate the hash table size for the
504 maximum file size based on these metrics. */
506 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
507 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
508 s = DEFAULT_DATA_HASH_TABLE_SIZE;
510 log_debug("Reserving %llu entries in hash table.", (unsigned long long) (s / sizeof(HashItem)));
512 r = journal_file_append_object(f,
513 OBJECT_DATA_HASH_TABLE,
514 offsetof(Object, hash_table.items) + s,
519 memset(o->hash_table.items, 0, s);
521 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
522 f->header->data_hash_table_size = htole64(s);
/* Creates the field hash table object with the fixed default size, zeroes its
 * items and records the offset of the item array and the table size in the
 * header. */
527 static int journal_file_setup_field_hash_table(JournalFile *f) {
534 /* We use a fixed size hash table for the fields as this
535 * number should grow very slowly only */
537 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
538 r = journal_file_append_object(f,
539 OBJECT_FIELD_HASH_TABLE,
540 offsetof(Object, hash_table.items) + s,
545 memset(o->hash_table.items, 0, s);
547 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
548 f->header->field_hash_table_size = htole64(s);
/* Maps the data hash table, using the offset and size stored in the header,
 * and remembers the resulting pointer in f->data_hash_table. */
553 static int journal_file_map_data_hash_table(JournalFile *f) {
560 p = le64toh(f->header->data_hash_table_offset);
561 s = le64toh(f->header->data_hash_table_size);
563 r = journal_file_move_to(f,
564 OBJECT_DATA_HASH_TABLE,
571 f->data_hash_table = t;
/* Maps the field hash table, using the offset and size stored in the header,
 * and remembers the resulting pointer in f->field_hash_table. */
575 static int journal_file_map_field_hash_table(JournalFile *f) {
582 p = le64toh(f->header->field_hash_table_offset);
583 s = le64toh(f->header->field_hash_table_size);
585 r = journal_file_move_to(f,
586 OBJECT_FIELD_HASH_TABLE,
593 f->field_hash_table = t;
/* Hooks a freshly written field object at 'offset' into the field hash table.
 * The object's chain pointers are reset, the bucket is chosen as
 * hash modulo the number of HashItems, and the object becomes either the
 * bucket's head or the successor of the bucket's previous tail; in both cases
 * it becomes the new tail. n_fields is bumped when the header has that
 * member. */
597 static int journal_file_link_field(
610 if (o->object.type != OBJECT_FIELD)
613 /* This might alter the window we are looking at */
615 o->field.next_hash_offset = o->field.head_data_offset = 0;
617 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
618 p = le64toh(f->field_hash_table[h].tail_hash_offset);
/* Bucket head is set here (presumably only when the bucket was empty -- confirm) */
620 f->field_hash_table[h].head_hash_offset = htole64(offset);
/* Otherwise chain the new object behind the previous tail object */
622 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
626 o->field.next_hash_offset = htole64(offset);
629 f->field_hash_table[h].tail_hash_offset = htole64(offset);
631 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
632 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
/* Hooks a freshly written data object at 'offset' into the data hash table.
 * The object's chain pointers and entry bookkeeping are reset, the bucket is
 * chosen as hash modulo the number of HashItems, and the object becomes either
 * the bucket's head or the successor of the bucket's previous tail; in both
 * cases it becomes the new tail. n_data is bumped when the header has that
 * member. */
637 static int journal_file_link_data(
650 if (o->object.type != OBJECT_DATA)
653 /* This might alter the window we are looking at */
655 o->data.next_hash_offset = o->data.next_field_offset = 0;
656 o->data.entry_offset = o->data.entry_array_offset = 0;
657 o->data.n_entries = 0;
659 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
660 p = le64toh(f->data_hash_table[h].tail_hash_offset);
662 /* Only entry in the hash table is easy */
663 f->data_hash_table[h].head_hash_offset = htole64(offset);
665 /* Move back to the previous data object, to patch in
668 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
672 o->data.next_hash_offset = htole64(offset);
675 f->data_hash_table[h].tail_hash_offset = htole64(offset);
677 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
678 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
/* Looks up a field object by name, given its precomputed hash. The bucket
 * chain selected by hash modulo the number of HashItems is walked via
 * next_hash_offset; an object matches when its stored hash, its total size and
 * its payload bytes all agree with the request. An empty (size 0) hash table
 * is handled before the modulo so that no division by zero can occur. */
683 int journal_file_find_field_object_with_hash(
685 const void *field, uint64_t size, uint64_t hash,
686 Object **ret, uint64_t *offset) {
688 uint64_t p, osize, h;
692 assert(field && size > 0);
/* Size a matching object must have: fixed part plus the field name */
694 osize = offsetof(Object, field.payload) + size;
696 if (f->header->field_hash_table_size == 0)
699 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
700 p = le64toh(f->field_hash_table[h].head_hash_offset);
705 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
709 if (le64toh(o->field.hash) == hash &&
710 le64toh(o->object.size) == osize &&
711 memcmp(o->field.payload, field, size) == 0) {
/* No match: follow the bucket chain */
721 p = le64toh(o->field.next_hash_offset);
/* Convenience wrapper: hashes the field name with hash64() and delegates to
 * journal_file_find_field_object_with_hash(). */
727 int journal_file_find_field_object(
729 const void *field, uint64_t size,
730 Object **ret, uint64_t *offset) {
735 assert(field && size > 0);
737 hash = hash64(field, size);
739 return journal_file_find_field_object_with_hash(f,
/* Looks up a data object by payload, given its precomputed hash. The bucket
 * chain selected by hash modulo the number of HashItems is walked via
 * next_hash_offset. Objects whose stored hash differs are not compared
 * further. Compressed objects are uncompressed into f->compress_buffer and
 * that buffer is compared; uncompressed objects are compared by total size and
 * payload bytes. -EPROTONOSUPPORT is returned on one path of the compressed
 * case (presumably when compression support is not built in -- confirm). */
744 int journal_file_find_data_object_with_hash(
746 const void *data, uint64_t size, uint64_t hash,
747 Object **ret, uint64_t *offset) {
749 uint64_t p, osize, h;
753 assert(data || size == 0);
/* Size an uncompressed matching object must have */
755 osize = offsetof(Object, data.payload) + size;
757 if (f->header->data_hash_table_size == 0)
760 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
761 p = le64toh(f->data_hash_table[h].head_hash_offset);
766 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
770 if (le64toh(o->data.hash) != hash)
773 if (o->object.flags & OBJECT_COMPRESSED) {
/* l becomes the length of the compressed payload; it must not be empty */
777 l = le64toh(o->object.size);
778 if (l <= offsetof(Object, data.payload))
781 l -= offsetof(Object, data.payload);
783 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0))
787 memcmp(f->compress_buffer, data, size) == 0) {
798 return -EPROTONOSUPPORT;
801 } else if (le64toh(o->object.size) == osize &&
802 memcmp(o->data.payload, data, size) == 0) {
/* No match: follow the bucket chain */
814 p = le64toh(o->data.next_hash_offset);
/* Convenience wrapper: hashes the payload with hash64() and delegates to
 * journal_file_find_data_object_with_hash(). */
820 int journal_file_find_data_object(
822 const void *data, uint64_t size,
823 Object **ret, uint64_t *offset) {
828 assert(data || size == 0);
830 hash = hash64(data, size);
832 return journal_file_find_data_object_with_hash(f,
/* Returns the field object for the given field name, creating it if needed.
 * An existing object is looked up by hash first; otherwise a new OBJECT_FIELD
 * is appended, filled with the hash and the name, linked into the field hash
 * table, re-mapped (linking may have moved the mapping window) and fed into
 * the HMAC. */
837 static int journal_file_append_field(
839 const void *field, uint64_t size,
840 Object **ret, uint64_t *offset) {
848 assert(field && size > 0);
850 hash = hash64(field, size);
852 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
866 osize = offsetof(Object, field.payload) + size;
867 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
869 o->field.hash = htole64(hash);
870 memcpy(o->field.payload, field, size);
872 r = journal_file_link_field(f, o, p, hash);
876 /* The linking might have altered the window, so let's
877 * refresh our pointer */
878 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
883 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
/* Returns the data object for the given payload, creating it if needed. An
 * existing object is looked up by hash first; otherwise a new OBJECT_DATA is
 * appended and filled with the hash and the payload. Payloads of at least
 * COMPRESSION_SIZE_THRESHOLD bytes are candidates for compression; on success
 * the object size is shrunk to the compressed length and OBJECT_COMPRESSED is
 * set. The object is then linked into the data hash table and re-mapped. If
 * the payload has the form "FIELD=value" with a non-empty field name, the
 * matching field object is obtained or created and the data object is pushed
 * onto the head of that field's data list. Finally the object is fed into the
 * HMAC. */
897 static int journal_file_append_data(
899 const void *data, uint64_t size,
900 Object **ret, uint64_t *offset) {
906 bool compressed = false;
910 assert(data || size == 0);
912 hash = hash64(data, size);
914 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
928 osize = offsetof(Object, data.payload) + size;
929 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
933 o->data.hash = htole64(hash);
937 size >= COMPRESSION_SIZE_THRESHOLD) {
940 compressed = compress_blob(data, size, o->data.payload, &rsize);
/* The object was allocated for the uncompressed size; record the smaller one */
943 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
944 o->object.flags |= OBJECT_COMPRESSED;
946 log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
951 if (!compressed && size > 0)
952 memcpy(o->data.payload, data, size);
954 r = journal_file_link_data(f, o, p, hash);
958 /* The linking might have altered the window, so let's
959 * refresh our pointer */
960 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* A field name exists only if '=' is present and is not the first byte */
964 eq = memchr(data, '=', size);
965 if (eq && eq > data) {
969 /* Create field object ... */
970 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
974 /* ... and link it in. */
975 o->data.next_field_offset = fo->field.head_data_offset;
/* p is a host-order offset being stored into an on-disk little-endian
 * member, hence htole64() like every other store in this file */
976 fo->field.head_data_offset = htole64(p);
980 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
/* Number of EntryItems stored in an entry object, derived from the object
 * size minus the fixed part. Objects of another type are handled separately
 * before the computation. */
994 uint64_t journal_file_entry_n_items(Object *o) {
997 if (o->object.type != OBJECT_ENTRY)
1000 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
/* Number of 64-bit offsets stored in an entry array object, derived from the
 * object size minus the fixed part. Objects of another type are handled
 * separately before the computation. */
1003 uint64_t journal_file_entry_array_n_items(Object *o) {
1006 if (o->object.type != OBJECT_ENTRY_ARRAY)
1009 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
/* Number of HashItems stored in a data or field hash table object, derived
 * from the object size minus the fixed part. Objects of another type are
 * handled separately before the computation. */
1012 uint64_t journal_file_hash_table_n_items(Object *o) {
1015 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1016 o->object.type != OBJECT_FIELD_HASH_TABLE)
1019 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
/* Stores entry offset 'p' at position *idx of the entry array chain that
 * starts at *first. The chain is walked array by array; if the position falls
 * into an existing array the slot is written and *idx is incremented.
 * Otherwise a new entry array object is appended, fed into the HMAC, given the
 * entry, and linked in either as the chain head (*first) or as the successor
 * of the last array visited ('ap'). n_entry_arrays is bumped when the header
 * has that member. '*first' and '*idx' are kept in little-endian order. */
1022 static int link_entry_into_array(JournalFile *f,
1027 uint64_t n = 0, ap = 0, q, i, a, hidx;
1035 a = le64toh(*first);
/* i is consumed while walking the chain, hidx keeps the original index */
1036 i = hidx = le64toh(*idx);
1039 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1043 n = journal_file_entry_array_n_items(o);
1045 o->entry_array.items[i] = htole64(p);
1046 *idx = htole64(hidx + 1);
1052 a = le64toh(o->entry_array.next_entry_array_offset);
/* The chain has no room left: append a new array of n items */
1063 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1064 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1070 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1075 o->entry_array.items[i] = htole64(p);
1078 *first = htole64(q);
1080 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1084 o->entry_array.next_entry_array_offset = htole64(q);
1087 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1088 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1090 *idx = htole64(hidx + 1);
/* Variant of link_entry_into_array() for lists that keep one entry inline in
 * '*extra' ahead of the array chain. The entry is stored either in '*extra' or
 * in the chain at index *idx - 1 (presumably '*extra' is used when *idx is
 * zero -- confirm); afterwards *idx is incremented. */
1095 static int link_entry_into_array_plus_one(JournalFile *f,
1110 *extra = htole64(p);
/* The chain index is one less than the list index because of '*extra' */
1114 i = htole64(le64toh(*idx) - 1);
1115 r = link_entry_into_array(f, first, &i, p);
1120 *idx = htole64(le64toh(*idx) + 1);
/* Links an entry into the entry list of the data object referenced by the
 * entry's i-th item. Note that 'o' is re-pointed from the entry to that data
 * object by journal_file_move_to_object(). */
1124 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1131 p = le64toh(o->entry.items[i].object_offset);
1135 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1139 return link_entry_into_array_plus_one(f,
1140 &o->data.entry_offset,
1141 &o->data.entry_array_offset,
/* Makes a written entry reachable: it is added to the global entry array
 * chain rooted in the header, the header's head/tail realtime and tail
 * monotonic timestamps are updated from the entry, and the entry is then
 * linked into each data object it references. A full memory barrier is issued
 * before any link is written. */
1146 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1154 if (o->object.type != OBJECT_ENTRY)
1157 __sync_synchronize();
1159 /* Link up the entry itself */
1160 r = link_entry_into_array(f,
1161 &f->header->entry_array_offset,
1162 &f->header->n_entries,
1167 /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
/* Entry timestamps are already little-endian, so they are copied verbatim */
1169 if (f->header->head_entry_realtime == 0)
1170 f->header->head_entry_realtime = o->entry.realtime;
1172 f->header->tail_entry_realtime = o->entry.realtime;
1173 f->header->tail_entry_monotonic = o->entry.monotonic;
1175 f->tail_entry_monotonic_valid = true;
1177 /* Link up the items */
1178 n = journal_file_entry_n_items(o);
1179 for (i = 0; i < n; i++) {
1180 r = journal_file_link_entry_item(f, o, offset, i);
/* Writes one entry object from already prepared items. The object is appended
 * with room for n_items EntryItems, then filled with a fresh sequence number,
 * the items, both timestamps from 'ts', the XOR hash and the header's boot ID.
 * It is fed into the HMAC and finally linked via journal_file_link_entry(). */
1188 static int journal_file_append_entry_internal(
1190 const dual_timestamp *ts,
1192 const EntryItem items[], unsigned n_items,
1194 Object **ret, uint64_t *offset) {
1201 assert(items || n_items == 0);
1204 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1206 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1210 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1211 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1212 o->entry.realtime = htole64(ts->realtime);
1213 o->entry.monotonic = htole64(ts->monotonic);
1214 o->entry.xor_hash = htole64(xor_hash);
1215 o->entry.boot_id = f->header->boot_id;
1218 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1223 r = journal_file_link_entry(f, o, np);
/* Signals a modification of the file to watchers by truncating it to its
 * current (cached) size, after a full memory barrier. A failing ftruncate()
 * is logged but not reported to the caller. */
1236 void journal_file_post_change(JournalFile *f) {
1239 /* inotify() does not receive IN_MODIFY events from file
1240 * accesses done via mmap(). After each access we hence
1241 * trigger IN_MODIFY by truncating the journal file to its
1242 * current size which triggers IN_MODIFY. */
1244 __sync_synchronize();
1246 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1247 log_error("Failed to truncate file to its own size: %m");
/* qsort() comparator ordering EntryItems by their object offset, compared in
 * host byte order. */
1250 static int entry_item_cmp(const void *_a, const void *_b) {
1251 const EntryItem *a = _a, *b = _b;
1253 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1255 if (le64toh(a->object_offset) > le64toh(b->object_offset))
/* Appends one log entry built from 'n_iovec' fields. Each iovec is stored (or
 * found) as a data object; its offset and hash become an EntryItem and its
 * hash is XORed into the entry's xor_hash. The items are sorted by file offset
 * and handed to journal_file_append_entry_internal(), after which watchers are
 * notified through journal_file_post_change(). A current timestamp is fetched
 * into a local (presumably when 'ts' is NULL -- confirm), and a monotonic
 * timestamp older than the file's tail is checked for while the tail value is
 * marked valid. */
1260 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1264 uint64_t xor_hash = 0;
1265 struct dual_timestamp _ts;
1268 assert(iovec || n_iovec == 0);
1274 dual_timestamp_get(&_ts);
1278 if (f->tail_entry_monotonic_valid &&
1279 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1283 r = journal_file_maybe_append_tag(f, ts->realtime);
1288 /* alloca() can't take 0, hence let's allocate at least one */
1289 items = alloca(sizeof(EntryItem) * MAX(1, n_iovec));
1291 for (i = 0; i < n_iovec; i++) {
1295 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
/* o->data.hash is little-endian on disk: convert for the XOR, copy verbatim into the item */
1299 xor_hash ^= le64toh(o->data.hash);
1300 items[i].object_offset = htole64(p);
1301 items[i].hash = o->data.hash;
1304 /* Order by the position on disk, in order to improve seek
1305 * times for rotating media. */
1306 qsort(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1308 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1310 journal_file_post_change(f);
/* One cached position inside an entry array chain. Items are stored in a
 * hashmap keyed by the address of 'first' (see chain_cache_put()). */
1315 typedef struct ChainCacheItem {
1316 uint64_t first; /* the array at the begin of the chain */
1317 uint64_t array; /* the cached array */
1318 uint64_t begin; /* the first item in the cached array */
1319 uint64_t total; /* the total number of items in all arrays before this one in the chain */
/* Records a position within an entry array chain in the chain cache 'h'. Once
 * the cache holds CHAIN_CACHE_MAX items an existing item is taken out with
 * hashmap_steal_first() for reuse; otherwise a new item is allocated. Items
 * are inserted under the key &ci->first. */
1322 static void chain_cache_put(
1331 /* If the chain item to cache for this chain is the
1332 * first one it's not worth caching anything */
1336 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1337 ci = hashmap_steal_first(h);
1339 ci = new(ChainCacheItem, 1);
1346 if (hashmap_put(h, &ci->first, ci) < 0) {
/* An item that was passed in must already belong to this chain */
1351 assert(ci->first == first);
/* Fetches the entry at index 'i' of the entry array chain starting at 'first'.
 * A cached chain position is consulted when the index lies beyond the cached
 * total; the chain is then walked array by array until the index falls into
 * one. The position reached is cached and the referenced entry object is
 * mapped. */
1358 static int generic_array_get(JournalFile *f,
1361 Object **ret, uint64_t *offset) {
1364 uint64_t p = 0, a, t = 0;
1372 /* Try the chain cache first */
1373 ci = hashmap_get(f->chain_cache, &first);
1374 if (ci && i > ci->total) {
1383 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1387 k = journal_file_entry_array_n_items(o);
1389 p = le64toh(o->entry_array.items[i]);
1395 a = le64toh(o->entry_array.next_entry_array_offset);
1401 /* Let's cache this item for the next invocation */
1402 chain_cache_put(f->chain_cache, ci, first, a, o->entry_array.items[0], t);
1404 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Like generic_array_get(), for lists that keep one entry inline in 'extra'
 * ahead of the array chain: the entry is either mapped directly from 'extra'
 * or looked up in the chain at index i-1 (presumably 'extra' serves index 0
 * -- confirm). */
1417 static int generic_array_get_plus_one(JournalFile *f,
1421 Object **ret, uint64_t *offset) {
1430 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1443 return generic_array_get(f, first, i-1, ret, offset);
/* Searches the entry array chain starting at 'first' for 'needle', using the
 * caller-supplied predicate test_object(), which classifies an entry offset as
 * TEST_FOUND, TEST_LEFT or TEST_RIGHT. TEST_FOUND is folded into TEST_RIGHT or
 * TEST_LEFT depending on 'direction'. A cached chain position may be used to
 * skip ahead. Within an array the range [left, right] is halved until it
 * collapses; for DIRECTION_UP the result may be the item just before the
 * collapse point ('subtract_one'). The final position is cached, the entry is
 * mapped and its index in the whole chain is reported through 'idx'. */
1452 static int generic_array_bisect(JournalFile *f,
1456 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1457 direction_t direction,
1462 uint64_t a, p, t = 0, i = 0, last_p = 0;
1463 bool subtract_one = false;
1464 Object *o, *array = NULL;
1469 assert(test_object);
1471 /* Start with the first array in the chain */
1474 ci = hashmap_get(f->chain_cache, &first);
1475 if (ci && n > ci->total) {
1476 /* Ah, we have iterated this bisection array chain
1477 * previously! Let's see if we can skip ahead in the
1478 * chain, as far as the last time. But we can't jump
1479 * backwards in the chain, so let's check that
1482 r = test_object(f, ci->begin, needle);
1486 if (r == TEST_LEFT) {
1487 /* OK, what we are looking for is right of the
1488 * begin of this EntryArray, so let's jump
1489 * straight to previously cached array in the
1499 uint64_t left, right, k, lp;
1501 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1505 k = journal_file_entry_array_n_items(array);
1511 lp = p = le64toh(array->entry_array.items[i]);
1515 r = test_object(f, p, needle);
1519 if (r == TEST_FOUND)
1520 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1522 if (r == TEST_RIGHT) {
1526 if (left == right) {
1527 if (direction == DIRECTION_UP)
1528 subtract_one = true;
1534 assert(left < right);
1536 i = (left + right) / 2;
1537 p = le64toh(array->entry_array.items[i]);
1541 r = test_object(f, p, needle);
1545 if (r == TEST_FOUND)
1546 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1548 if (r == TEST_RIGHT)
1556 if (direction == DIRECTION_UP) {
1558 subtract_one = true;
1569 a = le64toh(array->entry_array.next_entry_array_offset);
1575 if (subtract_one && t == 0 && i == 0)
1578 /* Let's cache this item for the next invocation */
1579 chain_cache_put(f->chain_cache, ci, first, a, array->entry_array.items[0], t);
/* Pick the resulting entry offset; stepping back from slot 0 cannot use this array */
1581 if (subtract_one && i == 0)
1583 else if (subtract_one)
1584 p = le64toh(array->entry_array.items[i-1]);
1586 p = le64toh(array->entry_array.items[i]);
1588 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Index within the whole chain: t (apparently the item count of the preceding
 * arrays -- confirm) plus the slot, minus one when stepping back */
1599 *idx = t + i + (subtract_one ? -1 : 0);
/* Like generic_array_bisect(), for lists that keep one entry inline in 'extra'
 * ahead of the array chain. 'extra' is tested first with test_object(); the
 * chain is then bisected over n-1 items. For DIRECTION_UP the inline entry is
 * remembered as a fallback ('step_back') in case the chain yields nothing, in
 * which case the entry at 'extra' is mapped and returned. */
1604 static int generic_array_bisect_plus_one(JournalFile *f,
1609 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1610 direction_t direction,
1616 bool step_back = false;
1620 assert(test_object);
1625 /* This bisects the array in object 'first', but first checks
1627 r = test_object(f, extra, needle);
1631 if (r == TEST_FOUND)
1632 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1634 /* if we are looking with DIRECTION_UP then we need to first
1635 see if in the actual array there is a matching entry, and
1636 return the last one of that. But if there isn't any we need
1637 to return this one. Hence remember this, and return it
1640 step_back = direction == DIRECTION_UP;
1642 if (r == TEST_RIGHT) {
1643 if (direction == DIRECTION_DOWN)
1649 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1651 if (r == 0 && step_back)
1660 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
/* Bisection predicate that compares the entry offset 'p' itself with
 * 'needle'; no object is mapped. */
1676 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1682 else if (p < needle)
/* Seeks in the file's global entry array (rooted in the header) by bisection,
 * presumably using test_object_offset() as the predicate -- confirm. */
1688 int journal_file_move_to_entry_by_offset(
1691 direction_t direction,
1695 return generic_array_bisect(f,
1696 le64toh(f->header->entry_array_offset),
1697 le64toh(f->header->n_entries),
/* Bisection predicate: maps the entry at 'p' and compares its sequence number
 * (converted to host order) with 'needle'. */
1705 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1712 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1716 if (le64toh(o->entry.seqnum) == needle)
1718 else if (le64toh(o->entry.seqnum) < needle)
/* Seeks in the file's global entry array (rooted in the header) by bisection,
 * presumably using test_object_seqnum() as the predicate -- confirm. */
1724 int journal_file_move_to_entry_by_seqnum(
1727 direction_t direction,
1731 return generic_array_bisect(f,
1732 le64toh(f->header->entry_array_offset),
1733 le64toh(f->header->n_entries),
/* Bisection predicate: maps the entry at 'p' and compares its realtime
 * timestamp (converted to host order) with 'needle'. */
1740 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1747 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1751 if (le64toh(o->entry.realtime) == needle)
1753 else if (le64toh(o->entry.realtime) < needle)
/* Seeks in the file's global entry array (rooted in the header) by bisection
 * on the entries' realtime timestamps. */
1759 int journal_file_move_to_entry_by_realtime(
1762 direction_t direction,
1766 return generic_array_bisect(f,
1767 le64toh(f->header->entry_array_offset),
1768 le64toh(f->header->n_entries),
1770 test_object_realtime,
/* Bisection predicate: maps the entry at 'p' and compares its monotonic
 * timestamp (converted to host order) with 'needle'. */
1775 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1782 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1786 if (le64toh(o->entry.monotonic) == needle)
1788 else if (le64toh(o->entry.monotonic) < needle)
/* Seeks by monotonic timestamp within one boot. The data object for
 * "_BOOT_ID=<boot_id>" is looked up, and its entry list (inline entry plus
 * array chain) is bisected on the entries' monotonic timestamps. */
1794 int journal_file_move_to_entry_by_monotonic(
1798 direction_t direction,
/* Sized for the 9-byte prefix, 32 characters of ID and the trailing NUL */
1802 char t[9+32+1] = "_BOOT_ID=";
1808 sd_id128_to_string(boot_id, t + 9);
1809 r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1815 return generic_array_bisect_plus_one(f,
1816 le64toh(o->data.entry_offset),
1817 le64toh(o->data.entry_array_offset),
1818 le64toh(o->data.n_entries),
1820 test_object_monotonic,
/* Moves to the entry following or preceding the entry 'o' at offset 'p' in the
 * file's global entry array. Without a current entry the index starts at the
 * first entry for DIRECTION_DOWN or the last one for DIRECTION_UP; otherwise
 * the current entry's index is found by bisection and then adjusted according
 * to 'direction'. The entry at the resulting index is fetched with
 * generic_array_get(). */
1825 int journal_file_next_entry(
1827 Object *o, uint64_t p,
1828 direction_t direction,
1829 Object **ret, uint64_t *offset) {
/* An entry object may only be passed together with its non-zero offset */
1835 assert(p > 0 || !o);
1837 n = le64toh(f->header->n_entries);
1842 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1844 if (o->object.type != OBJECT_ENTRY)
1847 r = generic_array_bisect(f,
1848 le64toh(f->header->entry_array_offset),
1849 le64toh(f->header->n_entries),
1858 if (direction == DIRECTION_DOWN) {
1871 /* And jump to it */
1872 return generic_array_get(f,
1873 le64toh(f->header->entry_array_offset),
/* Moves 'skip' entries away from the entry 'o' at offset 'p' in the file's
 * global entry array. The current entry's index is found by bisection, the
 * signed skip is applied to it (a backward skip reaching past the start is
 * handled separately from the plain subtraction), and the entry at the new
 * index is fetched with generic_array_get(). */
1878 int journal_file_skip_entry(
1880 Object *o, uint64_t p,
1882 Object **ret, uint64_t *offset) {
1891 if (o->object.type != OBJECT_ENTRY)
1894 r = generic_array_bisect(f,
1895 le64toh(f->header->entry_array_offset),
1896 le64toh(f->header->n_entries),
1905 /* Calculate new index */
1907 if ((uint64_t) -skip >= i)
1910 i = i - (uint64_t) -skip;
1912 i += (uint64_t) skip;
1914 n = le64toh(f->header->n_entries);
1921 return generic_array_get(f,
1922 le64toh(f->header->entry_array_offset),
/* Like journal_file_next_entry(), but iterates the entry list of the data
 * object at 'data_offset' (inline entry plus array chain) instead of the
 * file's global entry array. */
1927 int journal_file_next_entry_for_data(
1929 Object *o, uint64_t p,
1930 uint64_t data_offset,
1931 direction_t direction,
1932 Object **ret, uint64_t *offset) {
/* An entry object may only be passed together with its non-zero offset */
1939 assert(p > 0 || !o);
1941 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1945 n = le64toh(d->data.n_entries);
1950 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1952 if (o->object.type != OBJECT_ENTRY)
1955 r = generic_array_bisect_plus_one(f,
1956 le64toh(d->data.entry_offset),
1957 le64toh(d->data.entry_array_offset),
1958 le64toh(d->data.n_entries),
1968 if (direction == DIRECTION_DOWN) {
1982 return generic_array_get_plus_one(f,
1983 le64toh(d->data.entry_offset),
1984 le64toh(d->data.entry_array_offset),
/* Maps the data object at 'data_offset' and bisects its entry list (inline
 * entry plus array chain); presumably by entry offset, as the name says --
 * the predicate passed is to be confirmed. */
1989 int journal_file_move_to_entry_by_offset_for_data(
1991 uint64_t data_offset,
1993 direction_t direction,
1994 Object **ret, uint64_t *offset) {
2001 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2005 return generic_array_bisect_plus_one(f,
2006 le64toh(d->data.entry_offset),
2007 le64toh(d->data.entry_array_offset),
2008 le64toh(d->data.n_entries),
/* Seek by monotonic time among the entries of the data object at
 * data_offset. The "_BOOT_ID=<id>" data object is looked up first and
 * its entry list is bisected with test_object_monotonic; afterwards the
 * entry lists of both data objects are bisected in turn until an entry
 * present in both lists is found. */
2015 int journal_file_move_to_entry_by_monotonic_for_data(
2017 uint64_t data_offset,
2020 direction_t direction,
2021 Object **ret, uint64_t *offset) {
/* 9 bytes of "_BOOT_ID=" prefix, 32 bytes for the formatted ID, 1 for NUL */
2023 char t[9+32+1] = "_BOOT_ID=";
2030 /* First, seek by time */
/* Append the formatted boot ID right after the prefix */
2031 sd_id128_to_string(boot_id, t + 9);
/* o receives the boot ID data object, b its offset */
2032 r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
/* Bisect the boot ID's entry list by monotonic timestamp */
2038 r = generic_array_bisect_plus_one(f,
2039 le64toh(o->data.entry_offset),
2040 le64toh(o->data.entry_array_offset),
2041 le64toh(o->data.n_entries),
2043 test_object_monotonic,
2049 /* And now, continue seeking until we find an entry that
2050 * exists in both bisection arrays */
/* Load the data object the caller asked for */
2056 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
/* Bisect the entry list of the requested data object */
2060 r = generic_array_bisect_plus_one(f,
2061 le64toh(d->data.entry_offset),
2062 le64toh(d->data.entry_array_offset),
2063 le64toh(d->data.n_entries),
/* Look the boot ID data object up again via its saved offset b.
 * NOTE(review): presumably needed because the earlier pointer o may no
 * longer be valid after other objects were accessed -- confirm. */
2071 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
/* Bisect the entry list of the boot ID data object */
2075 r = generic_array_bisect_plus_one(f,
2076 le64toh(o->data.entry_offset),
2077 le64toh(o->data.entry_array_offset),
2078 le64toh(o->data.n_entries),
/* Bisect the entry list of the data object located at data_offset and
 * return the result of that bisection.
 * NOTE(review): judging by the function name the search key is a
 * sequence number; the needle and comparison callback are not visible
 * in this excerpt. */
2102 int journal_file_move_to_entry_by_seqnum_for_data(
2104 uint64_t data_offset,
2106 direction_t direction,
2107 Object **ret, uint64_t *offset) {
/* Load the data object whose entry list is searched */
2114 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
/* Search the entries referencing this data object */
2118 return generic_array_bisect_plus_one(f,
2119 le64toh(d->data.entry_offset),
2120 le64toh(d->data.entry_array_offset),
2121 le64toh(d->data.n_entries),
/* Bisect the entry list of the data object located at data_offset,
 * comparing candidates with test_object_realtime, and return the result
 * of that bisection. */
2128 int journal_file_move_to_entry_by_realtime_for_data(
2130 uint64_t data_offset,
2132 direction_t direction,
2133 Object **ret, uint64_t *offset) {
/* Load the data object whose entry list is searched */
2140 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
/* Search the entries referencing this data object by realtime timestamp */
2144 return generic_array_bisect_plus_one(f,
2145 le64toh(d->data.entry_offset),
2146 le64toh(d->data.entry_array_offset),
2147 le64toh(d->data.n_entries),
2149 test_object_realtime,
/* Debugging aid: print the file header followed by one line per object,
 * walking the objects in file order starting right after the header. */
2154 void journal_file_dump(JournalFile *f) {
2161 journal_file_print_header(f);
/* The first object is located at the offset given by the header size */
2163 p = le64toh(f->header->header_size);
/* -1 is passed as object type.
 * NOTE(review): assumed to mean "accept any type" -- confirm against
 * journal_file_move_to_object(). */
2165 r = journal_file_move_to_object(f, -1, p, &o);
/* Print a description depending on the object type */
2169 switch (o->object.type) {
2172 printf("Type: OBJECT_UNUSED\n");
2176 printf("Type: OBJECT_DATA\n");
2180 printf("Type: OBJECT_FIELD\n");
2184 printf("Type: OBJECT_ENTRY seqnum=%llu monotonic=%llu realtime=%llu\n",
2185 (unsigned long long) le64toh(o->entry.seqnum),
2186 (unsigned long long) le64toh(o->entry.monotonic),
2187 (unsigned long long) le64toh(o->entry.realtime));
2190 case OBJECT_FIELD_HASH_TABLE:
2191 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2194 case OBJECT_DATA_HASH_TABLE:
2195 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2198 case OBJECT_ENTRY_ARRAY:
2199 printf("Type: OBJECT_ENTRY_ARRAY\n");
2203 printf("Type: OBJECT_TAG seqnum=%llu epoch=%llu\n",
2204 (unsigned long long) le64toh(o->tag.seqnum),
2205 (unsigned long long) le64toh(o->tag.epoch));
2209 printf("Type: unknown (%u)\n", o->object.type);
2213 if (o->object.flags & OBJECT_COMPRESSED)
2214 printf("Flags: COMPRESSED\n");
/* The object at the tail offset recorded in the header is the last one */
2216 if (p == le64toh(f->header->tail_object_offset))
/* Advance to the next object: object sizes are rounded up with ALIGN64 */
2219 p = p + ALIGN64(le64toh(o->object.size));
/* NOTE(review): presumably the error path taken when an object cannot
 * be accessed; the jump to it is not visible here. */
2224 log_error("File corrupt");
/* Print a human-readable summary of the journal file header to stdout:
 * IDs, state, flags, sizes, hash table statistics, sequence numbers,
 * timestamps, object counters and the disk usage of the file. Counters
 * added to the header in later format revisions are only printed if
 * the header of this file actually contains them. */
2227 void journal_file_print_header(JournalFile *f) {
/* One buffer per formatted 128-bit ID. All formatted IDs are passed to
 * a single printf() call below, hence every ID needs its own buffer:
 * sharing one between boot_id and seqnum_id would make both %s
 * conversions print the same string. */
2228 char a[33], b[33], c[33], d[33];
2229 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
2231 char bytes[FORMAT_BYTES_MAX];
2235 printf("File Path: %s\n"
2239 "Sequential Number ID: %s\n"
2241 "Compatible Flags:%s%s\n"
2242 "Incompatible Flags:%s%s\n"
2243 "Header size: %llu\n"
2244 "Arena size: %llu\n"
2245 "Data Hash Table Size: %llu\n"
2246 "Field Hash Table Size: %llu\n"
2247 "Rotate Suggested: %s\n"
2248 "Head Sequential Number: %llu\n"
2249 "Tail Sequential Number: %llu\n"
2250 "Head Realtime Timestamp: %s\n"
2251 "Tail Realtime Timestamp: %s\n"
2253 "Entry Objects: %llu\n",
2255 sd_id128_to_string(f->header->file_id, a),
2256 sd_id128_to_string(f->header->machine_id, b),
2257 sd_id128_to_string(f->header->boot_id, c),
2258 sd_id128_to_string(f->header->seqnum_id, d),
2259 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2260 f->header->state == STATE_ONLINE ? "ONLINE" :
2261 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2262 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2263 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2264 JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2265 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2266 (unsigned long long) le64toh(f->header->header_size),
2267 (unsigned long long) le64toh(f->header->arena_size),
2268 (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2269 (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2270 yes_no(journal_file_rotate_suggested(f, 0)),
2271 (unsigned long long) le64toh(f->header->head_entry_seqnum),
2272 (unsigned long long) le64toh(f->header->tail_entry_seqnum),
2273 format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2274 format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2275 (unsigned long long) le64toh(f->header->n_objects),
2276 (unsigned long long) le64toh(f->header->n_entries));
/* Fill level: number of data objects relative to the number of hash
 * table buckets (table size in bytes divided by the item size) */
2278 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2279 printf("Data Objects: %llu\n"
2280 "Data Hash Table Fill: %.1f%%\n",
2281 (unsigned long long) le64toh(f->header->n_data),
2282 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
/* Same calculation for the field hash table */
2284 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2285 printf("Field Objects: %llu\n"
2286 "Field Hash Table Fill: %.1f%%\n",
2287 (unsigned long long) le64toh(f->header->n_fields),
2288 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2290 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2291 printf("Tag Objects: %llu\n",
2292 (unsigned long long) le64toh(f->header->n_tags));
2293 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2294 printf("Entry Array Objects: %llu\n",
2295 (unsigned long long) le64toh(f->header->n_entry_arrays));
/* Disk usage is derived from st_blocks multiplied by 512 and is only
 * printed if fstat() succeeds */
2297 if (fstat(f->fd, &st) >= 0)
2298 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
/* Open (and, for an empty writable file, initialize) a journal file.
 * The header is validated for pre-existing files, the metrics are set
 * up, and the hash tables are created for newly created files before
 * both hash tables are mapped. */
2301 int journal_file_open(
2307 JournalMetrics *metrics,
2308 MMapCache *mmap_cache,
2309 JournalFile *template,
2310 JournalFile **ret) {
2314 bool newly_created = false;
/* Only O_RDONLY and O_RDWR access modes are accepted */
2319 if ((flags & O_ACCMODE) != O_RDONLY &&
2320 (flags & O_ACCMODE) != O_RDWR)
/* The file name has to carry a ".journal" or ".journal~" suffix */
2323 if (!endswith(fname, ".journal") &&
2324 !endswith(fname, ".journal~"))
2327 f = new0(JournalFile, 1);
2335 f->prot = prot_from_flags(flags);
2336 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2338 f->compress = compress;
/* Either take a reference on the mmap cache that was passed in, or
 * create a new one */
2345 f->mmap = mmap_cache_ref(mmap_cache);
2347 f->mmap = mmap_cache_new();
2354 f->path = strdup(fname);
2360 f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2361 if (!f->chain_cache) {
2366 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2372 if (fstat(f->fd, &f->last_stat) < 0) {
/* A writable file of size zero is treated as a new file that needs to
 * be initialized */
2377 if (f->last_stat.st_size == 0 && f->writable) {
2381 /* Let's attach the creation time to the journal file,
2382 * so that the vacuuming code knows the age of this
2383 * file even if the file might end up corrupted one
2384 * day... Ideally we'd just use the creation time many
2385 * file systems maintain for each file, but there is
2386 * currently no usable API to query this, hence let's
2387 * emulate this via extended attributes. If extended
2388 * attributes are not supported we'll just skip this,
2389 * and rely solely on mtime/atime/ctime of the file.*/
/* The timestamp is stored in little-endian byte order; the return value
 * of fsetxattr() is deliberately not checked (see above) */
2391 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2392 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2396 /* Try to load the FSPRG state, and if we can't, then
2397 * just don't do sealing */
2399 r = journal_file_fss_load(f);
/* Write the initial header, passing the template file along */
2405 r = journal_file_init_header(f, template);
/* Refresh the cached stat data after the header was initialized */
2409 if (fstat(f->fd, &f->last_stat) < 0) {
2414 newly_created = true;
/* The file must be at least as large as the smallest supported header */
2417 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
/* Map the header from the start of the file as a shared mapping */
2422 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2423 if (f->header == MAP_FAILED) {
/* Headers of pre-existing files are verified */
2429 if (!newly_created) {
2430 r = journal_file_verify_header(f);
/* For pre-existing writable files the FSPRG state is loaded at this point */
2436 if (!newly_created && f->writable) {
2437 r = journal_file_fss_load(f);
/* Metrics: either complete and copy the ones passed in, or inherit the
 * metrics of the template file */
2445 journal_default_metrics(metrics, f->fd);
2446 f->metrics = *metrics;
2447 } else if (template)
2448 f->metrics = template->metrics;
2450 r = journal_file_refresh_header(f);
2456 r = journal_file_hmac_setup(f);
/* New files get their hash tables and the first tag set up */
2461 if (newly_created) {
2462 r = journal_file_setup_field_hash_table(f);
2466 r = journal_file_setup_data_hash_table(f);
2471 r = journal_file_append_first_tag(f);
/* Map both hash tables */
2477 r = journal_file_map_field_hash_table(f);
2481 r = journal_file_map_data_hash_table(f);
/* NOTE(review): presumably the failure path, releasing the partially
 * set up file object -- the label is not visible here. */
2489 journal_file_close(f);
/* Rotate a writable journal file: rename the current file to an archive
 * name derived from its path, its sequence number ID and its head entry
 * sequence number/realtime timestamp, mark it as archived, open a new
 * file under the original path and close the old one. */
2494 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2497 JournalFile *old_file, *new_file = NULL;
/* Only writable files can be rotated */
2505 if (!old_file->writable)
/* Only files with a ".journal" suffix can be rotated */
2508 if (!endswith(old_file->path, ".journal"))
2511 l = strlen(old_file->path);
/* Room for the archive name. The 8 byte ".journal" suffix is cut off the
 * original path and re-added by the format string below, hence: l bytes
 * for path and suffix, 1 separator byte, 32 bytes of formatted ID,
 * '-' plus 16 hex digits twice, and the trailing NUL. */
2513 p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
/* Copy the path without its ".journal" suffix */
2517 memcpy(p, old_file->path, l - 8);
/* The formatted ID starts one byte after the copied prefix; that byte
 * is filled in by a statement not visible here */
2519 sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
/* Append "-<head seqnum>-<head realtime>.journal"; the size argument
 * covers the 42 formatted bytes plus NUL */
2520 snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2521 "-%016llx-%016llx.journal",
2522 (unsigned long long) le64toh((*f)->header->head_entry_seqnum),
2523 (unsigned long long) le64toh((*f)->header->head_entry_realtime));
2525 r = rename(old_file->path, p);
/* Mark the renamed file as archived in its header */
2531 old_file->header->state = STATE_ARCHIVED;
/* Open a new file under the original path, reusing flags, mode and mmap
 * cache of the old file and passing the old file as template */
2533 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2534 journal_file_close(old_file);
/* Like journal_file_open(), but if opening fails with one of the error
 * codes listed below and the file was requested writable with O_CREAT,
 * the existing file is renamed to a "...journal~" name and opening is
 * attempted a second time. */
2540 int journal_file_open_reliably(
2546 JournalMetrics *metrics,
2547 MMapCache *mmap_cache,
2548 JournalFile *template,
2549 JournalFile **ret) {
2555 r = journal_file_open(fname, flags, mode, compress, seal,
2556 metrics, mmap_cache, template, ret);
/* Any result other than the error codes below is not handled by the
 * renaming logic.
 * NOTE(review): presumably it is returned to the caller as is -- the
 * statement following this condition is not visible here. */
2557 if (r != -EBADMSG && /* corrupted */
2558 r != -ENODATA && /* truncated */
2559 r != -EHOSTDOWN && /* other machine */
2560 r != -EPROTONOSUPPORT && /* incompatible feature */
2561 r != -EBUSY && /* unclean shutdown */
2562 r != -ESHUTDOWN /* already archived */)
/* Files opened read-only are never replaced */
2565 if ((flags & O_ACCMODE) == O_RDONLY)
/* Without O_CREAT no replacement file could be created */
2568 if (!(flags & O_CREAT))
/* Only files with a ".journal" suffix are replaced */
2571 if (!endswith(fname, ".journal"))
2574 /* The file is corrupted. Rotate it away and try it again (but only once) */
/* The new name ends in ".journal~" and contains the current realtime
 * clock value formatted as 16 hex digits */
2577 if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2579 (unsigned long long) now(CLOCK_REALTIME),
2583 r = rename(fname, p);
2588 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
/* Second and final attempt */
2590 return journal_file_open(fname, flags, mode, compress, seal,
2591 metrics, mmap_cache, template, ret);
/* Copy the entry object o, located at offset p in the file "from", into
 * the file "to": every data object referenced by the entry is appended
 * to "to", then a new entry referencing the appended data objects is
 * appended with the original timestamps. */
2594 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2596 uint64_t q, xor_hash = 0;
/* Carry the timestamps of the source entry over */
2609 ts.monotonic = le64toh(o->entry.monotonic);
2610 ts.realtime = le64toh(o->entry.realtime);
/* Compare against the monotonic timestamp of the destination's tail
 * entry, if that is known to be valid */
2612 if (to->tail_entry_monotonic_valid &&
2613 ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2616 n = journal_file_entry_n_items(o);
/* NOTE(review): the item array lives on the stack and is sized by an
 * item count derived from the source entry object -- confirm that n is
 * bounded for untrusted files. */
2617 items = alloca(sizeof(EntryItem) * n);
2619 for (i = 0; i < n; i++) {
/* Offset and (still little-endian) hash of the i-th referenced data object */
2626 q = le64toh(o->entry.items[i].object_offset);
2627 le_hash = o->entry.items[i].hash;
/* From here on o refers to the data object, no longer to the entry */
2629 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
/* The hash stored in the entry item must match the one of the data object */
2633 if (le_hash != o->data.hash)
/* Payload length: object size minus the offset of the payload field */
2636 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2639 /* We hit the limit on 32bit machines */
2640 if ((uint64_t) t != l)
/* Compressed payloads are uncompressed into the source file's buffer */
2643 if (o->object.flags & OBJECT_COMPRESSED) {
2647 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0))
2650 data = from->compress_buffer;
/* NOTE(review): presumably the branch used when compression support is
 * not compiled in -- the surrounding conditional is not visible here. */
2653 return -EPROTONOSUPPORT;
2656 data = o->data.payload;
/* Append the payload to the destination: u receives the resulting data
 * object, h its offset */
2658 r = journal_file_append_data(to, data, l, &u, &h);
/* Accumulate the XOR of all data hashes and record the new item */
2662 xor_hash ^= le64toh(u->data.hash);
2663 items[i].object_offset = htole64(h);
2664 items[i].hash = u->data.hash;
/* Point o back at the source entry for the next loop iteration */
2666 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2671 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
/* Fill in all metrics that are still unset (i.e. (uint64_t) -1) with
 * defaults derived from the size of the file system backing fd, and
 * bring the values into a consistent relation to each other. */
2674 void journal_default_metrics(JournalMetrics *m, int fd) {
/* Stays 0 if the file system size cannot be determined */
2675 uint64_t fs_size = 0;
2677 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
/* File system size: fragment size multiplied by the number of blocks */
2682 if (fstatvfs(fd, &ss) >= 0)
2683 fs_size = ss.f_frsize * ss.f_blocks;
/* max_use unset: derive it from the file system size, clamped to the
 * range DEFAULT_MAX_USE_LOWER..DEFAULT_MAX_USE_UPPER */
2685 if (m->max_use == (uint64_t) -1) {
2688 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2690 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2691 m->max_use = DEFAULT_MAX_USE_UPPER;
2693 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2694 m->max_use = DEFAULT_MAX_USE_LOWER;
/* NOTE(review): presumably the fallback for an unknown file system size
 * -- the surrounding condition is not visible here. */
2696 m->max_use = DEFAULT_MAX_USE_LOWER;
/* An explicitly configured max_use is page-aligned and raised to at
 * least two minimum-sized journal files */
2698 m->max_use = PAGE_ALIGN(m->max_use);
2700 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2701 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
/* max_size unset: one eighth of max_use, capped at DEFAULT_MAX_SIZE_UPPER */
2704 if (m->max_size == (uint64_t) -1) {
2705 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2707 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2708 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2710 m->max_size = PAGE_ALIGN(m->max_size);
2712 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2713 m->max_size = JOURNAL_FILE_SIZE_MIN;
/* max_use is raised so that it covers at least two files of max_size */
2715 if (m->max_size*2 > m->max_use)
2716 m->max_use = m->max_size*2;
/* min_size unset: use the minimum journal file size */
2718 if (m->min_size == (uint64_t) -1)
2719 m->min_size = JOURNAL_FILE_SIZE_MIN;
2721 m->min_size = PAGE_ALIGN(m->min_size);
2723 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2724 m->min_size = JOURNAL_FILE_SIZE_MIN;
/* max_size is raised so that it is never below min_size */
2726 if (m->min_size > m->max_size)
2727 m->max_size = m->min_size;
/* keep_free unset: derive it from the file system size, capped at
 * DEFAULT_KEEP_FREE_UPPER; DEFAULT_KEEP_FREE is the fallback */
2730 if (m->keep_free == (uint64_t) -1) {
2733 m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2735 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2736 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2739 m->keep_free = DEFAULT_KEEP_FREE;
/* Log the resulting values */
2742 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2743 format_bytes(a, sizeof(a), m->max_use),
2744 format_bytes(b, sizeof(b), m->max_size),
2745 format_bytes(c, sizeof(c), m->min_size),
2746 format_bytes(d, sizeof(d), m->keep_free));
/* Report the realtime timestamps of the head entry (*from) and of the
 * tail entry (*to) as recorded in the file header. */
2749 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
/* A head timestamp of 0 is handled separately. The raw field can be
 * compared without byte swapping since 0 is the same in any byte order.
 * NOTE(review): presumably 0 means that no timestamp is recorded. */
2754 if (f->header->head_entry_realtime == 0)
2757 *from = le64toh(f->header->head_entry_realtime);
/* Same check for the tail timestamp */
2761 if (f->header->tail_entry_realtime == 0)
2764 *to = le64toh(f->header->tail_entry_realtime);
/* Report the monotonic timestamps of the first (*from) and of the last
 * (*to) entry that references the "_BOOT_ID=<boot_id>" data object of
 * this file. */
2770 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
/* 9 bytes of "_BOOT_ID=" prefix, 32 bytes for the formatted ID, 1 for NUL */
2771 char t[9+32+1] = "_BOOT_ID=";
/* Append the formatted boot ID right after the prefix */
2779 sd_id128_to_string(boot_id, t + 9);
/* o receives the boot ID data object, p its offset */
2781 r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
/* A data object without any entries is handled separately */
2785 if (le64toh(o->data.n_entries) <= 0)
/* The first entry is referenced directly by the data object's entry_offset */
2789 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2793 *from = le64toh(o->entry.monotonic);
/* o was pointed at an entry above, hence load the data object again via
 * its saved offset */
2797 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* Fetch the last entry of the data object's list (index n_entries - 1) */
2801 r = generic_array_get_plus_one(f,
2802 le64toh(o->data.entry_offset),
2803 le64toh(o->data.entry_array_offset),
2804 le64toh(o->data.n_entries)-1,
2809 *to = le64toh(o->entry.monotonic);
2815 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2818 /* If we gained new header fields we gained new features,
2819 * hence suggest a rotation */
2820 if (le64toh(f->header->header_size) < sizeof(Header)) {
2821 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2825 /* Let's check if the hash tables grew over a certain fill
2826 * level (75%, borrowing this value from Java's hash table
2827 * implementation), and if so suggest a rotation. To calculate
2828 * the fill level we need the n_data field, which only exists
2829 * in newer versions. */
2831 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2832 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2833 log_debug("Data hash table of %s has a fill level at %.1f (%llu of %llu items, %llu file size, %llu bytes per hash table item), suggesting rotation.",
2835 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2836 (unsigned long long) le64toh(f->header->n_data),
2837 (unsigned long long) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)),
2838 (unsigned long long) (f->last_stat.st_size),
2839 (unsigned long long) (f->last_stat.st_size / le64toh(f->header->n_data)));
2843 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2844 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2845 log_debug("Field hash table of %s has a fill level at %.1f (%llu of %llu items), suggesting rotation.",
2847 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2848 (unsigned long long) le64toh(f->header->n_fields),
2849 (unsigned long long) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)));
2853 /* Are the data objects properly indexed by field objects? */
2854 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2855 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2856 le64toh(f->header->n_data) > 0 &&
2857 le64toh(f->header->n_fields) == 0)
2860 if (max_file_usec > 0) {
2863 h = le64toh(f->header->head_entry_realtime);
2864 t = now(CLOCK_REALTIME);
2866 if (h > 0 && t > h + max_file_usec)