1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
31 #include <attr/xattr.h>
34 #include "journal-def.h"
35 #include "journal-file.h"
36 #include "journal-authenticate.h"
/* Default sizes, in bytes, of the data and field hash tables: room for 2047
 * and 333 HashItem slots respectively */
41 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
42 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
/* Payload size in bytes that journal_file_append_data() compares against
 * before it tries to compress a data object */
44 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
46 /* This is the minimum journal file size */
47 #define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL) /* 64 KiB */
49 /* These are the lower and upper bounds if we deduce the max_use value
50 * from the file system size */
51 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
52 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
54 /* This is the upper bound if we deduce max_size from max_use */
55 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
57 /* This is the upper bound if we deduce the keep_free value from the file system size */
59 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
61 /* This is the keep_free value when we can't determine the file system size */
63 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
65 /* n_data was the first entry we added after the initial file format design,
 * hence the smallest header we accept ends right before that field (offset
 * passed through ALIGN64) */
66 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
68 /* How many entries to keep in the entry array chain cache at max */
69 #define CHAIN_CACHE_MAX 20
/* Shuts down a JournalFile. For a writable, sealed file a final tag is
 * appended first. An ONLINE header of a writable file is flipped to OFFLINE
 * (an archived state is left alone). Afterwards the header mapping, the file
 * descriptor, the mmap cache reference, the chain cache, the compression
 * buffer and the sealing state (fss_file mapping or fsprg_state, HMAC handle)
 * are released. */
71 void journal_file_close(JournalFile *f) {
75 /* Write the final tag */
76 if (f->seal && f->writable)
77 journal_file_append_tag(f);
80 /* Sync everything to disk, before we mark the file offline */
81 if (f->mmap && f->fd >= 0)
82 mmap_cache_close_fd(f->mmap, f->fd);
84 if (f->writable && f->fd >= 0)
88 /* Mark the file offline. Don't override the archived state if it already is set */
89 if (f->writable && f->header->state == STATE_ONLINE)
90 f->header->state = STATE_OFFLINE;
/* Drop the mapping that covers the header */
92 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
96 close_nointr_nofail(f->fd);
101 mmap_cache_unref(f->mmap);
/* Frees the cache together with the items stored in it */
103 hashmap_free_free(f->chain_cache);
106 free(f->compress_buffer);
/* Sealing state: either a mapped fss_file or a heap-allocated fsprg_state */
111 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
112 else if (f->fsprg_state)
113 free(f->fsprg_state);
118 gcry_md_close(f->hmac);
/* Builds a fresh file header in a local Header 'h' and writes it to offset 0
 * of f->fd with pwrite(). The header gets the signature, its own 64-bit
 * aligned size, flags derived from f->compress and f->seal, and a random
 * file ID. The seqnum ID and tail entry seqnum are copied from 'template';
 * otherwise the seqnum ID is set to the new file ID.
 * NOTE(review): presumably the copy happens only when 'template' is non-NULL
 * — the selecting condition is not visible here. */
124 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
132 memcpy(h.signature, HEADER_SIGNATURE, 8);
133 h.header_size = htole64(ALIGN64(sizeof(h)));
135 h.incompatible_flags =
136 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
139 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
141 r = sd_id128_randomize(&h.file_id);
/* Continue the sequence number space of the template file ... */
146 h.seqnum_id = template->header->seqnum_id;
147 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
/* ... or start a new one identified by this file's own ID */
149 h.seqnum_id = h.file_id;
151 k = pwrite(f->fd, &h, sizeof(h), 0);
/* Brings the mapped header up to date: stores this machine's ID, records the
 * current boot ID, sets the state to STATE_ONLINE and msync()s the header.
 * If the boot ID already stored in the header equals the current one, the
 * tail monotonic timestamp is flagged as valid. */
161 static int journal_file_refresh_header(JournalFile *f) {
167 r = sd_id128_get_machine(&f->header->machine_id);
171 r = sd_id128_get_boot(&boot_id);
/* Same boot as the last writer: the stored monotonic timestamp is comparable */
175 if (sd_id128_equal(boot_id, f->header->boot_id))
176 f->tail_entry_monotonic_valid = true;
178 f->header->boot_id = boot_id;
180 f->header->state = STATE_ONLINE;
182 /* Sync the online state to disk */
183 msync(f->header, PAGE_ALIGN(sizeof(Header)), MS_SYNC);
/* Sanity-checks the mapped header before the file is used. Checked are the
 * signature, unknown incompatible/compatible flag bits, the state value, the
 * header size, that header plus arena fit into the file size recorded in
 * f->last_stat, and that the table/object offsets pass VALID64() and do not
 * point into the header. The stored machine ID is compared with this
 * machine's and the stored state is inspected. Finally the compressed and
 * sealed flags are cached in f->compress and f->seal.
 * NOTE(review): each pair of flag checks looks like two build-time
 * alternatives (with and without the optional feature) — the selecting
 * preprocessor lines are not visible here, confirm. */
189 static int journal_file_verify_header(JournalFile *f) {
192 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
195 /* In both read and write mode we refuse to open files with
196 * incompatible flags we don't know */
198 if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
199 return -EPROTONOSUPPORT;
201 if (f->header->incompatible_flags != 0)
202 return -EPROTONOSUPPORT;
205 /* When open for writing we refuse to open files with
206 * compatible flags, too */
209 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
210 return -EPROTONOSUPPORT;
212 if (f->header->compatible_flags != 0)
213 return -EPROTONOSUPPORT;
217 if (f->header->state >= _STATE_MAX)
220 /* The first addition was n_data, so check that we are at least this large */
221 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
/* A sealed file must have a header that already contains n_entry_arrays */
224 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
/* Header plus arena must not exceed the file size we stat()ed */
227 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
230 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
233 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
234 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
235 !VALID64(le64toh(f->header->tail_object_offset)) ||
236 !VALID64(le64toh(f->header->entry_array_offset)))
/* None of the recorded offsets may be smaller than the header size */
239 if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
240 le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
241 le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
242 le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
247 sd_id128_t machine_id;
250 r = sd_id128_get_machine(&machine_id);
254 if (!sd_id128_equal(machine_id, f->header->machine_id))
257 state = f->header->state;
259 if (state == STATE_ONLINE) {
260 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
262 } else if (state == STATE_ARCHIVED)
264 else if (state != STATE_OFFLINE) {
265 log_debug("Journal file %s has unknown state %u.", f->path, state);
270 f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
272 f->seal = JOURNAL_HEADER_SEALED(f->header);
/* Makes sure the file is large enough to hold 'size' bytes at 'offset'.
 * The current size is taken as header_size + arena_size from the header; the
 * wanted size is PAGE_ALIGN(offset + size), but at least header_size. If the
 * file has to grow, the new size is checked against f->metrics.max_size and,
 * via fstatvfs(), against the free space left after reserving
 * f->metrics.keep_free. The missing range is then allocated with
 * posix_fallocate(), f->last_stat is refreshed with fstat() and arena_size
 * in the header is updated. */
277 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
278 uint64_t old_size, new_size;
283 /* We assume that this file is not sparse, and we know that
284 * for sure, since we always call posix_fallocate()
288 le64toh(f->header->header_size) +
289 le64toh(f->header->arena_size);
291 new_size = PAGE_ALIGN(offset + size);
292 if (new_size < le64toh(f->header->header_size))
293 new_size = le64toh(f->header->header_size);
295 if (new_size <= old_size)
298 if (f->metrics.max_size > 0 &&
299 new_size > f->metrics.max_size)
302 if (new_size > f->metrics.min_size &&
303 f->metrics.keep_free > 0) {
306 if (fstatvfs(f->fd, &svfs) >= 0) {
309 available = svfs.f_bfree * svfs.f_bsize;
311 if (available >= f->metrics.keep_free)
312 available -= f->metrics.keep_free;
316 if (new_size - old_size > available)
321 /* Note that the glibc fallocate() fallback is very
322 inefficient, hence we try to minimize the allocation area
324 r = posix_fallocate(f->fd, old_size, new_size - old_size);
328 if (fstat(f->fd, &f->last_stat) < 0)
331 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
/* Obtains a mapping of the byte range [offset, offset + size) of the file
 * from the mmap cache and hands it back through 'ret'. 'context' and
 * 'keep_always' are passed on to mmap_cache_get() unchanged. Before mapping,
 * the range is checked against the cached file size; if it appears to be out
 * of range the size is refreshed once with fstat() and -EADDRNOTAVAIL is
 * returned if the range still does not fit.
 * NOTE(review): 'offset + size' is not guarded against 64-bit wrap-around in
 * the lines visible here. */
336 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
343 /* Avoid SIGBUS on invalid accesses */
344 if (offset + size > (uint64_t) f->last_stat.st_size) {
345 /* Hmm, out of range? Let's refresh the fstat() data
346 * first, before we trust that check. */
348 if (fstat(f->fd, &f->last_stat) < 0 ||
349 offset + size > (uint64_t) f->last_stat.st_size)
350 return -EADDRNOTAVAIL;
353 return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
/* Returns the smallest size an object of o's type may have: the size of the
 * type-specific object structure from the table below, or
 * sizeof(ObjectHeader) for types that are outside the table or have no
 * entry in it. */
356 static uint64_t minimum_header_size(Object *o) {
/* Indexed by object type; slots not listed are zero-initialized */
358 static uint64_t table[] = {
359 [OBJECT_DATA] = sizeof(DataObject),
360 [OBJECT_FIELD] = sizeof(FieldObject),
361 [OBJECT_ENTRY] = sizeof(EntryObject),
362 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
363 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
364 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
365 [OBJECT_TAG] = sizeof(TagObject),
368 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
369 return sizeof(ObjectHeader);
371 return table[o->object.type];
/* Maps the object located at 'offset' and validates its header. The offset
 * must pass VALID64(). Only the ObjectHeader is mapped at first; the object
 * is then checked for a size of at least sizeof(ObjectHeader), a type above
 * OBJECT_UNUSED, a size of at least minimum_header_size() for its type and,
 * when 'type' is positive, a matching type. If the object is larger than its
 * header it is mapped again with its full size, using the object's own type
 * as mapping context. */
374 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
384 /* Objects may only be located at multiple of 64 bit */
385 if (!VALID64(offset))
388 /* One context for each type, plus one catch-all for the rest */
389 context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
391 r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
396 s = le64toh(o->object.size);
398 if (s < sizeof(ObjectHeader))
401 if (o->object.type <= OBJECT_UNUSED)
404 if (s < minimum_header_size(o))
/* A positive 'type' requests a specific object type; other values accept any */
407 if (type > 0 && o->object.type != type)
/* The header told us the real size: map the whole object now */
410 if (s > sizeof(ObjectHeader)) {
411 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
/* Picks the sequence number for the next entry: one more than the header's
 * tail_entry_seqnum, reconciled with the optional external counter 'seqnum'
 * as described below. The result becomes the new tail_entry_seqnum and, if
 * head_entry_seqnum is still 0, the head_entry_seqnum as well. */
422 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
427 r = le64toh(f->header->tail_entry_seqnum) + 1;
430 /* If an external seqnum counter was passed, we update
431 * both the local and the external one, and set it to
432 * the maximum of both */
440 f->header->tail_entry_seqnum = htole64(r);
/* First entry in this file: remember it as the head as well */
442 if (f->header->head_entry_seqnum == 0)
443 f->header->head_entry_seqnum = htole64(r);
/* Appends a new, empty object of the given type and size to the end of the
 * file. The position is derived from the header's tail_object_offset: either
 * directly after the header, or after the current tail object (its size
 * rounded with ALIGN64).
 * NOTE(review): presumably the header_size case applies when
 * tail_object_offset is 0 — the condition is not visible here.
 * Space is reserved with journal_file_allocate(), the range is mapped, and
 * the object's type and size are filled in. tail_object_offset and n_objects
 * in the header are updated accordingly. */
448 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
455 assert(type > 0 && type < _OBJECT_TYPE_MAX);
456 assert(size >= sizeof(ObjectHeader));
460 p = le64toh(f->header->tail_object_offset);
462 p = le64toh(f->header->header_size);
/* type -1: accept whatever object type the current tail has */
464 r = journal_file_move_to_object(f, -1, p, &tail);
468 p += ALIGN64(le64toh(tail->object.size));
471 r = journal_file_allocate(f, p, size);
475 r = journal_file_move_to(f, type, false, p, size, &t);
482 o->object.type = type;
483 o->object.size = htole64(size);
485 f->header->tail_object_offset = htole64(p);
486 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
/* Creates the data hash table object in a new file. The table size in bytes
 * is derived from f->metrics.max_size (see below) but never smaller than
 * DEFAULT_DATA_HASH_TABLE_SIZE. The items are zeroed, and the offset of the
 * items array plus the table size are recorded in the header. */
494 static int journal_file_setup_data_hash_table(JournalFile *f) {
501 /* We estimate that we need 1 hash table entry per 768 of
502 journal file and we want to make sure we never get beyond
503 75% fill level. Calculate the hash table size for the
504 maximum file size based on these metrics. */
506 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
507 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
508 s = DEFAULT_DATA_HASH_TABLE_SIZE;
510 log_debug("Reserving %llu entries in hash table.", (unsigned long long) (s / sizeof(HashItem)));
512 r = journal_file_append_object(f,
513 OBJECT_DATA_HASH_TABLE,
514 offsetof(Object, hash_table.items) + s,
519 memset(o->hash_table.items, 0, s);
/* The header points at the items array, not at the object header */
521 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
522 f->header->data_hash_table_size = htole64(s);
/* Creates the field hash table object in a new file, using the fixed size
 * DEFAULT_FIELD_HASH_TABLE_SIZE. The items are zeroed, and the offset of the
 * items array plus the table size are recorded in the header. */
527 static int journal_file_setup_field_hash_table(JournalFile *f) {
534 /* We use a fixed size hash table for the fields as this
535 * number should grow very slowly only */
537 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
538 r = journal_file_append_object(f,
539 OBJECT_FIELD_HASH_TABLE,
540 offsetof(Object, hash_table.items) + s,
545 memset(o->hash_table.items, 0, s);
/* The header points at the items array, not at the object header */
547 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
548 f->header->field_hash_table_size = htole64(s);
/* Maps the data hash table, whose offset and size are read from the header,
 * via journal_file_move_to() in the OBJECT_DATA_HASH_TABLE context and
 * stores the resulting pointer in f->data_hash_table. */
553 static int journal_file_map_data_hash_table(JournalFile *f) {
560 p = le64toh(f->header->data_hash_table_offset);
561 s = le64toh(f->header->data_hash_table_size);
563 r = journal_file_move_to(f,
564 OBJECT_DATA_HASH_TABLE,
571 f->data_hash_table = t;
/* Maps the field hash table, whose offset and size are read from the header,
 * via journal_file_move_to() in the OBJECT_FIELD_HASH_TABLE context and
 * stores the resulting pointer in f->field_hash_table. */
575 static int journal_file_map_field_hash_table(JournalFile *f) {
582 p = le64toh(f->header->field_hash_table_offset);
583 s = le64toh(f->header->field_hash_table_size);
585 r = journal_file_move_to(f,
586 OBJECT_FIELD_HASH_TABLE,
593 f->field_hash_table = t;
/* Hooks the FIELD object 'o' located at 'offset' into the field hash table.
 * The object's chain pointers are reset and the bucket is chosen as
 * hash modulo the number of HashItems in the table. The object becomes
 * either the bucket's head or the successor of the bucket's current tail
 * object, and in both cases the bucket's new tail.
 * NOTE(review): presumably the head case applies when the bucket has no tail
 * yet (p == 0) — the condition is not visible here.
 * n_fields is incremented if the header is large enough to contain it. */
597 static int journal_file_link_field(
610 if (o->object.type != OBJECT_FIELD)
613 /* This might alter the window we are looking at */
615 o->field.next_hash_offset = o->field.head_data_offset = 0;
617 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
618 p = le64toh(f->field_hash_table[h].tail_hash_offset);
620 f->field_hash_table[h].head_hash_offset = htole64(offset);
/* Patch the previous tail of the bucket so that it points at the new object */
622 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
626 o->field.next_hash_offset = htole64(offset);
629 f->field_hash_table[h].tail_hash_offset = htole64(offset);
631 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
632 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
/* Hooks the DATA object 'o' located at 'offset' into the data hash table.
 * The object's chain pointers, entry links and entry counter are reset and
 * the bucket is chosen as hash modulo the number of HashItems in the table.
 * The object becomes either the bucket's head or the successor of the
 * bucket's current tail object, and in both cases the bucket's new tail.
 * n_data is incremented if the header is large enough to contain it. */
637 static int journal_file_link_data(
650 if (o->object.type != OBJECT_DATA)
653 /* This might alter the window we are looking at */
655 o->data.next_hash_offset = o->data.next_field_offset = 0;
656 o->data.entry_offset = o->data.entry_array_offset = 0;
657 o->data.n_entries = 0;
659 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
660 p = le64toh(f->data_hash_table[h].tail_hash_offset);
662 /* Only entry in the hash table is easy */
663 f->data_hash_table[h].head_hash_offset = htole64(offset);
665 /* Move back to the previous data object, to patch in
668 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
672 o->data.next_hash_offset = htole64(offset);
675 f->data_hash_table[h].tail_hash_offset = htole64(offset);
677 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
678 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
/* Looks up the FIELD object whose payload equals the 'size' bytes at 'field',
 * given the precomputed 'hash'. The bucket chain selected by hash modulo the
 * number of HashItems is walked via next_hash_offset; an object matches when
 * its stored hash, its total object size and its payload bytes all agree. */
683 int journal_file_find_field_object_with_hash(
685 const void *field, uint64_t size, uint64_t hash,
686 Object **ret, uint64_t *offset) {
688 uint64_t p, osize, h;
692 assert(field && size > 0);
/* Object size a matching field object must have */
694 osize = offsetof(Object, field.payload) + size;
696 if (f->header->field_hash_table_size == 0)
699 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
700 p = le64toh(f->field_hash_table[h].head_hash_offset);
705 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
709 if (le64toh(o->field.hash) == hash &&
710 le64toh(o->object.size) == osize &&
711 memcmp(o->field.payload, field, size) == 0) {
/* No match: continue with the next object in this bucket */
721 p = le64toh(o->field.next_hash_offset);
/* Convenience wrapper: computes hash64() over the field name and forwards to
 * journal_file_find_field_object_with_hash(). */
727 int journal_file_find_field_object(
729 const void *field, uint64_t size,
730 Object **ret, uint64_t *offset) {
735 assert(field && size > 0);
737 hash = hash64(field, size);
739 return journal_file_find_field_object_with_hash(f,
/* Looks up the DATA object whose payload equals the 'size' bytes at 'data',
 * given the precomputed 'hash'. The bucket chain selected by hash modulo the
 * number of HashItems is walked via next_hash_offset and objects with a
 * different stored hash are skipped. Objects flagged OBJECT_COMPRESSED are
 * decompressed into f->compress_buffer before the comparison; uncompressed
 * objects are compared by total object size and payload bytes.
 * NOTE(review): the -EPROTONOSUPPORT return presumably belongs to builds
 * without compression support — the selecting lines are not visible here. */
744 int journal_file_find_data_object_with_hash(
746 const void *data, uint64_t size, uint64_t hash,
747 Object **ret, uint64_t *offset) {
749 uint64_t p, osize, h;
753 assert(data || size == 0);
/* Object size a matching uncompressed data object must have */
755 osize = offsetof(Object, data.payload) + size;
757 if (f->header->data_hash_table_size == 0)
760 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
761 p = le64toh(f->data_hash_table[h].head_hash_offset);
766 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
770 if (le64toh(o->data.hash) != hash)
773 if (o->object.flags & OBJECT_COMPRESSED) {
/* l: number of compressed payload bytes stored in the object */
777 l = le64toh(o->object.size);
778 if (l <= offsetof(Object, data.payload))
781 l -= offsetof(Object, data.payload);
783 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
787 memcmp(f->compress_buffer, data, size) == 0) {
798 return -EPROTONOSUPPORT;
801 } else if (le64toh(o->object.size) == osize &&
802 memcmp(o->data.payload, data, size) == 0) {
/* No match: continue with the next object in this bucket */
814 p = le64toh(o->data.next_hash_offset);
/* Convenience wrapper: computes hash64() over the data and forwards to
 * journal_file_find_data_object_with_hash(). */
820 int journal_file_find_data_object(
822 const void *data, uint64_t size,
823 Object **ret, uint64_t *offset) {
828 assert(data || size == 0);
830 hash = hash64(data, size);
832 return journal_file_find_data_object_with_hash(f,
/* Looks up the FIELD object for the given field name and, failing that,
 * appends one. A new object gets the hash and the name as payload, is linked
 * into the field hash table, re-mapped (linking may move the mapping window)
 * and fed to journal_file_hmac_put_object().
 * NOTE(review): presumably an existing object found by the lookup is
 * returned without appending — that branch is not visible here. */
837 static int journal_file_append_field(
839 const void *field, uint64_t size,
840 Object **ret, uint64_t *offset) {
848 assert(field && size > 0);
850 hash = hash64(field, size);
852 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
866 osize = offsetof(Object, field.payload) + size;
867 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
869 o->field.hash = htole64(hash);
870 memcpy(o->field.payload, field, size);
872 r = journal_file_link_field(f, o, p, hash);
876 /* The linking might have altered the window, so let's
877 * refresh our pointer */
878 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
883 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
/* Looks up the DATA object for the given payload and, failing that, appends
 * one. A new object gets the hash and the payload, which is stored compressed
 * when the size reaches COMPRESSION_SIZE_THRESHOLD and compress_blob()
 * succeeds, and is copied verbatim otherwise. The object is linked into the
 * data hash table and re-mapped (linking may move the mapping window). If
 * the payload has the form "NAME=..." with a non-empty NAME, the matching
 * FIELD object is created or found and the data object is pushed onto the
 * head of that field's data list. Finally the object is fed to
 * journal_file_hmac_put_object().
 * NOTE(review): presumably an existing object found by the lookup is
 * returned without appending — that branch is not visible here. */
897 static int journal_file_append_data(
899 const void *data, uint64_t size,
900 Object **ret, uint64_t *offset) {
906 bool compressed = false;
910 assert(data || size == 0);
912 hash = hash64(data, size);
914 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
928 osize = offsetof(Object, data.payload) + size;
929 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
933 o->data.hash = htole64(hash);
937 size >= COMPRESSION_SIZE_THRESHOLD) {
940 compressed = compress_blob(data, size, o->data.payload, &rsize);
/* Shrink the recorded object size to the compressed payload */
943 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
944 o->object.flags |= OBJECT_COMPRESSED;
/* Sizes are 64 bit wide: print them as unsigned long long so that
 * nothing is truncated where long has 32 bits */
946 log_debug("Compressed data object %llu -> %llu", (unsigned long long) size, (unsigned long long) rsize);
951 if (!compressed && size > 0)
952 memcpy(o->data.payload, data, size);
954 r = journal_file_link_data(f, o, p, hash);
958 /* The linking might have altered the window, so let's
959 * refresh our pointer */
960 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* Everything in front of the first '=' is the field name */
964 eq = memchr(data, '=', size);
965 if (eq && eq > data) {
969 /* Create field object ... */
970 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
974 /* ... and link it in. */
975 o->data.next_field_offset = fo->field.head_data_offset;
/* p is a host-order offset being stored on disk, hence htole64() */
976 fo->field.head_data_offset = htole64(p);
980 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
/* Number of EntryItems stored in ENTRY object 'o', derived from the object
 * size minus the offset of the items array. */
994 uint64_t journal_file_entry_n_items(Object *o) {
997 if (o->object.type != OBJECT_ENTRY)
1000 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
/* Number of 64-bit slots in ENTRY_ARRAY object 'o', derived from the object
 * size minus the offset of the items array. */
1003 uint64_t journal_file_entry_array_n_items(Object *o) {
1006 if (o->object.type != OBJECT_ENTRY_ARRAY)
1009 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
/* Number of HashItems in the data or field hash table object 'o', derived
 * from the object size minus the offset of the items array. */
1012 uint64_t journal_file_hash_table_n_items(Object *o) {
1015 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1016 o->object.type != OBJECT_FIELD_HASH_TABLE)
1019 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
/* Stores the entry offset 'p' at position *idx of the chain of ENTRY_ARRAY
 * objects that starts at *first, and increments *idx. Both *first and *idx
 * are kept in little-endian form. The chain is walked via
 * next_entry_array_offset; if the slot lies in an existing array it is
 * filled in place. Otherwise a new array object is appended, passed to
 * journal_file_hmac_put_object(), and hooked up either as *first or as the
 * successor of the last array visited ('ap'); n_entry_arrays is incremented
 * if the header contains it.
 * NOTE(review): how the size 'n' of a new array is chosen is not visible
 * here. */
1022 static int link_entry_into_array(JournalFile *f,
1027 uint64_t n = 0, ap = 0, q, i, a, hidx;
1035 a = le64toh(*first);
1036 i = hidx = le64toh(*idx);
1039 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1043 n = journal_file_entry_array_n_items(o);
/* Slot found inside an existing array */
1045 o->entry_array.items[i] = htole64(p);
1046 *idx = htole64(hidx + 1);
1052 a = le64toh(o->entry_array.next_entry_array_offset);
/* Chain exhausted: append a new array with room for n items */
1063 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1064 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1070 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1075 o->entry_array.items[i] = htole64(p);
1078 *first = htole64(q);
1080 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1084 o->entry_array.next_entry_array_offset = htole64(q);
1087 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1088 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1090 *idx = htole64(hidx + 1);
/* Variant of link_entry_into_array() for lists that keep one entry offset
 * inline in '*extra' in addition to the array chain. The offset 'p' is
 * stored either in *extra or, through link_entry_into_array(), at array
 * index *idx - 1. *idx is incremented afterwards.
 * NOTE(review): presumably *extra is used only for the very first entry
 * (*idx == 0) — the condition is not visible here. */
1095 static int link_entry_into_array_plus_one(JournalFile *f,
1110 *extra = htole64(p);
/* The array chain starts counting one entry later than the list itself */
1114 i = htole64(le64toh(*idx) - 1);
1115 r = link_entry_into_array(f, first, &i, p);
1120 *idx = htole64(le64toh(*idx) + 1);
/* Links an entry into the entry list of one of its data objects: reads the
 * data object offset from item 'i' of ENTRY object 'o', maps that DATA
 * object (re-using 'o' as the pointer) and calls
 * link_entry_into_array_plus_one() on its entry_offset and
 * entry_array_offset fields. */
1124 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1131 p = le64toh(o->entry.items[i].object_offset);
1135 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1139 return link_entry_into_array_plus_one(f,
1140 &o->data.entry_offset,
1141 &o->data.entry_array_offset,
/* Makes the ENTRY object 'o' at 'offset' reachable: after a full memory
 * barrier it is added to the global entry array (header fields
 * entry_array_offset and n_entries), the head/tail timestamps in the header
 * are updated from the entry, and every item of the entry is linked to its
 * data object via journal_file_link_entry_item(). */
1146 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1154 if (o->object.type != OBJECT_ENTRY)
1157 __sync_synchronize();
1159 /* Link up the entry itself */
1160 r = link_entry_into_array(f,
1161 &f->header->entry_array_offset,
1162 &f->header->n_entries,
1167 /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
/* The entry fields are already little-endian, so they are copied as they are */
1169 if (f->header->head_entry_realtime == 0)
1170 f->header->head_entry_realtime = o->entry.realtime;
1172 f->header->tail_entry_realtime = o->entry.realtime;
1173 f->header->tail_entry_monotonic = o->entry.monotonic;
1175 f->tail_entry_monotonic_valid = true;
1177 /* Link up the items */
1178 n = journal_file_entry_n_items(o);
1179 for (i = 0; i < n; i++) {
1180 r = journal_file_link_entry_item(f, o, offset, i);
/* Appends an ENTRY object holding 'n_items' already prepared EntryItems.
 * The entry receives the next sequence number, the timestamps from 'ts',
 * the given 'xor_hash' and the boot ID from the header. It is then passed
 * to journal_file_hmac_put_object() and linked with
 * journal_file_link_entry(). */
1188 static int journal_file_append_entry_internal(
1190 const dual_timestamp *ts,
1192 const EntryItem items[], unsigned n_items,
1194 Object **ret, uint64_t *offset) {
1201 assert(items || n_items == 0);
1204 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1206 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1210 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1211 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1212 o->entry.realtime = htole64(ts->realtime);
1213 o->entry.monotonic = htole64(ts->monotonic);
1214 o->entry.xor_hash = htole64(xor_hash);
/* Already stored in on-disk form in the header, copied as it is */
1215 o->entry.boot_id = f->header->boot_id;
1218 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1223 r = journal_file_link_entry(f, o, np);
/* Notifies inotify watchers that the file changed: issues a full memory
 * barrier and then truncates the file to the size recorded in f->last_stat.
 * A failing ftruncate() is only logged. */
1236 void journal_file_post_change(JournalFile *f) {
1239 /* inotify() does not receive IN_MODIFY events from file
1240 * accesses done via mmap(). After each access we hence
1241 * trigger IN_MODIFY by truncating the journal file to its
1242 * current size which triggers IN_MODIFY. */
1244 __sync_synchronize();
1246 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1247 log_error("Failed to truncate file to its own size: %m");
/* qsort() comparator for EntryItems: compares the host-order object_offset
 * of both items.
 * NOTE(review): presumably yields ascending order — the return statements
 * are not visible here. */
1250 static int entry_item_cmp(const void *_a, const void *_b) {
1251 const EntryItem *a = _a, *b = _b;
1253 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1255 if (le64toh(a->object_offset) > le64toh(b->object_offset))
/* Appends one log entry made up of 'n_iovec' fields. Every iovec is stored
 * (or found) as a DATA object; the offsets and hashes of these objects form
 * the entry's item array and the hashes are XORed into the entry's xor_hash.
 * The items are sorted by object offset before the ENTRY object is written
 * with journal_file_append_entry_internal(), and journal_file_post_change()
 * is called afterwards.
 * NOTE(review): the current time is fetched into '_ts' presumably when 'ts'
 * is NULL, and the monotonic comparison below presumably rejects timestamps
 * older than the current tail — neither the condition nor the return value
 * is visible here. */
1260 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1264 uint64_t xor_hash = 0;
1265 struct dual_timestamp _ts;
1268 assert(iovec || n_iovec == 0);
1274 dual_timestamp_get(&_ts);
1278 if (f->tail_entry_monotonic_valid &&
1279 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1283 r = journal_file_maybe_append_tag(f, ts->realtime);
1288 /* alloca() can't take 0, hence let's allocate at least one */
1289 items = alloca(sizeof(EntryItem) * MAX(1, n_iovec));
1291 for (i = 0; i < n_iovec; i++) {
1295 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1299 xor_hash ^= le64toh(o->data.hash);
1300 items[i].object_offset = htole64(p);
/* o->data.hash is already little-endian */
1301 items[i].hash = o->data.hash;
1304 /* Order by the position on disk, in order to improve seek
1305 * times for rotating media. */
1306 qsort(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1308 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1310 journal_file_post_change(f);
/* One cached position inside a chain of entry arrays. Items are stored in
 * f->chain_cache and looked up by the offset of the first array of the
 * chain. */
1315 typedef struct ChainCacheItem {
1316 uint64_t first; /* the array at the beginning of the chain */
1317 uint64_t array; /* the cached array */
1318 uint64_t begin; /* the first item in the cached array */
1319 uint64_t total; /* the total number of items in all arrays before this one in the chain */
/* Records a position in the chain cache 'h' for the array chain that starts
 * at 'first'. A new item is taken from the cache itself (the first item is
 * stolen) once the cache holds CHAIN_CACHE_MAX items, and is allocated
 * otherwise; it is inserted keyed by its own 'first' member.
 * NOTE(review): presumably a new item is only set up when the caller passed
 * no existing 'ci' — that condition is not visible here. */
1322 static void chain_cache_put(
1331 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1332 ci = hashmap_steal_first(h);
1334 ci = new(ChainCacheItem, 1);
1341 if (hashmap_put(h, &ci->first, ci) < 0) {
/* An item handed in by the caller must belong to the same chain */
1346 assert(ci->first == first);
/* Fetches the entry with index 'i' from the chain of entry arrays starting
 * at offset 'first'. A chain cache hit is used when the wanted index lies
 * beyond the cached array's starting total. The chain is then walked via
 * next_entry_array_offset until the array containing the index is reached,
 * the position is cached for the next call, and the referenced ENTRY object
 * is mapped. */
1353 static int generic_array_get(JournalFile *f,
1356 Object **ret, uint64_t *offset) {
1359 uint64_t p = 0, a, t = 0;
1367 /* Try the chain cache first */
1368 ci = hashmap_get(f->chain_cache, &first);
1369 if (ci && i > ci->total) {
1378 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
/* k: capacity of the current array */
1382 k = journal_file_entry_array_n_items(o);
1384 p = le64toh(o->entry_array.items[i]);
1390 a = le64toh(o->entry_array.next_entry_array_offset);
1396 /* Let's cache this item for the next invocation */
1397 chain_cache_put(f->chain_cache, ci, first, a, o->entry_array.items[0], t);
1399 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Like generic_array_get(), for lists that keep one entry offset inline in
 * 'extra' in front of the array chain: the entry is either mapped directly
 * from 'extra' or fetched from the chain at index i - 1.
 * NOTE(review): presumably 'extra' serves index 0 — the condition is not
 * visible here. */
1412 static int generic_array_get_plus_one(JournalFile *f,
1416 Object **ret, uint64_t *offset) {
1425 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1438 return generic_array_get(f, first, i-1, ret, offset);
/* Searches the chain of entry arrays starting at 'first' for 'needle', using
 * the callback 'test_object' to compare the entry at a given offset against
 * the needle (the callback's result is compared with TEST_FOUND, TEST_LEFT
 * and TEST_RIGHT). An exact match is treated as TEST_RIGHT for
 * DIRECTION_DOWN and as TEST_LEFT for DIRECTION_UP. Within one array the
 * range [left, right] is narrowed by probing the middle element; arrays are
 * chained through next_entry_array_offset. The chain cache may be used to
 * skip ahead and is updated before returning. For DIRECTION_UP the result
 * may be stepped back by one entry ('subtract_one'). On success the ENTRY
 * object is mapped and its index is reported as t + i (minus one when
 * stepping back). */
1447 static int generic_array_bisect(JournalFile *f,
1451 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1452 direction_t direction,
1457 uint64_t a, p, t = 0, i = 0, last_p = 0;
1458 bool subtract_one = false;
1459 Object *o, *array = NULL;
1464 assert(test_object);
1466 /* Start with the first array in the chain */
1469 ci = hashmap_get(f->chain_cache, &first);
1470 if (ci && n > ci->total) {
1471 /* Ah, we have iterated this bisection array chain
1472 * previously! Let's see if we can skip ahead in the
1473 * chain, as far as the last time. But we can't jump
1474 * backwards in the chain, so let's check that
1477 r = test_object(f, ci->begin, needle);
1481 if (r == TEST_LEFT) {
1482 /* OK, what we are looking for is right of the
1483 * begin of this EntryArray, so let's jump
1484 * straight to previously cached array in the
1494 uint64_t left, right, k, lp;
1496 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1500 k = journal_file_entry_array_n_items(array);
1506 lp = p = le64toh(array->entry_array.items[i]);
1510 r = test_object(f, p, needle);
1514 if (r == TEST_FOUND)
1515 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1517 if (r == TEST_RIGHT) {
1521 if (left == right) {
1522 if (direction == DIRECTION_UP)
1523 subtract_one = true;
1529 assert(left < right);
1531 i = (left + right) / 2;
1532 p = le64toh(array->entry_array.items[i]);
1536 r = test_object(f, p, needle);
1540 if (r == TEST_FOUND)
1541 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1543 if (r == TEST_RIGHT)
1551 if (direction == DIRECTION_UP) {
1553 subtract_one = true;
1564 a = le64toh(array->entry_array.next_entry_array_offset);
1570 if (subtract_one && t == 0 && i == 0)
1573 /* Let's cache this item for the next invocation */
1574 chain_cache_put(f->chain_cache, ci, first, a, array->entry_array.items[0], t);
1576 if (subtract_one && i == 0)
1578 else if (subtract_one)
1579 p = le64toh(array->entry_array.items[i-1]);
1581 p = le64toh(array->entry_array.items[i]);
1583 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1594 *idx = t + i + (subtract_one ? -1 : 0);
/* Variant of generic_array_bisect() for lists that keep one entry offset
 * inline in 'extra' in front of the array chain. 'extra' is tested first,
 * with an exact match mapped to TEST_RIGHT (DIRECTION_DOWN) or TEST_LEFT
 * (DIRECTION_UP). The remaining n - 1 entries are then searched with
 * generic_array_bisect(). For DIRECTION_UP the inline entry is remembered
 * as a fallback ('step_back') in case the array search finds nothing, and
 * the entry at 'extra' is mapped when it is the result. */
1599 static int generic_array_bisect_plus_one(JournalFile *f,
1604 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1605 direction_t direction,
1611 bool step_back = false;
1615 assert(test_object);
1620 /* This bisects the array in object 'first', but first checks
1622 r = test_object(f, extra, needle);
1626 if (r == TEST_FOUND)
1627 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1629 /* if we are looking with DIRECTION_UP then we need to first
1630 see if in the actual array there is a matching entry, and
1631 return the last one of that. But if there isn't any we need
1632 to return this one. Hence remember this, and return it
1635 step_back = direction == DIRECTION_UP;
1637 if (r == TEST_RIGHT) {
1638 if (direction == DIRECTION_DOWN)
1644 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1646 if (r == 0 && step_back)
1655 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
/* Bisection callback that compares the entry offset 'p' itself against
 * 'needle'; no object is mapped.
 * NOTE(review): the returned TEST_* values are not visible here. */
1671 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1677 else if (p < needle)
/* Seeks within the global entry array (header fields entry_array_offset and
 * n_entries) using generic_array_bisect(). */
1683 int journal_file_move_to_entry_by_offset(
1686 direction_t direction,
1690 return generic_array_bisect(f,
1691 le64toh(f->header->entry_array_offset),
1692 le64toh(f->header->n_entries),
/* Bisection callback that maps the ENTRY object at 'p' and compares its
 * sequence number against 'needle'.
 * NOTE(review): the returned TEST_* values are not visible here. */
1700 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1707 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1711 if (le64toh(o->entry.seqnum) == needle)
1713 else if (le64toh(o->entry.seqnum) < needle)
/* Seeks within the global entry array (header fields entry_array_offset and
 * n_entries) using generic_array_bisect(). */
1719 int journal_file_move_to_entry_by_seqnum(
1722 direction_t direction,
1726 return generic_array_bisect(f,
1727 le64toh(f->header->entry_array_offset),
1728 le64toh(f->header->n_entries),
/* Bisection callback that maps the ENTRY object at 'p' and compares its
 * realtime timestamp against 'needle'.
 * NOTE(review): the returned TEST_* values are not visible here. */
1735 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1742 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1746 if (le64toh(o->entry.realtime) == needle)
1748 else if (le64toh(o->entry.realtime) < needle)
/* Seeks within the global entry array (header fields entry_array_offset and
 * n_entries) by realtime timestamp, using generic_array_bisect() with
 * test_object_realtime() as comparison callback. */
1754 int journal_file_move_to_entry_by_realtime(
1757 direction_t direction,
1761 return generic_array_bisect(f,
1762 le64toh(f->header->entry_array_offset),
1763 le64toh(f->header->n_entries),
1765 test_object_realtime,
/* Bisection callback that maps the ENTRY object at 'p' and compares its
 * monotonic timestamp against 'needle'.
 * NOTE(review): the returned TEST_* values are not visible here. */
1770 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1777 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1781 if (le64toh(o->entry.monotonic) == needle)
1783 else if (le64toh(o->entry.monotonic) < needle)
/* Seeks by monotonic timestamp within one boot. Monotonic timestamps are
 * only searched among the entries of the given boot: the data object
 * "_BOOT_ID=<boot_id>" is looked up and its entry list (inline entry plus
 * array chain) is bisected with test_object_monotonic(). */
1789 int journal_file_move_to_entry_by_monotonic(
1793 direction_t direction,
/* Room for "_BOOT_ID=" (9), a 32 character ID string and the trailing NUL */
1797 char t[9+32+1] = "_BOOT_ID=";
1803 sd_id128_to_string(boot_id, t + 9);
1804 r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1810 return generic_array_bisect_plus_one(f,
1811 le64toh(o->data.entry_offset),
1812 le64toh(o->data.entry_array_offset),
1813 le64toh(o->data.n_entries),
1815 test_object_monotonic,
/* Moves to a neighbouring entry in the global entry array. The starting
 * index is either the first (DIRECTION_DOWN) or the last (DIRECTION_UP)
 * entry of the file, or the index of ENTRY object 'o' at offset 'p' as
 * located by generic_array_bisect(). The entry at the resulting index is
 * fetched with generic_array_get().
 * NOTE(review): presumably the first/last case applies when 'o' is NULL and
 * the located index is advanced by one step in 'direction' — those lines are
 * not visible here. */
1820 int journal_file_next_entry(
1822 Object *o, uint64_t p,
1823 direction_t direction,
1824 Object **ret, uint64_t *offset) {
1830 assert(p > 0 || !o);
1832 n = le64toh(f->header->n_entries);
1837 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1839 if (o->object.type != OBJECT_ENTRY)
1842 r = generic_array_bisect(f,
1843 le64toh(f->header->entry_array_offset),
1844 le64toh(f->header->n_entries),
1853 if (direction == DIRECTION_DOWN) {
1866 /* And jump to it */
1867 return generic_array_get(f,
1868 le64toh(f->header->entry_array_offset),
/* Moves 'skip' entries away from ENTRY object 'o' at offset 'p' in the
 * global entry array. The index of 'o' is located with
 * generic_array_bisect(), the signed distance is applied to it, and the
 * entry at the new index is fetched with generic_array_get().
 * NOTE(review): how the index is limited at either end of the array is not
 * visible here. */
1873 int journal_file_skip_entry(
1875 Object *o, uint64_t p,
1877 Object **ret, uint64_t *offset) {
1886 if (o->object.type != OBJECT_ENTRY)
1889 r = generic_array_bisect(f,
1890 le64toh(f->header->entry_array_offset),
1891 le64toh(f->header->n_entries),
1900 /* Calculate new index */
/* Negative skip: make sure we do not move past index 0 */
1902 if ((uint64_t) -skip >= i)
1905 i = i - (uint64_t) -skip;
1907 i += (uint64_t) skip;
1909 n = le64toh(f->header->n_entries);
1916 return generic_array_get(f,
1917 le64toh(f->header->entry_array_offset),
/* Like journal_file_next_entry(), but restricted to the entries that
 * reference the DATA object at 'data_offset'. The starting index is either
 * the first (DIRECTION_DOWN) or last (DIRECTION_UP) entry of that data
 * object's list, or the index of ENTRY object 'o' as located by
 * generic_array_bisect_plus_one(). The entry at the resulting index is
 * fetched with generic_array_get_plus_one(). */
1922 int journal_file_next_entry_for_data(
1924 Object *o, uint64_t p,
1925 uint64_t data_offset,
1926 direction_t direction,
1927 Object **ret, uint64_t *offset) {
1934 assert(p > 0 || !o);
1936 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1940 n = le64toh(d->data.n_entries);
1945 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1947 if (o->object.type != OBJECT_ENTRY)
1950 r = generic_array_bisect_plus_one(f,
1951 le64toh(d->data.entry_offset),
1952 le64toh(d->data.entry_array_offset),
1953 le64toh(d->data.n_entries),
1963 if (direction == DIRECTION_DOWN) {
1977 return generic_array_get_plus_one(f,
1978 le64toh(d->data.entry_offset),
1979 le64toh(d->data.entry_array_offset),
/* Seeks within the entry list of the DATA object at 'data_offset' (inline
 * entry plus array chain) using generic_array_bisect_plus_one(). */
1984 int journal_file_move_to_entry_by_offset_for_data(
1986 uint64_t data_offset,
1988 direction_t direction,
1989 Object **ret, uint64_t *offset) {
1996 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2000 return generic_array_bisect_plus_one(f,
2001 le64toh(d->data.entry_offset),
2002 le64toh(d->data.entry_array_offset),
2003 le64toh(d->data.n_entries),
/* Seeks by monotonic timestamp within one boot, restricted to entries that
 * reference the DATA object at 'data_offset'. The entry list of the
 * "_BOOT_ID=<boot_id>" data object is first bisected by monotonic time.
 * After that the entry lists of the data object 'd' and of the boot ID data
 * object (re-mapped from its offset 'b') are bisected in turn until an
 * entry is found that is present in both lists. */
2010 int journal_file_move_to_entry_by_monotonic_for_data(
2012 uint64_t data_offset,
2015 direction_t direction,
2016 Object **ret, uint64_t *offset) {
/* Room for "_BOOT_ID=" (9), a 32 character ID string and the trailing NUL */
2018 char t[9+32+1] = "_BOOT_ID=";
2025 /* First, seek by time */
2026 sd_id128_to_string(boot_id, t + 9);
2027 r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
2033 r = generic_array_bisect_plus_one(f,
2034 le64toh(o->data.entry_offset),
2035 le64toh(o->data.entry_array_offset),
2036 le64toh(o->data.n_entries),
2038 test_object_monotonic,
2044 /* And now, continue seeking until we find an entry that
2045 * exists in both bisection arrays */
2051 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2055 r = generic_array_bisect_plus_one(f,
2056 le64toh(d->data.entry_offset),
2057 le64toh(d->data.entry_array_offset),
2058 le64toh(d->data.n_entries),
/* The previous calls may have moved the mapping window: map the boot ID object again */
2066 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2070 r = generic_array_bisect_plus_one(f,
2071 le64toh(o->data.entry_offset),
2072 le64toh(o->data.entry_array_offset),
2073 le64toh(o->data.n_entries),
/* Bisects the entry list of the data object stored at data_offset and returns
 * the result of that bisection. NOTE(review): judging by the function name the
 * search key is a sequence number; the key and test callback arguments are not
 * visible here. */
2097 int journal_file_move_to_entry_by_seqnum_for_data(
2099 uint64_t data_offset,
2101 direction_t direction,
2102 Object **ret, uint64_t *offset) {
/* Load the data object that owns the entry list. */
2109 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
/* Bisect over the data object's single entry offset plus its entry array. */
2113 return generic_array_bisect_plus_one(f,
2114 le64toh(d->data.entry_offset),
2115 le64toh(d->data.entry_array_offset),
2116 le64toh(d->data.n_entries),
/* Bisects the entry list of the data object stored at data_offset using the
 * test_object_realtime callback and returns the result of that bisection. */
2123 int journal_file_move_to_entry_by_realtime_for_data(
2125 uint64_t data_offset,
2127 direction_t direction,
2128 Object **ret, uint64_t *offset) {
/* Load the data object that owns the entry list. */
2135 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
/* Bisect over the data object's single entry offset plus its entry array. */
2139 return generic_array_bisect_plus_one(f,
2140 le64toh(d->data.entry_offset),
2141 le64toh(d->data.entry_array_offset),
2142 le64toh(d->data.n_entries),
2144 test_object_realtime,
/* Debugging aid: prints the file header and then one line per object in the
 * file to stdout, describing the object type (plus a few fields for entry and
 * tag objects) and whether the object is flagged as compressed. */
2149 void journal_file_dump(JournalFile *f) {
2156 journal_file_print_header(f);
/* The walk starts at the offset given by the header_size field. */
2158 p = le64toh(f->header->header_size);
/* -1 as the type argument: the object at p is loaded whatever its type is.
 * NOTE(review): that -1 means "any type" is inferred from this usage, confirm
 * against journal_file_move_to_object(). */
2160 r = journal_file_move_to_object(f, -1, p, &o);
2164 switch (o->object.type) {
2167 printf("Type: OBJECT_UNUSED\n");
2171 printf("Type: OBJECT_DATA\n");
2175 printf("Type: OBJECT_FIELD\n");
2179 printf("Type: OBJECT_ENTRY seqnum=%llu monotonic=%llu realtime=%llu\n",
2180 (unsigned long long) le64toh(o->entry.seqnum),
2181 (unsigned long long) le64toh(o->entry.monotonic),
2182 (unsigned long long) le64toh(o->entry.realtime));
2185 case OBJECT_FIELD_HASH_TABLE:
2186 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2189 case OBJECT_DATA_HASH_TABLE:
2190 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2193 case OBJECT_ENTRY_ARRAY:
2194 printf("Type: OBJECT_ENTRY_ARRAY\n");
2198 printf("Type: OBJECT_TAG seqnum=%llu epoch=%llu\n",
2199 (unsigned long long) le64toh(o->tag.seqnum),
2200 (unsigned long long) le64toh(o->tag.epoch));
2204 printf("Type: unknown (%u)\n", o->object.type);
2208 if (o->object.flags & OBJECT_COMPRESSED)
2209 printf("Flags: COMPRESSED\n");
/* The header's tail_object_offset marks the last object of the file. */
2211 if (p == le64toh(f->header->tail_object_offset))
/* Advance by the object's size, rounded up by ALIGN64(). */
2214 p = p + ALIGN64(le64toh(o->object.size));
2219 log_error("File corrupt");
2222 void journal_file_print_header(JournalFile *f) {
2223 char a[33], b[33], c[33];
2224 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
2226 char bytes[FORMAT_BYTES_MAX];
2230 printf("File Path: %s\n"
2234 "Sequential Number ID: %s\n"
2236 "Compatible Flags:%s%s\n"
2237 "Incompatible Flags:%s%s\n"
2238 "Header size: %llu\n"
2239 "Arena size: %llu\n"
2240 "Data Hash Table Size: %llu\n"
2241 "Field Hash Table Size: %llu\n"
2242 "Rotate Suggested: %s\n"
2243 "Head Sequential Number: %llu\n"
2244 "Tail Sequential Number: %llu\n"
2245 "Head Realtime Timestamp: %s\n"
2246 "Tail Realtime Timestamp: %s\n"
2248 "Entry Objects: %llu\n",
2250 sd_id128_to_string(f->header->file_id, a),
2251 sd_id128_to_string(f->header->machine_id, b),
2252 sd_id128_to_string(f->header->boot_id, c),
2253 sd_id128_to_string(f->header->seqnum_id, c),
2254 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2255 f->header->state == STATE_ONLINE ? "ONLINE" :
2256 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
2257 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2258 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2259 JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2260 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2261 (unsigned long long) le64toh(f->header->header_size),
2262 (unsigned long long) le64toh(f->header->arena_size),
2263 (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2264 (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2265 yes_no(journal_file_rotate_suggested(f, 0)),
2266 (unsigned long long) le64toh(f->header->head_entry_seqnum),
2267 (unsigned long long) le64toh(f->header->tail_entry_seqnum),
2268 format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2269 format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2270 (unsigned long long) le64toh(f->header->n_objects),
2271 (unsigned long long) le64toh(f->header->n_entries));
2273 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2274 printf("Data Objects: %llu\n"
2275 "Data Hash Table Fill: %.1f%%\n",
2276 (unsigned long long) le64toh(f->header->n_data),
2277 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2279 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2280 printf("Field Objects: %llu\n"
2281 "Field Hash Table Fill: %.1f%%\n",
2282 (unsigned long long) le64toh(f->header->n_fields),
2283 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2285 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2286 printf("Tag Objects: %llu\n",
2287 (unsigned long long) le64toh(f->header->n_tags));
2288 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2289 printf("Entry Array Objects: %llu\n",
2290 (unsigned long long) le64toh(f->header->n_entry_arrays));
2292 if (fstat(f->fd, &st) >= 0)
2293 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
/* Opens the journal file fname, setting up a new JournalFile object for it.
 * A writable file that is found empty gets a fresh header and hash tables;
 * an existing file has its header verified. metrics, mmap_cache and template
 * are optional inputs used as described on the lines below. NOTE(review): the
 * object is presumably handed back via ret; that assignment is not visible
 * here. */
2296 int journal_file_open(
2302 JournalMetrics *metrics,
2303 MMapCache *mmap_cache,
2304 JournalFile *template,
2305 JournalFile **ret) {
2309 bool newly_created = false;
/* Check for access modes other than read-only and read-write. */
2314 if ((flags & O_ACCMODE) != O_RDONLY &&
2315 (flags & O_ACCMODE) != O_RDWR)
/* Check for names that end in neither ".journal" nor ".journal~". */
2318 if (!endswith(fname, ".journal") &&
2319 !endswith(fname, ".journal~"))
2322 f = new0(JournalFile, 1);
/* Derive the mmap protection and the writable flag from the open flags. */
2330 f->prot = prot_from_flags(flags);
2331 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2333 f->compress = compress;
/* Two ways to obtain the mmap cache: take a reference on the one passed in,
 * or allocate a new one. */
2340 f->mmap = mmap_cache_ref(mmap_cache);
2342 f->mmap = mmap_cache_new();
2349 f->path = strdup(fname);
2355 f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2356 if (!f->chain_cache) {
/* O_CLOEXEC is always added to the flags stored in f->flags. */
2361 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2367 if (fstat(f->fd, &f->last_stat) < 0) {
/* A zero-sized file opened for writing is initialized as a new journal. */
2372 if (f->last_stat.st_size == 0 && f->writable) {
2376 /* Let's attach the creation time to the journal file,
2377 * so that the vacuuming code knows the age of this
2378 * file even if the file might end up corrupted one
2379 * day... Ideally we'd just use the creation time many
2380 * file systems maintain for each file, but there is
2381 * currently no usable API to query this, hence let's
2382 * emulate this via extended attributes. If extended
2383 * attributes are not supported we'll just skip this,
2384 * and rely solely on mtime/atime/ctime of the file.*/
/* The timestamp is stored little-endian; the fsetxattr() result is ignored. */
2386 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2387 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2391 /* Try to load the FSPRG state, and if we can't, then
2392 * just don't do sealing */
2394 r = journal_file_fss_load(f);
/* Initialize the header, passing the template file along. */
2400 r = journal_file_init_header(f, template);
/* Refresh the cached stat data after the header initialization. */
2404 if (fstat(f->fd, &f->last_stat) < 0) {
2409 newly_created = true;
/* Check for files smaller than the minimum header size. */
2412 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
/* Map the header, page-aligned, shared with the file. */
2417 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2418 if (f->header == MAP_FAILED) {
/* Only headers that were not just written by us are verified. */
2424 if (!newly_created) {
2425 r = journal_file_verify_header(f);
/* Pre-existing writable file: load the sealing state here (a newly created
 * file had it loaded above). */
2431 if (!newly_created && f->writable) {
2432 r = journal_file_fss_load(f);
/* Metrics come from the caller's metrics, run through
 * journal_default_metrics() first, or else from the template file. */
2440 journal_default_metrics(metrics, f->fd);
2441 f->metrics = *metrics;
2442 } else if (template)
2443 f->metrics = template->metrics;
2445 r = journal_file_refresh_header(f);
2451 r = journal_file_hmac_setup(f);
/* A new file gets its field and data hash tables and the first tag. */
2456 if (newly_created) {
2457 r = journal_file_setup_field_hash_table(f);
2461 r = journal_file_setup_data_hash_table(f);
2466 r = journal_file_append_first_tag(f);
/* Map both hash tables. */
2472 r = journal_file_map_field_hash_table(f);
2476 r = journal_file_map_data_hash_table(f);
/* NOTE(review): presumably the common failure path; the label and return
 * statement are not visible here. */
2484 journal_file_close(f);
/* Rotates the journal file *f: the current file is renamed to an archive name
 * built from its seqnum ID, head sequence number and head realtime timestamp,
 * marked as archived and closed, and a new file is opened at the original
 * path using the old file as template. NOTE(review): *f is presumably updated
 * to the new file; that assignment is not visible here. */
2489 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2492 JournalFile *old_file, *new_file = NULL;
/* Check that the file was opened for writing. */
2500 if (!old_file->writable)
/* Check that the path ends in ".journal". */
2503 if (!endswith(old_file->path, ".journal"))
2506 l = strlen(old_file->path);
/* The 8 byte ".journal" suffix is cut off below and replaced by one separator
 * byte, the 32 character ID and "-%016llx-%016llx.journal" (1+16+1+16+8
 * bytes), followed by the terminating NUL. That adds up to
 * l + 1 + 32 + 1 + 16 + 1 + 16 + 1 bytes. */
2508 p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
2512 memcpy(p, old_file->path, l - 8);
/* The ID string starts one byte after the copied prefix. */
2514 sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2515 snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2516 "-%016llx-%016llx.journal",
2517 (unsigned long long) le64toh((*f)->header->head_entry_seqnum),
2518 (unsigned long long) le64toh((*f)->header->head_entry_realtime));
2520 r = rename(old_file->path, p);
/* Mark the renamed file as archived in its header. */
2526 old_file->header->state = STATE_ARCHIVED;
/* Open a new file at the old path, reusing flags, mode and mmap cache of the
 * old file and passing the old file as template. */
2528 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2529 journal_file_close(old_file);
/* Wrapper around journal_file_open(): if opening fails with one of a fixed
 * set of error codes, the file is renamed to a ".journal~" backup name and
 * the open is attempted a second time. */
2535 int journal_file_open_reliably(
2541 JournalMetrics *metrics,
2542 MMapCache *mmap_cache,
2543 JournalFile *template,
2544 JournalFile **ret) {
2550 r = journal_file_open(fname, flags, mode, compress, seal,
2551 metrics, mmap_cache, template, ret);
/* Only the error codes listed here lead to the rename-and-retry logic below. */
2552 if (r != -EBADMSG && /* corrupted */
2553 r != -ENODATA && /* truncated */
2554 r != -EHOSTDOWN && /* other machine */
2555 r != -EPROTONOSUPPORT && /* incompatible feature */
2556 r != -EBUSY && /* unclean shutdown */
2557 r != -ESHUTDOWN /* already archived */)
/* Further checks before replacing: read-only access mode, missing O_CREAT,
 * and a name that does not end in ".journal". */
2560 if ((flags & O_ACCMODE) == O_RDONLY)
2563 if (!(flags & O_CREAT))
2566 if (!endswith(fname, ".journal"))
2569 /* The file is corrupted. Rotate it away and try it again (but only once) */
/* The backup name carries the current realtime clock value and ends in
 * ".journal~". */
2572 if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2574 (unsigned long long) now(CLOCK_REALTIME),
2578 r = rename(fname, p);
2583 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
/* Second attempt, with the same arguments as the first. */
2585 return journal_file_open(fname, flags, mode, compress, seal,
2586 metrics, mmap_cache, template, ret);
/* Copies the entry object o, located at offset p in file "from", into file
 * "to": each data object referenced by the entry is appended to "to" and a
 * new entry referencing the appended data is written. seqnum, ret and offset
 * are passed through to journal_file_append_entry_internal(). */
2590 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2592 uint64_t q, xor_hash = 0;
/* The copy keeps the timestamps of the source entry. */
2605 ts.monotonic = le64toh(o->entry.monotonic);
2606 ts.realtime = le64toh(o->entry.realtime);
/* Compare against the monotonic timestamp of the destination's tail entry,
 * if that value is marked valid. */
2608 if (to->tail_entry_monotonic_valid &&
2609 ts.monotonic < le64toh(to->header->tail_entry_monotonic))
/* The item array for the new entry is allocated on the stack. */
2612 n = journal_file_entry_n_items(o);
2613 items = alloca(sizeof(EntryItem) * n);
2615 for (i = 0; i < n; i++) {
/* Offset and hash of the i-th referenced data object, as stored in the
 * entry; the hash is kept in its on-disk form for the comparison below. */
2622 q = le64toh(o->entry.items[i].object_offset);
2623 le_hash = o->entry.items[i].hash;
/* From here on o points to the data object, no longer to the entry. */
2625 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
/* The hash stored in the entry item is compared with the data object's. */
2629 if (le_hash != o->data.hash)
/* Payload length: object size minus the offset of the payload field. */
2632 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2635 /* We hit the limit on 32bit machines */
2636 if ((uint64_t) t != l)
/* Compressed payloads are uncompressed into the source file's buffer. */
2639 if (o->object.flags & OBJECT_COMPRESSED) {
2643 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2646 data = from->compress_buffer;
2649 return -EPROTONOSUPPORT;
2652 data = o->data.payload;
/* Append the payload to the destination; u is the resulting data object and
 * h its offset. */
2654 r = journal_file_append_data(to, data, l, &u, &h);
/* Accumulate the XOR of the data hashes and record the new item. */
2658 xor_hash ^= le64toh(u->data.hash);
2659 items[i].object_offset = htole64(h);
2660 items[i].hash = u->data.hash;
/* Point o back at the source entry, since it was redirected to a data object
 * above. */
2662 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2667 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
/* Fills in and sanitizes the size limits in *m. Fields set to (uint64_t) -1
 * are treated as unset and get a default, partly derived from the size of the
 * file system that fd lives on. The resulting values are logged at debug
 * level. */
2670 void journal_default_metrics(JournalMetrics *m, int fd) {
/* fs_size stays 0 if fstatvfs() fails. */
2671 uint64_t fs_size = 0;
2673 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
/* File system size in bytes: fragment size times number of blocks. */
2678 if (fstatvfs(fd, &ss) >= 0)
2679 fs_size = ss.f_frsize * ss.f_blocks;
/* max_use: derived from the file system size and bounded by
 * DEFAULT_MAX_USE_LOWER and DEFAULT_MAX_USE_UPPER, or set to
 * DEFAULT_MAX_USE_LOWER. */
2681 if (m->max_use == (uint64_t) -1) {
2684 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2686 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2687 m->max_use = DEFAULT_MAX_USE_UPPER;
2689 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2690 m->max_use = DEFAULT_MAX_USE_LOWER;
2692 m->max_use = DEFAULT_MAX_USE_LOWER;
/* Page-align the value and keep it at no less than twice the minimum file
 * size. */
2694 m->max_use = PAGE_ALIGN(m->max_use);
2696 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2697 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
/* max_size: an eighth of max_use, capped by DEFAULT_MAX_SIZE_UPPER. */
2700 if (m->max_size == (uint64_t) -1) {
2701 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2703 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2704 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2706 m->max_size = PAGE_ALIGN(m->max_size);
2708 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2709 m->max_size = JOURNAL_FILE_SIZE_MIN;
/* max_use is raised so that it is at least twice max_size. */
2711 if (m->max_size*2 > m->max_use)
2712 m->max_use = m->max_size*2;
/* min_size: defaults to the minimum journal file size. */
2714 if (m->min_size == (uint64_t) -1)
2715 m->min_size = JOURNAL_FILE_SIZE_MIN;
2717 m->min_size = PAGE_ALIGN(m->min_size);
2719 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2720 m->min_size = JOURNAL_FILE_SIZE_MIN;
/* max_size is raised so that it is never below min_size. */
2722 if (m->min_size > m->max_size)
2723 m->max_size = m->min_size;
/* keep_free: derived from the file system size and capped by
 * DEFAULT_KEEP_FREE_UPPER, or set to DEFAULT_KEEP_FREE. */
2726 if (m->keep_free == (uint64_t) -1) {
2729 m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2731 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2732 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2735 m->keep_free = DEFAULT_KEEP_FREE;
2738 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2739 format_bytes(a, sizeof(a), m->max_use),
2740 format_bytes(b, sizeof(b), m->max_size),
2741 format_bytes(c, sizeof(c), m->min_size),
2742 format_bytes(d, sizeof(d), m->keep_free));
/* Reports the realtime timestamps of the first and last entry of the file, as
 * recorded in the header fields head_entry_realtime and tail_entry_realtime,
 * via *from and *to. */
2745 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
/* A head timestamp of 0 is checked for before *from is assigned. */
2750 if (f->header->head_entry_realtime == 0)
2753 *from = le64toh(f->header->head_entry_realtime);
/* Same check for the tail timestamp before *to is assigned. */
2757 if (f->header->tail_entry_realtime == 0)
2760 *to = le64toh(f->header->tail_entry_realtime);
/* Reports the monotonic timestamps of the first and last entry belonging to
 * the boot boot_id via *from and *to. The entries are found through the entry
 * list of the "_BOOT_ID=..." data object of that boot. */
2766 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
/* Room for the 9 byte "_BOOT_ID=" prefix, 32 further characters written by
 * sd_id128_to_string() below, and the terminating NUL. */
2767 char t[9+32+1] = "_BOOT_ID=";
2775 sd_id128_to_string(boot_id, t + 9);
/* Locate the data object for this boot ID; p receives its offset. */
2777 r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
/* Check for a data object that no entry references. */
2781 if (le64toh(o->data.n_entries) <= 0)
/* First entry of the boot: the entry at the data object's entry_offset.
 * From here on o points to that entry, no longer to the data object. */
2785 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2789 *from = le64toh(o->entry.monotonic);
/* Load the data object again via its offset p. */
2793 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* Last entry of the boot: index n_entries - 1 of the data object's entry
 * list. */
2797 r = generic_array_get_plus_one(f,
2798 le64toh(o->data.entry_offset),
2799 le64toh(o->data.entry_array_offset),
2800 le64toh(o->data.n_entries)-1,
2805 *to = le64toh(o->entry.monotonic);
2811 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2814 /* If we gained new header fields we gained new features,
2815 * hence suggest a rotation */
2816 if (le64toh(f->header->header_size) < sizeof(Header)) {
2817 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2821 /* Let's check if the hash tables grew over a certain fill
2822 * level (75%, borrowing this value from Java's hash table
2823 * implementation), and if so suggest a rotation. To calculate
2824 * the fill level we need the n_data field, which only exists
2825 * in newer versions. */
2827 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2828 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2829 log_debug("Data hash table of %s has a fill level at %.1f (%llu of %llu items, %llu file size, %llu bytes per hash table item), suggesting rotation.",
2831 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2832 (unsigned long long) le64toh(f->header->n_data),
2833 (unsigned long long) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)),
2834 (unsigned long long) (f->last_stat.st_size),
2835 (unsigned long long) (f->last_stat.st_size / le64toh(f->header->n_data)));
2839 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2840 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2841 log_debug("Field hash table of %s has a fill level at %.1f (%llu of %llu items), suggesting rotation.",
2843 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2844 (unsigned long long) le64toh(f->header->n_fields),
2845 (unsigned long long) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)));
2849 /* Are the data objects properly indexed by field objects? */
2850 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2851 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2852 le64toh(f->header->n_data) > 0 &&
2853 le64toh(f->header->n_fields) == 0)
2856 if (max_file_usec > 0) {
2859 h = le64toh(f->header->head_entry_realtime);
2860 t = now(CLOCK_REALTIME);
2862 if (h > 0 && t > h + max_file_usec)