1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
31 #include <attr/xattr.h>
34 #include "journal-def.h"
35 #include "journal-file.h"
36 #include "journal-authenticate.h"
41 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
42 #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
44 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
46 /* This is the minimum journal file size */
47 #define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL) /* 64 KiB */
49 /* These are the lower and upper bounds if we deduce the max_use value
50 * from the file system size */
51 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
52 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
54 /* This is the upper bound if we deduce max_size from max_use */
55 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
57 /* This is the upper bound if we deduce the keep_free value from the file system size */
59 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
61 /* This is the keep_free value when we can't determine the file system size */
63 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
65 /* n_data was the first entry we added after the initial file format design */
66 #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
/* Shuts down a JournalFile. Statements visible here: append a final tag when
 * sealing is enabled on a writable file, hand the fd to mmap_cache_close_fd(),
 * flip a STATE_ONLINE header to STATE_OFFLINE for writable files, unmap the
 * header, close the fd, and release the mmap cache reference, the compression
 * buffer, the fss_file mapping or fsprg_state, and the HMAC handle.
 * NOTE(review): this listing has gaps (some guards, the statement guarded by
 * the second fd check, and the closing brace are not shown). */
68 void journal_file_close(JournalFile *f) {
72 /* Write the final tag */
73 if (f->seal && f->writable)
74 journal_file_append_tag(f);
77 /* Sync everything to disk, before we mark the file offline */
78 if (f->mmap && f->fd >= 0)
79 mmap_cache_close_fd(f->mmap, f->fd);
81 if (f->writable && f->fd >= 0)
85 /* Mark the file offline. Don't override the archived state if it already is set */
86 if (f->writable && f->header->state == STATE_ONLINE)
87 f->header->state = STATE_OFFLINE;
89 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
93 close_nointr_nofail(f->fd);
98 mmap_cache_unref(f->mmap);
101 free(f->compress_buffer);
106 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
107 else if (f->fsprg_state)
108 free(f->fsprg_state);
113 gcry_md_close(f->hmac);
/* Prepares a new header in a local Header struct h and writes it to offset 0
 * of f->fd via pwrite(). Visible fields: the 8 byte signature, the
 * ALIGN64()-rounded struct size as header_size, the COMPRESSED incompatible
 * flag (from f->compress), the SEALED compatible flag (from f->seal) and a
 * randomized file_id. On one path seqnum_id and tail_entry_seqnum are copied
 * from the template header; on the other seqnum_id is set to the new file_id.
 * NOTE(review): branch conditions and error handling are not shown here. */
119 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
127 memcpy(h.signature, HEADER_SIGNATURE, 8);
128 h.header_size = htole64(ALIGN64(sizeof(h)));
130 h.incompatible_flags =
131 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
134 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
136 r = sd_id128_randomize(&h.file_id);
141 h.seqnum_id = template->header->seqnum_id;
142 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
144 h.seqnum_id = h.file_id;
146 k = pwrite(f->fd, &h, sizeof(h), 0);
/* Updates the mapped header in place: stores the machine ID obtained from
 * sd_id128_get_machine(), fetches the current boot ID, marks the tail
 * monotonic timestamp as valid when the header already carries that boot ID,
 * records the boot ID, sets the state to STATE_ONLINE and msync()s the header
 * page with MS_SYNC. */
156 static int journal_file_refresh_header(JournalFile *f) {
162 r = sd_id128_get_machine(&f->header->machine_id);
166 r = sd_id128_get_boot(&boot_id);
170 if (sd_id128_equal(boot_id, f->header->boot_id))
171 f->tail_entry_monotonic_valid = true;
173 f->header->boot_id = boot_id;
175 f->header->state = STATE_ONLINE;
177 /* Sync the online state to disk */
178 msync(f->header, PAGE_ALIGN(sizeof(Header)), MS_SYNC);
/* Sanity-checks the mapped header. Visible checks: signature match, unknown
 * incompatible flags, unknown compatible flags, state below _STATE_MAX,
 * header_size at least HEADER_SIZE_MIN, sealed files must contain
 * n_entry_arrays, header plus arena must fit in the size from f->last_stat,
 * tail_object_offset must lie within header plus arena, and the hash table,
 * tail object and entry array offsets must pass VALID64() and must not be
 * smaller than header_size. Further visible steps compare the machine ID,
 * log about ONLINE or unknown states, and copy the compressed and sealed
 * header flags into f->compress and f->seal.
 * NOTE(review): the two pairs of flag checks look like alternatives selected
 * by preprocessor conditionals that this listing omits -- confirm. */
184 static int journal_file_verify_header(JournalFile *f) {
187 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
190 /* In both read and write mode we refuse to open files with
191 * incompatible flags we don't know */
193 if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
194 return -EPROTONOSUPPORT;
196 if (f->header->incompatible_flags != 0)
197 return -EPROTONOSUPPORT;
200 /* When open for writing we refuse to open files with
201 * compatible flags, too */
204 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
205 return -EPROTONOSUPPORT;
207 if (f->header->compatible_flags != 0)
208 return -EPROTONOSUPPORT;
212 if (f->header->state >= _STATE_MAX)
215 /* The first addition was n_data, so check that we are at least this large */
216 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
219 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
222 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
225 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
228 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
229 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
230 !VALID64(le64toh(f->header->tail_object_offset)) ||
231 !VALID64(le64toh(f->header->entry_array_offset)))
234 if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
235 le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
236 le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
237 le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
242 sd_id128_t machine_id;
245 r = sd_id128_get_machine(&machine_id);
249 if (!sd_id128_equal(machine_id, f->header->machine_id))
252 state = f->header->state;
254 if (state == STATE_ONLINE) {
255 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
257 } else if (state == STATE_ARCHIVED)
259 else if (state != STATE_OFFLINE) {
260 log_debug("Journal file %s has unknown state %u.", f->path, state);
265 f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
267 f->seal = JOURNAL_HEADER_SEALED(f->header);
/* Grows the file so that size bytes starting at offset are backed by
 * allocated storage. The target size is PAGE_ALIGN(offset + size), but never
 * less than header_size; nothing needs to grow when that is not larger than
 * the current header plus arena size. Visible policy checks compare the new
 * size against f->metrics.max_size and, above f->metrics.min_size, compare
 * the growth against the space reported by fstatvfs() minus
 * f->metrics.keep_free. The extension itself uses posix_fallocate(), after
 * which f->last_stat is refreshed and arena_size is set to new_size minus
 * header_size.
 * NOTE(review): two comments below are cut off in this listing and the
 * return statements are not shown. */
272 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
273 uint64_t old_size, new_size;
278 /* We assume that this file is not sparse, and we know that
279 * for sure, since we always call posix_fallocate()
283 le64toh(f->header->header_size) +
284 le64toh(f->header->arena_size);
286 new_size = PAGE_ALIGN(offset + size);
287 if (new_size < le64toh(f->header->header_size))
288 new_size = le64toh(f->header->header_size);
290 if (new_size <= old_size)
293 if (f->metrics.max_size > 0 &&
294 new_size > f->metrics.max_size)
297 if (new_size > f->metrics.min_size &&
298 f->metrics.keep_free > 0) {
301 if (fstatvfs(f->fd, &svfs) >= 0) {
304 available = svfs.f_bfree * svfs.f_bsize;
306 if (available >= f->metrics.keep_free)
307 available -= f->metrics.keep_free;
311 if (new_size - old_size > available)
316 /* Note that the glibc fallocate() fallback is very
317 inefficient, hence we try to minimize the allocation area
319 r = posix_fallocate(f->fd, old_size, new_size - old_size);
323 if (fstat(f->fd, &f->last_stat) < 0)
326 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
/* Obtains, via ret, a mapping of size bytes at offset from mmap_cache_get()
 * for the given context. Before mapping, the end of the range is compared
 * with the cached f->last_stat.st_size; if it lies beyond, fstat() is
 * repeated once and -EADDRNOTAVAIL is returned when the refresh fails or the
 * range is still out of bounds.
 * NOTE(review): offset + size is computed in uint64_t without a wrap-around
 * check, so a very large size could wrap and pass the bounds test. */
331 static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
338 /* Avoid SIGBUS on invalid accesses */
339 if (offset + size > (uint64_t) f->last_stat.st_size) {
340 /* Hmm, out of range? Let's refresh the fstat() data
341 * first, before we trust that check. */
343 if (fstat(f->fd, &f->last_stat) < 0 ||
344 offset + size > (uint64_t) f->last_stat.st_size)
345 return -EADDRNOTAVAIL;
348 return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
/* Returns the smallest acceptable object size for the type stored in o, using
 * a per-type table of struct sizes. Types beyond the end of the table, or
 * with a zero table entry, fall back to sizeof(ObjectHeader). */
351 static uint64_t minimum_header_size(Object *o) {
353 static uint64_t table[] = {
354 [OBJECT_DATA] = sizeof(DataObject),
355 [OBJECT_FIELD] = sizeof(FieldObject),
356 [OBJECT_ENTRY] = sizeof(EntryObject),
357 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
358 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
359 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
360 [OBJECT_TAG] = sizeof(TagObject),
363 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
364 return sizeof(ObjectHeader);
366 return table[o->object.type];
/* Maps and validates the object at offset. Visible steps: test the offset
 * with VALID64(), map just the ObjectHeader first (using the requested type
 * as mmap context, or 0 for out-of-range types), read the object size, check
 * it against sizeof(ObjectHeader) and minimum_header_size(), check for types
 * at or below OBJECT_UNUSED, check for a type mismatch when a positive type
 * was requested, and map again with the full size when the object is larger
 * than its header.
 * NOTE(review): the error returns for the failed checks are not shown. */
369 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
379 /* Objects may only be located at multiple of 64 bit */
380 if (!VALID64(offset))
383 /* One context for each type, plus one catch-all for the rest */
384 context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
386 r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
391 s = le64toh(o->object.size);
393 if (s < sizeof(ObjectHeader))
396 if (o->object.type <= OBJECT_UNUSED)
399 if (s < minimum_header_size(o))
402 if (type > 0 && o->object.type != type)
405 if (s > sizeof(ObjectHeader)) {
406 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
/* Computes the sequence number for the next entry as tail_entry_seqnum + 1
 * and stores it back into the header as the new tail; if head_entry_seqnum in
 * the header is still zero, the same value becomes the head. The comment
 * below describes merging with an optional external counter passed in via
 * seqnum; that code is not part of this listing. */
417 static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
422 r = le64toh(f->header->tail_entry_seqnum) + 1;
425 /* If an external seqnum counter was passed, we update
426 * both the local and the external one, and set it to
427 * the maximum of both */
435 f->header->tail_entry_seqnum = htole64(r);
437 if (f->header->head_entry_seqnum == 0)
438 f->header->head_entry_seqnum = htole64(r);
/* Appends a new object of the given type and size behind the current tail
 * object. The position starts from tail_object_offset (header_size is used on
 * a branch whose condition is not shown); on the other path the tail object
 * is mapped and the position is advanced by its ALIGN64()-rounded size.
 * Space is then ensured with journal_file_allocate(), the region is mapped
 * with journal_file_move_to(), the object type and size are written, and
 * tail_object_offset and n_objects in the header are updated.
 * type must be a valid object type and size at least sizeof(ObjectHeader),
 * as asserted below. */
443 int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
450 assert(type > 0 && type < _OBJECT_TYPE_MAX);
451 assert(size >= sizeof(ObjectHeader));
455 p = le64toh(f->header->tail_object_offset);
457 p = le64toh(f->header->header_size);
459 r = journal_file_move_to_object(f, -1, p, &tail);
463 p += ALIGN64(le64toh(tail->object.size));
466 r = journal_file_allocate(f, p, size);
470 r = journal_file_move_to(f, type, false, p, size, &t);
477 o->object.type = type;
478 o->object.size = htole64(size);
480 f->header->tail_object_offset = htole64(p);
481 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
/* Creates the data hash table object. The table size s (in bytes) is derived
 * from f->metrics.max_size as explained below, with
 * DEFAULT_DATA_HASH_TABLE_SIZE as lower bound. The items are zeroed, and the
 * header records the offset of the items array and the table size. */
489 static int journal_file_setup_data_hash_table(JournalFile *f) {
496 /* We estimate that we need 1 hash table entry per 768 bytes of
497 journal file and we want to make sure we never get beyond
498 75% fill level. Calculate the hash table size for the
499 maximum file size based on these metrics. */
501 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
502 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
503 s = DEFAULT_DATA_HASH_TABLE_SIZE;
505 log_debug("Reserving %llu entries in hash table.", (unsigned long long) (s / sizeof(HashItem)));
507 r = journal_file_append_object(f,
508 OBJECT_DATA_HASH_TABLE,
509 offsetof(Object, hash_table.items) + s,
514 memset(o->hash_table.items, 0, s);
516 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
517 f->header->data_hash_table_size = htole64(s);
/* Creates the field hash table object with the fixed size
 * DEFAULT_FIELD_HASH_TABLE_SIZE, zeroes its items, and records the offset of
 * the items array and the table size in the header. */
522 static int journal_file_setup_field_hash_table(JournalFile *f) {
529 /* We use a fixed size hash table for the fields as this
530 * number should grow very slowly only */
532 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
533 r = journal_file_append_object(f,
534 OBJECT_FIELD_HASH_TABLE,
535 offsetof(Object, hash_table.items) + s,
540 memset(o->hash_table.items, 0, s);
542 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
543 f->header->field_hash_table_size = htole64(s);
/* Maps the data hash table described by data_hash_table_offset and
 * data_hash_table_size in the header, using the OBJECT_DATA_HASH_TABLE mmap
 * context, and caches the resulting pointer in f->data_hash_table. */
548 static int journal_file_map_data_hash_table(JournalFile *f) {
555 p = le64toh(f->header->data_hash_table_offset);
556 s = le64toh(f->header->data_hash_table_size);
558 r = journal_file_move_to(f,
559 OBJECT_DATA_HASH_TABLE,
566 f->data_hash_table = t;
/* Maps the field hash table described by field_hash_table_offset and
 * field_hash_table_size in the header, using the OBJECT_FIELD_HASH_TABLE mmap
 * context, and caches the resulting pointer in f->field_hash_table. */
570 static int journal_file_map_field_hash_table(JournalFile *f) {
577 p = le64toh(f->header->field_hash_table_offset);
578 s = le64toh(f->header->field_hash_table_size);
580 r = journal_file_move_to(f,
581 OBJECT_FIELD_HASH_TABLE,
588 f->field_hash_table = t;
/* Hooks a field object into the field hash table. The bucket is hash modulo
 * the number of HashItem slots. The chain pointers of the object are reset;
 * then, depending on a condition not shown here, either the bucket head is
 * set to offset, or the object at the previous bucket tail is mapped and its
 * next_hash_offset is pointed at offset. The bucket tail is set to offset and
 * n_fields is incremented when the header contains that field. */
592 static int journal_file_link_field(
605 if (o->object.type != OBJECT_FIELD)
608 /* This might alter the window we are looking at */
610 o->field.next_hash_offset = o->field.head_data_offset = 0;
612 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
613 p = le64toh(f->field_hash_table[h].tail_hash_offset);
615 f->field_hash_table[h].head_hash_offset = htole64(offset);
617 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
621 o->field.next_hash_offset = htole64(offset);
624 f->field_hash_table[h].tail_hash_offset = htole64(offset);
626 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
627 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
/* Hooks a data object into the data hash table. The bucket is hash modulo
 * the number of HashItem slots. The chain pointers, entry pointers and entry
 * count of the object are reset; then, depending on a condition not shown
 * here, either the bucket head is set to offset, or the object at the
 * previous bucket tail is mapped and its next_hash_offset is pointed at
 * offset. The bucket tail is set to offset and n_data is incremented when the
 * header contains that field.
 * NOTE(review): one comment below is cut off in this listing. */
632 static int journal_file_link_data(
645 if (o->object.type != OBJECT_DATA)
648 /* This might alter the window we are looking at */
650 o->data.next_hash_offset = o->data.next_field_offset = 0;
651 o->data.entry_offset = o->data.entry_array_offset = 0;
652 o->data.n_entries = 0;
654 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
655 p = le64toh(f->data_hash_table[h].tail_hash_offset);
657 /* Only entry in the hash table is easy */
658 f->data_hash_table[h].head_hash_offset = htole64(offset);
660 /* Move back to the previous data object, to patch in
663 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
667 o->data.next_hash_offset = htole64(offset);
670 f->data_hash_table[h].tail_hash_offset = htole64(offset);
672 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
673 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
/* Searches the field hash table for a field object matching the given name.
 * A zero field_hash_table_size is special-cased first. Otherwise the bucket
 * for hash is walked from head_hash_offset along next_hash_offset, and an
 * object matches when its stored hash, its object size (payload offset plus
 * size) and its payload bytes all agree with the request.
 * field must be non-NULL and size positive, as asserted below. */
678 int journal_file_find_field_object_with_hash(
680 const void *field, uint64_t size, uint64_t hash,
681 Object **ret, uint64_t *offset) {
683 uint64_t p, osize, h;
687 assert(field && size > 0);
689 osize = offsetof(Object, field.payload) + size;
691 if (f->header->field_hash_table_size == 0)
694 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
695 p = le64toh(f->field_hash_table[h].head_hash_offset);
700 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
704 if (le64toh(o->field.hash) == hash &&
705 le64toh(o->object.size) == osize &&
706 memcmp(o->field.payload, field, size) == 0) {
716 p = le64toh(o->field.next_hash_offset);
/* Convenience wrapper: hashes the field name with hash64() and defers to
 * journal_file_find_field_object_with_hash(). */
722 int journal_file_find_field_object(
724 const void *field, uint64_t size,
725 Object **ret, uint64_t *offset) {
730 assert(field && size > 0);
732 hash = hash64(field, size);
734 return journal_file_find_field_object_with_hash(f,
/* Searches the data hash table for a data object with identical content. A
 * zero data_hash_table_size is special-cased first. Otherwise the bucket for
 * hash is walked from head_hash_offset along next_hash_offset. The stored
 * hash of each object is compared with hash first. For objects flagged
 * OBJECT_COMPRESSED the payload is decompressed into f->compress_buffer with
 * uncompress_blob() and compared there; the -EPROTONOSUPPORT return is
 * presumably the fallback when compression support is compiled out (TODO
 * confirm, the preprocessor lines are not shown). Uncompressed objects match
 * when object size and payload bytes agree. */
739 int journal_file_find_data_object_with_hash(
741 const void *data, uint64_t size, uint64_t hash,
742 Object **ret, uint64_t *offset) {
744 uint64_t p, osize, h;
748 assert(data || size == 0);
750 osize = offsetof(Object, data.payload) + size;
752 if (f->header->data_hash_table_size == 0)
755 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
756 p = le64toh(f->data_hash_table[h].head_hash_offset);
761 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
765 if (le64toh(o->data.hash) != hash)
768 if (o->object.flags & OBJECT_COMPRESSED) {
772 l = le64toh(o->object.size);
773 if (l <= offsetof(Object, data.payload))
776 l -= offsetof(Object, data.payload);
778 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
782 memcmp(f->compress_buffer, data, size) == 0) {
793 return -EPROTONOSUPPORT;
796 } else if (le64toh(o->object.size) == osize &&
797 memcmp(o->data.payload, data, size) == 0) {
809 p = le64toh(o->data.next_hash_offset);
/* Convenience wrapper: hashes the payload with hash64() and defers to
 * journal_file_find_data_object_with_hash(). */
815 int journal_file_find_data_object(
817 const void *data, uint64_t size,
818 Object **ret, uint64_t *offset) {
823 assert(data || size == 0);
825 hash = hash64(data, size);
827 return journal_file_find_data_object_with_hash(f,
/* Stores a field name as an OBJECT_FIELD. Visible flow: hash the name, look
 * for an existing object with journal_file_find_field_object_with_hash(),
 * append a new object of payload offset plus size bytes, fill in hash and
 * payload, link it into the hash table with journal_file_link_field(), map
 * the object again and pass it to journal_file_hmac_put_object().
 * NOTE(review): the early return for an already existing object and the
 * error handling are not shown in this listing. */
832 static int journal_file_append_field(
834 const void *field, uint64_t size,
835 Object **ret, uint64_t *offset) {
843 assert(field && size > 0);
845 hash = hash64(field, size);
847 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
861 osize = offsetof(Object, field.payload) + size;
862 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
864 o->field.hash = htole64(hash);
865 memcpy(o->field.payload, field, size);
867 r = journal_file_link_field(f, o, p, hash);
871 /* The linking might have altered the window, so let's
872 * refresh our pointer */
873 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
878 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
/* Stores one payload as an OBJECT_DATA. Visible flow: hash the payload, look
 * for an existing object with journal_file_find_data_object_with_hash(),
 * append a new object, store the hash, try compress_blob() for payloads of at
 * least COMPRESSION_SIZE_THRESHOLD bytes (shrinking the object size and
 * setting OBJECT_COMPRESSED on success), otherwise memcpy() the payload, link
 * the object into the data hash table, map it again, and, when the payload
 * contains a '=' that is not its first byte, create the field object for the
 * part before it and push this data object onto the head of the list kept in
 * the field object. Finally the object is passed to
 * journal_file_hmac_put_object().
 * NOTE(review): the early return for an already existing object, the
 * remaining compression conditions and the error handling are not shown in
 * this listing. */
892 static int journal_file_append_data(
894 const void *data, uint64_t size,
895 Object **ret, uint64_t *offset) {
901 bool compressed = false;
905 assert(data || size == 0);
907 hash = hash64(data, size);
909 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
923 osize = offsetof(Object, data.payload) + size;
924 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
928 o->data.hash = htole64(hash);
932 size >= COMPRESSION_SIZE_THRESHOLD) {
935 compressed = compress_blob(data, size, o->data.payload, &rsize);
938 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
939 o->object.flags |= OBJECT_COMPRESSED;
941 log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
946 if (!compressed && size > 0)
947 memcpy(o->data.payload, data, size);
949 r = journal_file_link_data(f, o, p, hash);
953 /* The linking might have altered the window, so let's
954 * refresh our pointer */
955 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
959 eq = memchr(data, '=', size);
960 if (eq && eq > data) {
964 /* Create field object ... */
965 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
969 /* ... and link it in. */
970 o->data.next_field_offset = fo->field.head_data_offset;
/* p is a host-order offset going into an on-disk little-endian field, so
 * convert with htole64() as every other store in this file does. */
971 fo->field.head_data_offset = htole64(p);
975 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
/* Number of EntryItem slots in an entry object: the object size minus the
 * offset of entry.items, divided by sizeof(EntryItem). The result of the
 * failed type check is not shown in this listing. */
989 uint64_t journal_file_entry_n_items(Object *o) {
992 if (o->object.type != OBJECT_ENTRY)
995 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
/* Number of uint64_t slots in an entry array object: the object size minus
 * the offset of entry_array.items, divided by sizeof(uint64_t). The result of
 * the failed type check is not shown in this listing. */
998 uint64_t journal_file_entry_array_n_items(Object *o) {
1001 if (o->object.type != OBJECT_ENTRY_ARRAY)
1004 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
/* Number of HashItem slots in a data or field hash table object: the object
 * size minus the offset of hash_table.items, divided by sizeof(HashItem). The
 * result of the failed type check is not shown in this listing. */
1007 uint64_t journal_file_hash_table_n_items(Object *o) {
1010 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1011 o->object.type != OBJECT_FIELD_HASH_TABLE)
1014 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
/* Adds the entry offset p to a chain of entry array objects. *first holds
 * the little-endian offset of the first array and *idx the little-endian
 * number of items stored so far. The chain is walked via
 * next_entry_array_offset; on the path where the index falls into an existing
 * array (condition not shown) the slot is written and *idx is incremented.
 * Otherwise a new OBJECT_ENTRY_ARRAY is appended, passed to
 * journal_file_hmac_put_object(), filled with p, and linked in either as
 * *first or behind the previous array at ap. n_entry_arrays is incremented
 * when the header contains that field, and *idx is incremented. */
1017 static int link_entry_into_array(JournalFile *f,
1022 uint64_t n = 0, ap = 0, q, i, a, hidx;
1030 a = le64toh(*first);
1031 i = hidx = le64toh(*idx);
1034 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1038 n = journal_file_entry_array_n_items(o);
1040 o->entry_array.items[i] = htole64(p);
1041 *idx = htole64(hidx + 1);
1047 a = le64toh(o->entry_array.next_entry_array_offset);
1058 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1059 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1065 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
1070 o->entry_array.items[i] = htole64(p);
1073 *first = htole64(q);
1075 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
1079 o->entry_array.next_entry_array_offset = htole64(q);
1082 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1083 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1085 *idx = htole64(hidx + 1);
/* Variant of link_entry_into_array() for lists that keep one element inline:
 * on one path p is stored in *extra, on the other it goes into the array
 * chain using index *idx - 1. *idx is incremented afterwards.
 * NOTE(review): the condition choosing between the paths is not shown. */
1090 static int link_entry_into_array_plus_one(JournalFile *f,
1105 *extra = htole64(p);
1109 i = htole64(le64toh(*idx) - 1);
1110 r = link_entry_into_array(f, first, &i, p);
1115 *idx = htole64(le64toh(*idx) + 1);
/* Links item i of the entry object o into the entry list of the data object
 * that the item references: the data object at items[i].object_offset is
 * mapped and its entry_offset and entry_array_offset fields are handed to
 * link_entry_into_array_plus_one(). */
1119 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1126 p = le64toh(o->entry.items[i].object_offset);
1130 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1134 return link_entry_into_array_plus_one(f,
1135 &o->data.entry_offset,
1136 &o->data.entry_array_offset,
/* Makes a written entry object reachable: after a full memory barrier
 * (__sync_synchronize) it is added to the global entry array tracked by
 * entry_array_offset and n_entries in the header, head_entry_realtime is set
 * if it was still zero, the tail realtime and monotonic timestamps are
 * updated, the tail monotonic timestamp is marked valid, and every item of
 * the entry is linked with journal_file_link_entry_item(). */
1141 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
1149 if (o->object.type != OBJECT_ENTRY)
1152 __sync_synchronize();
1154 /* Link up the entry itself */
1155 r = link_entry_into_array(f,
1156 &f->header->entry_array_offset,
1157 &f->header->n_entries,
1162 /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
1164 if (f->header->head_entry_realtime == 0)
1165 f->header->head_entry_realtime = o->entry.realtime;
1167 f->header->tail_entry_realtime = o->entry.realtime;
1168 f->header->tail_entry_monotonic = o->entry.monotonic;
1170 f->tail_entry_monotonic_valid = true;
1172 /* Link up the items */
1173 n = journal_file_entry_n_items(o);
1174 for (i = 0; i < n; i++) {
1175 r = journal_file_link_entry_item(f, o, offset, i);
/* Writes one OBJECT_ENTRY holding n_items items: appends an object sized for
 * the items, assigns a sequence number via journal_file_entry_seqnum(),
 * copies the items, stores realtime, monotonic, xor_hash and the boot_id from
 * the header, then passes the object to journal_file_hmac_put_object() and
 * journal_file_link_entry(). */
1183 static int journal_file_append_entry_internal(
1185 const dual_timestamp *ts,
1187 const EntryItem items[], unsigned n_items,
1189 Object **ret, uint64_t *offset) {
1196 assert(items || n_items == 0);
1199 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1201 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
1205 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
1206 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
1207 o->entry.realtime = htole64(ts->realtime);
1208 o->entry.monotonic = htole64(ts->monotonic);
1209 o->entry.xor_hash = htole64(xor_hash);
1210 o->entry.boot_id = f->header->boot_id;
1213 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
1218 r = journal_file_link_entry(f, o, np);
/* Signals a modification of the file after mmap-based writes: issues a full
 * memory barrier and then calls ftruncate() with the size already recorded
 * in f->last_stat. A failing ftruncate() is only logged. */
1231 void journal_file_post_change(JournalFile *f) {
1234 /* inotify() does not receive IN_MODIFY events from file
1235 * accesses done via mmap(). After each access we hence
1236 * trigger IN_MODIFY by truncating the journal file to its
1237 * current size which triggers IN_MODIFY. */
1239 __sync_synchronize();
1241 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1242 log_error("Failed to truncate file to its own size: %m");
/* Comparator for EntryItems keyed on object_offset, compared after conversion
 * to host byte order. Used with qsort() elsewhere in this file.
 * NOTE(review): the return values are not shown in this listing. */
1245 static int entry_item_cmp(const void *_a, const void *_b) {
1246 const EntryItem *a = _a, *b = _b;
1248 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1250 if (le64toh(a->object_offset) > le64toh(b->object_offset))
/* Writes one log entry made of n_iovec payloads. Visible flow: a timestamp is
 * taken with dual_timestamp_get() into a local (on a path whose condition is
 * not shown), the monotonic time is compared with the tail entry when that is
 * known to be valid, journal_file_maybe_append_tag() is called, each iovec is
 * stored with journal_file_append_data() while the data hashes are XOR-ed
 * together and an EntryItem is recorded, the items are sorted by offset, the
 * entry is written with journal_file_append_entry_internal(), and
 * journal_file_post_change() is called.
 * The items array lives on the stack via alloca(), sized by n_iovec. */
1255 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1259 uint64_t xor_hash = 0;
1260 struct dual_timestamp _ts;
1263 assert(iovec || n_iovec == 0);
1269 dual_timestamp_get(&_ts);
1273 if (f->tail_entry_monotonic_valid &&
1274 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1278 r = journal_file_maybe_append_tag(f, ts->realtime);
1283 /* alloca() can't take 0, hence let's allocate at least one */
1284 items = alloca(sizeof(EntryItem) * MAX(1, n_iovec));
1286 for (i = 0; i < n_iovec; i++) {
1290 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1294 xor_hash ^= le64toh(o->data.hash);
1295 items[i].object_offset = htole64(p);
1296 items[i].hash = o->data.hash;
1299 /* Order by the position on disk, in order to improve seek
1300 * times for rotating media. */
1301 qsort(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1303 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1305 journal_file_post_change(f);
/* Looks up one element in a chain of entry array objects: arrays are mapped
 * in turn and followed via next_entry_array_offset, the entry offset is read
 * from items[i] of the array that holds it, and the entry object at that
 * offset is mapped with journal_file_move_to_object(). A zero array offset or
 * zero entry offset is special-cased before the final mapping. */
1310 static int generic_array_get(JournalFile *f,
1313 Object **ret, uint64_t *offset) {
1325 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1329 n = journal_file_entry_array_n_items(o);
1331 p = le64toh(o->entry_array.items[i]);
1336 a = le64toh(o->entry_array.next_entry_array_offset);
1339 if (a <= 0 || p <= 0)
1342 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Variant of generic_array_get() for lists that keep one element inline: on
 * one path the entry at extra is mapped directly, otherwise index i-1 of the
 * array chain is fetched.
 * NOTE(review): the condition choosing between the paths is not shown. */
1355 static int generic_array_get_plus_one(JournalFile *f,
1359 Object **ret, uint64_t *offset) {
1368 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1381 return generic_array_get(f, first, i-1, ret, offset);
/* Binary search over a chain of entry array objects. test_object() classifies
 * the entry at offset p relative to needle; a TEST_FOUND result is folded
 * into TEST_RIGHT for DIRECTION_DOWN and into TEST_LEFT otherwise. Each array
 * is first probed at one index and, on TEST_RIGHT, bisected between left and
 * right with midpoint (left + right) / 2; otherwise the walk continues via
 * next_entry_array_offset. subtract_one records that the answer is the
 * element before position i (set for DIRECTION_UP). The chosen entry is
 * mapped with journal_file_move_to_object() and *idx receives t + i, minus
 * one when subtract_one is set.
 * NOTE(review): loop structure, bounds updates and returns are only partly
 * visible in this listing. */
1390 static int generic_array_bisect(JournalFile *f,
1394 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1395 direction_t direction,
1400 uint64_t a, p, t = 0, i = 0, last_p = 0;
1401 bool subtract_one = false;
1402 Object *o, *array = NULL;
1406 assert(test_object);
1410 uint64_t left, right, k, lp;
1412 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1416 k = journal_file_entry_array_n_items(array);
1422 lp = p = le64toh(array->entry_array.items[i]);
1426 r = test_object(f, p, needle);
1430 if (r == TEST_FOUND)
1431 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1433 if (r == TEST_RIGHT) {
1437 if (left == right) {
1438 if (direction == DIRECTION_UP)
1439 subtract_one = true;
1445 assert(left < right);
1447 i = (left + right) / 2;
1448 p = le64toh(array->entry_array.items[i]);
1452 r = test_object(f, p, needle);
1456 if (r == TEST_FOUND)
1457 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1459 if (r == TEST_RIGHT)
1467 if (direction == DIRECTION_UP) {
1469 subtract_one = true;
1480 a = le64toh(array->entry_array.next_entry_array_offset);
1486 if (subtract_one && t == 0 && i == 0)
1489 if (subtract_one && i == 0)
1491 else if (subtract_one)
1492 p = le64toh(array->entry_array.items[i-1]);
1494 p = le64toh(array->entry_array.items[i]);
1496 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1507 *idx = t + i + (subtract_one ? -1 : 0);
/* Variant of generic_array_bisect() for lists that keep one element inline.
 * The entry at extra is tested first with test_object(), with TEST_FOUND
 * folded as in generic_array_bisect(). For DIRECTION_UP the inline element is
 * remembered as a fallback (step_back). The array chain is then bisected with
 * n-1 elements, and the entry at extra is mapped on the remaining path.
 * NOTE(review): two comments below are cut off in this listing and several
 * branches and returns are not shown. */
1512 static int generic_array_bisect_plus_one(JournalFile *f,
1517 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1518 direction_t direction,
1524 bool step_back = false;
1528 assert(test_object);
1533 /* This bisects the array in object 'first', but first checks
1535 r = test_object(f, extra, needle);
1539 if (r == TEST_FOUND)
1540 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1542 /* if we are looking with DIRECTION_UP then we need to first
1543 see if in the actual array there is a matching entry, and
1544 return the last one of that. But if there isn't any we need
1545 to return this one. Hence remember this, and return it
1548 step_back = direction == DIRECTION_UP;
1550 if (r == TEST_RIGHT) {
1551 if (direction == DIRECTION_DOWN)
1557 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1559 if (r == 0 && step_back)
1568 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
/* test_object callback that compares the entry offset p itself with needle.
 * NOTE(review): only one branch is visible in this listing. */
1584 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1590 else if (p < needle)
/* Seeks in the global entry array (entry_array_offset and n_entries from the
 * header) using generic_array_bisect(). The remaining arguments of the call
 * are not shown in this listing. */
1596 int journal_file_move_to_entry_by_offset(
1599 direction_t direction,
1603 return generic_array_bisect(f,
1604 le64toh(f->header->entry_array_offset),
1605 le64toh(f->header->n_entries),
/* test_object callback: maps the entry at p and compares its seqnum with
 * needle (equal, or smaller; the returned codes are not shown here). */
1613 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1620 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1624 if (le64toh(o->entry.seqnum) == needle)
1626 else if (le64toh(o->entry.seqnum) < needle)
/* Seeks in the global entry array (entry_array_offset and n_entries from the
 * header) using generic_array_bisect(). The remaining arguments of the call
 * are not shown in this listing. */
1632 int journal_file_move_to_entry_by_seqnum(
1635 direction_t direction,
1639 return generic_array_bisect(f,
1640 le64toh(f->header->entry_array_offset),
1641 le64toh(f->header->n_entries),
/* test_object callback: maps the entry at p and compares its realtime
 * timestamp with needle (equal, or smaller; the returned codes are not shown
 * here). */
1648 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1655 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1659 if (le64toh(o->entry.realtime) == needle)
1661 else if (le64toh(o->entry.realtime) < needle)
/* Seeks in the global entry array (entry_array_offset and n_entries from the
 * header) by realtime timestamp, using generic_array_bisect() with
 * test_object_realtime as the comparison callback. */
1667 int journal_file_move_to_entry_by_realtime(
1670 direction_t direction,
1674 return generic_array_bisect(f,
1675 le64toh(f->header->entry_array_offset),
1676 le64toh(f->header->n_entries),
1678 test_object_realtime,
/* test_object callback: maps the entry at p and compares its monotonic
 * timestamp with needle (equal, or smaller; the returned codes are not shown
 * here). */
1683 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1690 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1694 if (le64toh(o->entry.monotonic) == needle)
1696 else if (le64toh(o->entry.monotonic) < needle)
/* Seeks by monotonic timestamp within one boot. The lookup key is the text
 * _BOOT_ID= followed by the boot ID formatted with sd_id128_to_string() (the
 * buffer holds 9 + 32 characters plus the terminator). The data object for
 * that key is looked up and its entry list (entry_offset, entry_array_offset,
 * n_entries) is searched with generic_array_bisect_plus_one() using
 * test_object_monotonic. */
1702 int journal_file_move_to_entry_by_monotonic(
1706 direction_t direction,
1710 char t[9+32+1] = "_BOOT_ID=";
1716 sd_id128_to_string(boot_id, t + 9);
1717 r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1723 return generic_array_bisect_plus_one(f,
1724 le64toh(o->data.entry_offset),
1725 le64toh(o->data.entry_array_offset),
1726 le64toh(o->data.n_entries),
1728 test_object_monotonic,
/* Steps to a neighbouring entry in the global entry array. On one path the
 * start index is 0 for DIRECTION_DOWN and n - 1 otherwise; on the other the
 * index of the current entry o (at offset p) is located with
 * generic_array_bisect() and then adjusted according to direction. The
 * resulting index is fetched with generic_array_get().
 * p must be positive whenever o is given, as asserted below.
 * NOTE(review): the index arithmetic and bounds checks are not shown. */
1733 int journal_file_next_entry(
1735 Object *o, uint64_t p,
1736 direction_t direction,
1737 Object **ret, uint64_t *offset) {
1743 assert(p > 0 || !o);
1745 n = le64toh(f->header->n_entries);
1750 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1752 if (o->object.type != OBJECT_ENTRY)
1755 r = generic_array_bisect(f,
1756 le64toh(f->header->entry_array_offset),
1757 le64toh(f->header->n_entries),
1766 if (direction == DIRECTION_DOWN) {
1779 /* And jump to it */
1780 return generic_array_get(f,
1781 le64toh(f->header->entry_array_offset),
/* Moves a number of entries away from entry o in the global entry array. The
 * index of o is located with generic_array_bisect(); the new index is then
 * computed from skip, with the negated skip compared against and subtracted
 * from the index on one path and skip added on the other. The target is
 * fetched with generic_array_get().
 * NOTE(review): the skip parameter declaration, the clamping against
 * n_entries and the branch conditions are not shown in this listing. */
1786 int journal_file_skip_entry(
1788 Object *o, uint64_t p,
1790 Object **ret, uint64_t *offset) {
1799 if (o->object.type != OBJECT_ENTRY)
1802 r = generic_array_bisect(f,
1803 le64toh(f->header->entry_array_offset),
1804 le64toh(f->header->n_entries),
1813 /* Calculate new index */
1815 if ((uint64_t) -skip >= i)
1818 i = i - (uint64_t) -skip;
1820 i += (uint64_t) skip;
1822 n = le64toh(f->header->n_entries);
1829 return generic_array_get(f,
1830 le64toh(f->header->entry_array_offset),
/* Steps to a neighbouring entry within the entry list of the data object at
 * data_offset. On one path the start index is 0 for DIRECTION_DOWN and n - 1
 * otherwise, with n taken from the n_entries of the data object; on the other
 * the index of the current entry o is located with
 * generic_array_bisect_plus_one() and adjusted according to direction. The
 * result is fetched with generic_array_get_plus_one().
 * p must be positive whenever o is given, as asserted below. */
1835 int journal_file_next_entry_for_data(
1837 Object *o, uint64_t p,
1838 uint64_t data_offset,
1839 direction_t direction,
1840 Object **ret, uint64_t *offset) {
1847 assert(p > 0 || !o);
1849 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1853 n = le64toh(d->data.n_entries);
1858 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1860 if (o->object.type != OBJECT_ENTRY)
1863 r = generic_array_bisect_plus_one(f,
1864 le64toh(d->data.entry_offset),
1865 le64toh(d->data.entry_array_offset),
1866 le64toh(d->data.n_entries),
1876 if (direction == DIRECTION_DOWN) {
1890 return generic_array_get_plus_one(f,
1891 le64toh(d->data.entry_offset),
1892 le64toh(d->data.entry_array_offset),
/* Maps the data object at data_offset and searches its entry list
 * (entry_offset, entry_array_offset, n_entries) with
 * generic_array_bisect_plus_one(). The remaining arguments of the call are
 * not shown in this listing. */
1897 int journal_file_move_to_entry_by_offset_for_data(
1899 uint64_t data_offset,
1901 direction_t direction,
1902 Object **ret, uint64_t *offset) {
1909 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1913 return generic_array_bisect_plus_one(f,
1914 le64toh(d->data.entry_offset),
1915 le64toh(d->data.entry_array_offset),
1916 le64toh(d->data.n_entries),
/* Looks for an entry that appears both in the entry list of the data object
 * at data_offset and in the entry list of the _BOOT_ID= data object for the
 * given boot ID. The boot ID object is found first (its offset is kept in b)
 * and its list is searched with test_object_monotonic; afterwards the two
 * lists are bisected in turn, mapping the respective data object again before
 * each search.
 * NOTE(review): the loop, the comparison of the two results and the returns
 * are not shown in this listing. */
1923 int journal_file_move_to_entry_by_monotonic_for_data(
1925 uint64_t data_offset,
1928 direction_t direction,
1929 Object **ret, uint64_t *offset) {
1931 char t[9+32+1] = "_BOOT_ID=";
1938 /* First, seek by time */
1939 sd_id128_to_string(boot_id, t + 9);
1940 r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
1946 r = generic_array_bisect_plus_one(f,
1947 le64toh(o->data.entry_offset),
1948 le64toh(o->data.entry_array_offset),
1949 le64toh(o->data.n_entries),
1951 test_object_monotonic,
1957 /* And now, continue seeking until we find an entry that
1958 * exists in both bisection arrays */
1964 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1968 r = generic_array_bisect_plus_one(f,
1969 le64toh(d->data.entry_offset),
1970 le64toh(d->data.entry_array_offset),
1971 le64toh(d->data.n_entries),
1979 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
1983 r = generic_array_bisect_plus_one(f,
1984 le64toh(o->data.entry_offset),
1985 le64toh(o->data.entry_array_offset),
1986 le64toh(o->data.n_entries),
/* Maps the data object at data_offset and searches its entry list
 * (entry_offset, entry_array_offset, n_entries) with
 * generic_array_bisect_plus_one(). The remaining arguments of the call are
 * not shown in this listing. */
2010 int journal_file_move_to_entry_by_seqnum_for_data(
2012 uint64_t data_offset,
2014 direction_t direction,
2015 Object **ret, uint64_t *offset) {
2022 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2026 return generic_array_bisect_plus_one(f,
2027 le64toh(d->data.entry_offset),
2028 le64toh(d->data.entry_array_offset),
2029 le64toh(d->data.n_entries),
/* Maps the data object at data_offset and searches its entry list
 * (entry_offset, entry_array_offset, n_entries) by realtime timestamp, using
 * generic_array_bisect_plus_one() with test_object_realtime as the comparison
 * callback. */
2036 int journal_file_move_to_entry_by_realtime_for_data(
2038 uint64_t data_offset,
2040 direction_t direction,
2041 Object **ret, uint64_t *offset) {
2048 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2052 return generic_array_bisect_plus_one(f,
2053 le64toh(d->data.entry_offset),
2054 le64toh(d->data.entry_array_offset),
2055 le64toh(d->data.n_entries),
2057 test_object_realtime,
/* Debugging aid: prints the file header via
 * journal_file_print_header() and then walks the objects of the file
 * in offset order, printing the type of each one (plus a few key
 * fields for entry and tag objects) to stdout. */
2062 void journal_file_dump(JournalFile *f) {
2069 journal_file_print_header(f);
/* Start walking at the offset given by the header size field */
2071 p = le64toh(f->header->header_size);
/* NOTE(review): -1 as type presumably means that no particular
 * object type is expected here -- confirm */
2073 r = journal_file_move_to_object(f, -1, p, &o);
2077 switch (o->object.type) {
2080 printf("Type: OBJECT_UNUSED\n");
2084 printf("Type: OBJECT_DATA\n");
2088 printf("Type: OBJECT_FIELD\n");
2092 printf("Type: OBJECT_ENTRY seqnum=%llu monotonic=%llu realtime=%llu\n",
2093 (unsigned long long) le64toh(o->entry.seqnum),
2094 (unsigned long long) le64toh(o->entry.monotonic),
2095 (unsigned long long) le64toh(o->entry.realtime));
2098 case OBJECT_FIELD_HASH_TABLE:
2099 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2102 case OBJECT_DATA_HASH_TABLE:
2103 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2106 case OBJECT_ENTRY_ARRAY:
2107 printf("Type: OBJECT_ENTRY_ARRAY\n");
2111 printf("Type: OBJECT_TAG seqnum=%llu epoch=%llu\n",
2112 (unsigned long long) le64toh(o->tag.seqnum),
2113 (unsigned long long) le64toh(o->tag.epoch));
/* Type values not handled above are printed numerically */
2117 printf("Type: unknown (%u)\n", o->object.type);
2121 if (o->object.flags & OBJECT_COMPRESSED)
2122 printf("Flags: COMPRESSED\n");
/* Check whether this is the object the header records as the tail
 * object */
2124 if (p == le64toh(f->header->tail_object_offset))
/* Advance by the size of this object, rounded up by ALIGN64() */
2127 p = p + ALIGN64(le64toh(o->object.size));
2132 log_error("File corrupt");
/* Prints a human-readable summary of the header of the journal file
 * to stdout: the various IDs, state and feature flags, sizes, head
 * and tail sequence numbers and timestamps, and object counters.
 * Counters that were added to the header format later are printed
 * only if the header of this file is large enough to contain them.
 * The disk usage as reported by fstat() is appended if it can be
 * determined. */
2135 void journal_file_print_header(JournalFile *f) {
/* One buffer per ID: all arguments of the printf() call below are
 * evaluated before anything is printed, hence each
 * sd_id128_to_string() result needs storage of its own. (The boot ID
 * and the sequential number ID used to share c, which made both
 * fields show the same string.) */
2136 char a[33], b[33], c[33], d[33];
2137 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
2139 char bytes[FORMAT_BYTES_MAX];
2143 printf("File Path: %s\n"
2147 "Sequential Number ID: %s\n"
2149 "Compatible Flags:%s%s\n"
2150 "Incompatible Flags:%s%s\n"
2151 "Header size: %llu\n"
2152 "Arena size: %llu\n"
2153 "Data Hash Table Size: %llu\n"
2154 "Field Hash Table Size: %llu\n"
2155 "Rotate Suggested: %s\n"
2156 "Head Sequential Number: %llu\n"
2157 "Tail Sequential Number: %llu\n"
2158 "Head Realtime Timestamp: %s\n"
2159 "Tail Realtime Timestamp: %s\n"
2161 "Entry Objects: %llu\n",
2163 sd_id128_to_string(f->header->file_id, a),
2164 sd_id128_to_string(f->header->machine_id, b),
2165 sd_id128_to_string(f->header->boot_id, c),
2166 sd_id128_to_string(f->header->seqnum_id, d),
2167 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2168 f->header->state == STATE_ONLINE ? "ONLINE" :
2169 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
/* For both flag fields: first the name of the flag we know, then
 * " ???" if any further bit is set */
2170 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2171 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2172 JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2173 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
2174 (unsigned long long) le64toh(f->header->header_size),
2175 (unsigned long long) le64toh(f->header->arena_size),
/* The header stores the hash table sizes in bytes, print them as
 * number of items */
2176 (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2177 (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
2178 yes_no(journal_file_rotate_suggested(f, 0)),
2179 (unsigned long long) le64toh(f->header->head_entry_seqnum),
2180 (unsigned long long) le64toh(f->header->tail_entry_seqnum),
2181 format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2182 format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
2183 (unsigned long long) le64toh(f->header->n_objects),
2184 (unsigned long long) le64toh(f->header->n_entries));
/* The remaining counters are only looked at if the header of this
 * file contains them */
2186 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2187 printf("Data Objects: %llu\n"
2188 "Data Hash Table Fill: %.1f%%\n",
2189 (unsigned long long) le64toh(f->header->n_data),
2190 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
2192 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2193 printf("Field Objects: %llu\n"
2194 "Field Hash Table Fill: %.1f%%\n",
2195 (unsigned long long) le64toh(f->header->n_fields),
2196 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
2198 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2199 printf("Tag Objects: %llu\n",
2200 (unsigned long long) le64toh(f->header->n_tags));
2201 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2202 printf("Entry Array Objects: %llu\n",
2203 (unsigned long long) le64toh(f->header->n_entry_arrays));
/* If fstat() fails the disk usage line is simply left out; the block
 * count is multiplied by 512 to get bytes */
2205 if (fstat(f->fd, &st) >= 0)
2206 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
/* Opens the journal file fname and sets up a JournalFile object for
 * it. A file that is empty and was opened writable is initialized as
 * a new journal file (header, hash tables, first tag), otherwise the
 * existing header is verified. metrics, mmap_cache and template are
 * all checked before use further down; template is handed to the
 * header initialization and serves as source for the metrics if
 * metrics is not given. The new object is presumably returned via
 * ret. */
2209 int journal_file_open(
2215 JournalMetrics *metrics,
2216 MMapCache *mmap_cache,
2217 JournalFile *template,
2218 JournalFile **ret) {
2222 bool newly_created = false;
/* Only the access modes O_RDONLY and O_RDWR pass this check */
2227 if ((flags & O_ACCMODE) != O_RDONLY &&
2228 (flags & O_ACCMODE) != O_RDWR)
/* The file name has to carry one of the two journal suffixes */
2231 if (!endswith(fname, ".journal") &&
2232 !endswith(fname, ".journal~"))
2235 f = new0(JournalFile, 1);
2243 f->prot = prot_from_flags(flags);
2244 f->writable = (flags & O_ACCMODE) != O_RDONLY;
2246 f->compress = compress;
/* Either take a reference on the mmap cache that was passed in, or
 * create a new one */
2253 f->mmap = mmap_cache_ref(mmap_cache);
2255 f->mmap = mmap_cache_new();
2262 f->path = strdup(fname);
/* O_CLOEXEC is always added to the flags the caller passed */
2268 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2274 if (fstat(f->fd, &f->last_stat) < 0) {
/* An empty file opened writable is treated as a new journal file */
2279 if (f->last_stat.st_size == 0 && f->writable) {
2283 /* Let's attach the creation time to the journal file,
2284 * so that the vacuuming code knows the age of this
2285 * file even if the file might end up corrupted one
2286 * day... Ideally we'd just use the creation time many
2287 * file systems maintain for each file, but there is
2288 * currently no usable API to query this, hence let's
2289 * emulate this via extended attributes. If extended
2290 * attributes are not supported we'll just skip this,
2291 * and rely solely on mtime/atime/ctime of the file.*/
/* The timestamp is stored little-endian. The result of fsetxattr()
 * is intentionally not checked, see above. */
2293 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2294 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2298 /* Try to load the FSPRG state, and if we can't, then
2299 * just don't do sealing */
2301 r = journal_file_fss_load(f);
2307 r = journal_file_init_header(f, template);
/* Refresh the cached stat data after the header has been set up */
2311 if (fstat(f->fd, &f->last_stat) < 0) {
2316 newly_created = true;
/* Check for files that are shorter than the minimal header */
2319 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
/* Map the header, with the size of the Header structure rounded up
 * by PAGE_ALIGN() */
2324 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2325 if (f->header == MAP_FAILED) {
/* Only headers we did not just write ourselves are verified */
2331 if (!newly_created) {
2332 r = journal_file_verify_header(f);
/* For pre-existing writable files the FSPRG state is loaded here,
 * for new files this already happened above */
2338 if (!newly_created && f->writable) {
2339 r = journal_file_fss_load(f);
/* Metrics: either complete the caller's metrics with defaults and
 * copy them, or take over those of the template */
2347 journal_default_metrics(metrics, f->fd);
2348 f->metrics = *metrics;
2349 } else if (template)
2350 f->metrics = template->metrics;
2352 r = journal_file_refresh_header(f);
2358 r = journal_file_hmac_setup(f);
/* A new file gets its hash tables and the first tag set up */
2363 if (newly_created) {
2364 r = journal_file_setup_field_hash_table(f);
2368 r = journal_file_setup_data_hash_table(f);
2373 r = journal_file_append_first_tag(f);
2379 r = journal_file_map_field_hash_table(f);
2383 r = journal_file_map_data_hash_table(f);
/* NOTE(review): presumably the failure path, releasing the partially
 * set up object again -- confirm */
2391 journal_file_close(f);
/* Rotates the journal file *f: the current file is renamed to a name
 * that embeds its sequential number ID, head sequence number and head
 * realtime timestamp, is marked as archived and closed, and a new
 * file is opened under the original path, with the old file as
 * template. */
2396 int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
2399 JournalFile *old_file, *new_file = NULL;
/* Only writable files whose name ends in ".journal" are handled */
2407 if (!old_file->writable)
2410 if (!endswith(old_file->path, ".journal"))
2413 l = strlen(old_file->path);
/* The original path length plus 68 bytes for the additions made
 * below */
2415 p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
/* Copy the path without its 8 character ".journal" suffix */
2419 memcpy(p, old_file->path, l - 8);
/* The ID string starts one byte behind the copied prefix, i.e. the
 * byte at p[l-8] is not written by this call */
2421 sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
/* Followed by "-<head seqnum>-<head realtime>.journal", both numbers
 * as 16 hex digits: 42 characters plus NUL */
2422 snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2423 "-%016llx-%016llx.journal",
2424 (unsigned long long) le64toh((*f)->header->head_entry_seqnum),
2425 (unsigned long long) le64toh((*f)->header->head_entry_realtime));
2427 r = rename(old_file->path, p);
2433 old_file->header->state = STATE_ARCHIVED;
/* Open the new file under the path the old one used to have, with
 * the same flags and mode, sharing the mmap cache of the old file.
 * No metrics are passed, the old file is passed as template. */
2435 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
2436 journal_file_close(old_file);
/* Like journal_file_open(), but if opening fails with one of a set
 * of error codes that indicate a damaged or unsuitable file, the file
 * is renamed out of the way (new name ends in ".journal~") and the
 * open is tried a second time. This is only considered if the file
 * was not opened read-only, O_CREAT is set and the name ends in
 * ".journal". */
2442 int journal_file_open_reliably(
2448 JournalMetrics *metrics,
2449 MMapCache *mmap_cache,
2450 JournalFile *template,
2451 JournalFile **ret) {
2457 r = journal_file_open(fname, flags, mode, compress, seal,
2458 metrics, mmap_cache, template, ret);
/* Any result other than the following is not a candidate for the
 * rename-and-retry logic */
2459 if (r != -EBADMSG && /* corrupted */
2460 r != -ENODATA && /* truncated */
2461 r != -EHOSTDOWN && /* other machine */
2462 r != -EPROTONOSUPPORT && /* incompatible feature */
2463 r != -EBUSY && /* unclean shutdown */
2464 r != -ESHUTDOWN /* already archived */)
2467 if ((flags & O_ACCMODE) == O_RDONLY)
2470 if (!(flags & O_CREAT))
2473 if (!endswith(fname, ".journal"))
2476 /* The file is corrupted. Rotate it away and try it again (but only once) */
/* New name: a prefix of fname, "@", two 16 digit hex numbers (the
 * first one being the current realtime clock) and ".journal~" */
2479 if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2481 (unsigned long long) now(CLOCK_REALTIME),
2485 r = rename(fname, p);
2490 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
/* Second and last attempt, with unchanged arguments */
2492 return journal_file_open(fname, flags, mode, compress, seal,
2493 metrics, mmap_cache, template, ret);
/* Copies the entry object o, located at offset p of the file from,
 * into the file to: the payload of every data object the entry refers
 * to is appended to the destination file, then a new entry with the
 * timestamps of the original and the new item list is appended.
 * seqnum, ret and offset are passed on unmodified to
 * journal_file_append_entry_internal(). */
2497 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2499 uint64_t q, xor_hash = 0;
/* The timestamps of the source entry are carried over */
2512 ts.monotonic = le64toh(o->entry.monotonic);
2513 ts.realtime = le64toh(o->entry.realtime);
/* Check for entries that are older than the current tail of the
 * destination file, as far as the latter's monotonic timestamp is
 * valid */
2515 if (to->tail_entry_monotonic_valid &&
2516 ts.monotonic < le64toh(to->header->tail_entry_monotonic))
/* The item list for the new entry is built up on the stack */
2519 n = journal_file_entry_n_items(o);
2520 items = alloca(sizeof(EntryItem) * n);
2522 for (i = 0; i < n; i++) {
2529 q = le64toh(o->entry.items[i].object_offset);
2530 le_hash = o->entry.items[i].hash;
/* Note that this makes o point to the data object, the entry object
 * is looked up again at the end of the iteration */
2532 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
/* The hash stored in the entry item has to match the one stored in
 * the data object; both are compared in file byte order */
2536 if (le_hash != o->data.hash)
/* Payload size: object size minus everything in front of the payload */
2539 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2542 /* We hit the limit on 32bit machines */
2543 if ((uint64_t) t != l)
/* Compressed payloads are decompressed into the compress buffer of
 * the source file, anything else is used in place */
2546 if (o->object.flags & OBJECT_COMPRESSED) {
2550 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2553 data = from->compress_buffer;
/* NOTE(review): presumably taken when compression support is not
 * available -- confirm */
2556 return -EPROTONOSUPPORT;
2559 data = o->data.payload;
/* u is the data object in the destination file, h its offset there */
2561 r = journal_file_append_data(to, data, l, &u, &h);
/* The XOR of the hashes of all data objects is handed to
 * journal_file_append_entry_internal() below */
2565 xor_hash ^= le64toh(u->data.hash);
2566 items[i].object_offset = htole64(h);
2567 items[i].hash = u->data.hash;
/* Make o point to the source entry object again */
2569 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2574 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
/* Completes the metrics in *m: fields that are set to (uint64_t) -1
 * are replaced by defaults, the other values are page aligned and
 * raised to the minimum where necessary. The defaults for max_use
 * and keep_free are derived from the size of the file system fd
 * refers to, within the DEFAULT_* bounds. The resulting values are
 * logged at debug level. */
2577 void journal_default_metrics(JournalMetrics *m, int fd) {
/* Stays 0 if the file system size cannot be determined */
2578 uint64_t fs_size = 0;
2580 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2585 if (fstatvfs(fd, &ss) >= 0)
2586 fs_size = ss.f_frsize * ss.f_blocks;
/* max_use: the default is 10% of the file system size, clamped to
 * the range DEFAULT_MAX_USE_LOWER...DEFAULT_MAX_USE_UPPER */
2588 if (m->max_use == (uint64_t) -1) {
2591 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2593 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2594 m->max_use = DEFAULT_MAX_USE_UPPER;
2596 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2597 m->max_use = DEFAULT_MAX_USE_LOWER;
/* NOTE(review): presumably the fallback if the file system size is
 * not known -- confirm */
2599 m->max_use = DEFAULT_MAX_USE_LOWER;
/* Page align the value and make sure it has room for at least two
 * files of minimal size */
2601 m->max_use = PAGE_ALIGN(m->max_use);
2603 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2604 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
/* max_size: the default is an eighth of max_use, but no more than
 * DEFAULT_MAX_SIZE_UPPER */
2607 if (m->max_size == (uint64_t) -1) {
2608 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2610 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2611 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2613 m->max_size = PAGE_ALIGN(m->max_size);
2615 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2616 m->max_size = JOURNAL_FILE_SIZE_MIN;
/* max_use is raised if it cannot hold two files of maximum size */
2618 if (m->max_size*2 > m->max_use)
2619 m->max_use = m->max_size*2;
/* min_size: the default is the minimum journal file size */
2621 if (m->min_size == (uint64_t) -1)
2622 m->min_size = JOURNAL_FILE_SIZE_MIN;
2624 m->min_size = PAGE_ALIGN(m->min_size);
2626 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2627 m->min_size = JOURNAL_FILE_SIZE_MIN;
/* If min_size is larger than max_size, max_size is raised */
2629 if (m->min_size > m->max_size)
2630 m->max_size = m->min_size;
/* keep_free: the default is 5% of the file system size, but no more
 * than DEFAULT_KEEP_FREE_UPPER; DEFAULT_KEEP_FREE is used if the
 * size is not known */
2633 if (m->keep_free == (uint64_t) -1) {
2636 m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2638 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2639 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2642 m->keep_free = DEFAULT_KEEP_FREE;
2645 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2646 format_bytes(a, sizeof(a), m->max_use),
2647 format_bytes(b, sizeof(b), m->max_size),
2648 format_bytes(c, sizeof(c), m->min_size),
2649 format_bytes(d, sizeof(d), m->keep_free));
/* Reports the realtime timestamps of the first and the last entry of
 * the file, as recorded in the file header, in *from and *to. Header
 * timestamps of zero are checked for before the value is handed
 * out. */
2652 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
/* No byte order conversion is needed for the comparison, zero reads
 * the same either way */
2657 if (f->header->head_entry_realtime == 0)
2660 *from = le64toh(f->header->head_entry_realtime);
2664 if (f->header->tail_entry_realtime == 0)
2667 *to = le64toh(f->header->tail_entry_realtime);
/* Reports the monotonic timestamps of the first and the last entry of
 * the boot boot_id in *from and *to. The entries of a boot are found
 * via the entry list of its "_BOOT_ID=<id>" data object. */
2673 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
/* Room for the 9 character "_BOOT_ID=" prefix, 32 characters of
 * formatted ID and the trailing NUL byte */
2674 char t[9+32+1] = "_BOOT_ID=";
2682 sd_id128_to_string(boot_id, t + 9);
/* o is the data object for this boot ID, p its offset */
2684 r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
/* Check for a data object that is not referenced by any entry */
2688 if (le64toh(o->data.n_entries) <= 0)
/* The first entry is referenced directly from the data object. Note
 * that o points to that entry object from here on. */
2692 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2696 *from = le64toh(o->entry.monotonic);
/* Look the data object up again, as o was reused for the entry */
2700 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* Index n_entries-1 selects the last entry of the list */
2704 r = generic_array_get_plus_one(f,
2705 le64toh(o->data.entry_offset),
2706 le64toh(o->data.entry_array_offset),
2707 le64toh(o->data.n_entries)-1,
2712 *to = le64toh(o->entry.monotonic);
2718 bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
2721 /* If we gained new header fields we gained new features,
2722 * hence suggest a rotation */
2723 if (le64toh(f->header->header_size) < sizeof(Header)) {
2724 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
2728 /* Let's check if the hash tables grew over a certain fill
2729 * level (75%, borrowing this value from Java's hash table
2730 * implementation), and if so suggest a rotation. To calculate
2731 * the fill level we need the n_data field, which only exists
2732 * in newer versions. */
2734 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2735 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2736 log_debug("Data hash table of %s has a fill level at %.1f (%llu of %llu items, %llu file size, %llu bytes per hash table item), suggesting rotation.",
2738 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2739 (unsigned long long) le64toh(f->header->n_data),
2740 (unsigned long long) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)),
2741 (unsigned long long) (f->last_stat.st_size),
2742 (unsigned long long) (f->last_stat.st_size / le64toh(f->header->n_data)));
2746 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2747 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2748 log_debug("Field hash table of %s has a fill level at %.1f (%llu of %llu items), suggesting rotation.",
2750 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2751 (unsigned long long) le64toh(f->header->n_fields),
2752 (unsigned long long) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)));
2756 /* Are the data objects properly indexed by field objects? */
2757 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2758 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2759 le64toh(f->header->n_data) > 0 &&
2760 le64toh(f->header->n_fields) == 0)
2763 if (max_file_usec > 0) {
2766 h = le64toh(f->header->head_entry_realtime);
2767 t = now(CLOCK_REALTIME);
2769 if (h > 0 && t > h + max_file_usec)