1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
30 #include "journal-def.h"
31 #include "journal-file.h"
/* Default payload sizes, in bytes, of the data and field hash tables
 * created for a new file. */
35 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*16ULL)
36 #define DEFAULT_FIELD_HASH_TABLE_SIZE (2047ULL*16ULL)
/* Size that small mapping requests are widened to. */
38 #define DEFAULT_WINDOW_SIZE (8ULL*1024ULL*1024ULL)
/* Payloads of at least this many bytes are candidates for compression. */
40 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
42 /* This is the minimum journal file size */
43 #define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL) /* 64 KiB */
45 /* These are the lower and upper bounds if we deduce the max_use value
46 * from the file system size */
47 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
48 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
50 /* This is the upper bound if we deduce max_size from max_use */
51 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
53 /* This is the upper bound if we deduce the keep_free value from the file system size */
55 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
57 /* This is the keep_free value when we can't determine the file system size */
59 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MiB */
/* Magic bytes compared against the first 8 bytes of the file header. */
61 static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };
/* Round x up to the next multiple of 8. */
63 #define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
/* Tear down a JournalFile.  The visible steps set the header state to
 * STATE_OFFLINE, unmap the header and every mapped window, close the file
 * descriptor and free the compression buffer.  Any conditions guarding
 * these steps are not visible here. */
65 void journal_file_close(JournalFile *f) {
72 f->header->state = STATE_OFFLINE;
/* The header mapping covers sizeof(Header) rounded up to whole pages. */
74 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
/* Release each window slot that currently holds a mapping. */
77 for (t = 0; t < _WINDOW_MAX; t++)
78 if (f->windows[t].ptr)
79 munmap(f->windows[t].ptr, f->windows[t].size);
82 close_nointr_nofail(f->fd);
87 free(f->compress_buffer);
/* Fill in a local Header (signature, aligned header size, random file id)
 * and write it at file offset 0.  seqnum_id/seqnum are either copied from
 * the template's header or seqnum_id is set to the new file id; the
 * condition choosing between the two is not visible here. */
93 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
101 memcpy(h.signature, signature, 8);
/* header_size is stored little-endian, rounded up to a multiple of 8. */
102 h.header_size = htole64(ALIGN64(sizeof(h)));
104 r = sd_id128_randomize(&h.file_id);
109 h.seqnum_id = template->header->seqnum_id;
110 h.seqnum = template->header->seqnum;
112 h.seqnum_id = h.file_id;
114 k = pwrite(f->fd, &h, sizeof(h), 0);
/* Stamp the mapped header with the current machine id and boot id and set
 * its state to STATE_ONLINE. */
124 static int journal_file_refresh_header(JournalFile *f) {
130 r = sd_id128_get_machine(&f->header->machine_id);
134 r = sd_id128_get_boot(&boot_id);
/* When the header already carries the current boot id,
 * tail_entry_monotonic_valid is set. */
138 if (sd_id128_equal(boot_id, f->header->boot_id))
139 f->tail_entry_monotonic_valid = true;
141 f->header->boot_id = boot_id;
143 f->header->state = STATE_ONLINE;
/* Full memory barrier after the header stores above. */
145 __sync_synchronize();
/* Sanity-check the mapped header of an existing file.  The visible checks
 * cover the signature, unsupported incompatible feature flags, the header
 * size, the file being at least header_size + arena_size bytes long, the
 * machine id and the recorded state. */
150 static int journal_file_verify_header(JournalFile *f) {
/* The first 8 bytes of the header must equal the file signature. */
153 if (memcmp(f->header, signature, 8))
/* Any incompatible flag other than "compressed" is refused.  The flag
 * word is decoded with le32toh(), matching how journal_file_append_data()
 * updates it with htole32(le32toh(...)); le64toh() on this field gives
 * a wrong value on big-endian hosts. */
157 if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
158 return -EPROTONOSUPPORT;
/* Alternative check that refuses any incompatible flag at all; what selects
 * between the two checks is not visible here. */
160 if (f->header->incompatible_flags != 0)
161 return -EPROTONOSUPPORT;
/* The stored header size must equal the aligned size of the header struct. */
164 if (f->header->header_size != htole64(ALIGN64(sizeof(*(f->header)))))
/* The file must be large enough to hold header plus arena. */
167 if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
172 sd_id128_t machine_id;
175 r = sd_id128_get_machine(&machine_id);
/* Compare the header's machine id with the local machine id. */
179 if (!sd_id128_equal(machine_id, f->header->machine_id))
182 state = f->header->state;
/* An online or unknown state is only logged at debug level. */
184 if (state == STATE_ONLINE)
185 log_debug("Journal file %s is already online. Assuming unclean closing. Ignoring.", f->path);
186 else if (state == STATE_ARCHIVED)
188 else if (state != STATE_OFFLINE)
189 log_debug("Journal file %s has unknown state %u. Ignoring.", f->path, state);
/* Make sure the file is large enough to hold `size` bytes at `offset`.
 * The visible code computes the page-aligned new size, compares it with the
 * current header_size + arena_size, applies the max_size and keep_free
 * limits from f->metrics, extends the file with posix_fallocate(),
 * refreshes f->last_stat and records the new arena size in the header. */
195 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
196 uint64_t old_size, new_size;
201 /* We assume that this file is not sparse, and we know that
202 * for sure, since we always call posix_fallocate() */
206 le64toh(f->header->header_size) +
207 le64toh(f->header->arena_size);
209 new_size = PAGE_ALIGN(offset + size);
/* Never shrink below the header itself. */
210 if (new_size < le64toh(f->header->header_size))
211 new_size = le64toh(f->header->header_size);
213 if (new_size <= old_size)
/* A max_size of 0 disables the upper limit. */
216 if (f->metrics.max_size > 0 &&
217 new_size > f->metrics.max_size)
/* The free-space check only applies once the file would exceed min_size
 * and a keep_free reserve is configured. */
220 if (new_size > f->metrics.min_size &&
221 f->metrics.keep_free > 0) {
224 if (fstatvfs(f->fd, &svfs) >= 0) {
/* Widen both operands before multiplying so the product cannot wrap
 * where the statvfs fields are 32 bit wide (same form as the free-space
 * computation in journal_directory_vacuum()). */
227 available = (uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize;
229 if (available >= f->metrics.keep_free)
230 available -= f->metrics.keep_free;
234 if (new_size - old_size > available)
239 /* Note that the glibc fallocate() fallback is very
240 inefficient, hence we try to minimize the allocation area */
242 r = posix_fallocate(f->fd, old_size, new_size - old_size);
246 if (fstat(f->fd, &f->last_stat) < 0)
249 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
/* Map a byte range of the file with mmap().  The requested offset is
 * rounded down to a page boundary and the length rounded up to whole pages;
 * *ret receives the address corresponding to the original offset. */
254 static int journal_file_map(
263 uint64_t woffset, wsize;
/* woffset: offset rounded down to a page boundary; wsize: length covering
 * [woffset, offset + size), rounded up to whole pages. */
270 woffset = offset & ~((uint64_t) page_size() - 1ULL);
271 wsize = size + (offset - woffset);
272 wsize = PAGE_ALIGN(wsize);
274 /* Avoid SIGBUS on invalid accesses */
275 if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size))
276 return -EADDRNOTAVAIL;
278 window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
279 if (window == MAP_FAILED)
/* Hand back the pointer adjusted for the page-alignment slack. */
291 *ret = (uint8_t*) window + (offset - woffset);
/* Return in *ret a pointer to `size` bytes at file offset `offset`, using
 * the mapping window slot `wt`.  An existing window that already covers the
 * range is reused; otherwise the old window is unmapped and a new one is
 * established via journal_file_map(). */
296 static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
305 assert(wt < _WINDOW_MAX);
307 if (offset + size > (uint64_t) f->last_stat.st_size) {
308 /* Hmm, out of range? Let's refresh the fstat() data
309 * first, before we trust that check. */
311 if (fstat(f->fd, &f->last_stat) < 0 ||
312 offset + size > (uint64_t) f->last_stat.st_size)
313 return -EADDRNOTAVAIL;
/* Fast path: the current window fully contains the requested range. */
318 if (_likely_(w->ptr &&
319 w->offset <= offset &&
320 w->offset + w->size >= offset + size)) {
322 *ret = (uint8_t*) w->ptr + (offset - w->offset);
327 if (munmap(w->ptr, w->size) < 0)
331 w->size = w->offset = 0;
334 if (size < DEFAULT_WINDOW_SIZE) {
335 /* If the default window size is larger than what was
336 * asked for extend the mapping a bit in the hope to
337 * minimize needed remappings later on. We add half
338 * the window space before and half behind the
339 * requested mapping */
341 delta = (DEFAULT_WINDOW_SIZE - size) / 2;
347 size = DEFAULT_WINDOW_SIZE;
/* Clamp the (possibly widened) window to the end of the file. */
351 if (offset + size > (uint64_t) f->last_stat.st_size)
352 size = (uint64_t) f->last_stat.st_size - offset;
355 return -EADDRNOTAVAIL;
357 r = journal_file_map(f,
359 &w->ptr, &w->offset, &w->size,
/* Skip the extra bytes mapped in front of the requested offset. */
365 *ret = (uint8_t*) p + delta;
/* Recompute the payload hash of an uncompressed data object or of a field
 * object (h2) next to the hash stored in the object (h1).  How the two
 * values are compared is not visible here. */
369 static bool verify_hash(Object *o) {
374 if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
375 h1 = le64toh(o->data.hash);
/* Payload length is the object size minus the fixed part before the payload. */
376 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
377 } else if (o->object.type == OBJECT_FIELD) {
378 h1 = le64toh(o->field.hash);
379 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
/* Map the object at `offset`.  The object header is mapped first to learn
 * the object's size; if the object is larger than its header, the full
 * object is mapped in a second step.  A non-negative `type` must match the
 * type stored in the object. */
386 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
394 assert(type < _OBJECT_TYPE_MAX);
/* A negative type selects the WINDOW_UNKNOWN window slot. */
396 r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
401 s = le64toh(o->object.size);
/* An object can never be smaller than its own header. */
403 if (s < sizeof(ObjectHeader))
406 if (type >= 0 && o->object.type != type)
409 if (s > sizeof(ObjectHeader)) {
410 r = journal_file_move_to(f, o->object.type, offset, s, &t);
/* Produce the next sequence number: one more than the value stored in the
 * header, optionally reconciled with an external counter.  The result is
 * stored back into the header, and also into first_seqnum when that is
 * still 0. */
424 static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
429 r = le64toh(f->header->seqnum) + 1;
432 /* If an external seqnum counter was passed, we update
433 * both the local and the external one, and set it to
434 * the maximum of both */
442 f->header->seqnum = htole64(r);
444 if (f->header->first_seqnum == 0)
445 f->header->first_seqnum = htole64(r);
/* Append a new object of `type` and `size` bytes after the current tail
 * object.  The visible code locates the append position, grows the file,
 * maps the new range, fills in the object's type and size and updates
 * tail_object_offset and n_objects in the header. */
450 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
457 assert(size >= sizeof(ObjectHeader));
461 p = le64toh(f->header->tail_object_offset);
/* Alternative start position: directly behind the header. */
463 p = le64toh(f->header->header_size);
465 r = journal_file_move_to_object(f, -1, p, &tail);
/* The new object starts at the 8-byte aligned end of the tail object. */
469 p += ALIGN64(le64toh(tail->object.size));
472 r = journal_file_allocate(f, p, size);
476 r = journal_file_move_to(f, type, p, size, &t);
483 o->object.type = type;
484 o->object.size = htole64(size);
486 f->header->tail_object_offset = htole64(p);
487 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
/* Append a zero-filled data hash table object of the default size and
 * record the offset and size of its item array in the header. */
495 static int journal_file_setup_data_hash_table(JournalFile *f) {
502 s = DEFAULT_DATA_HASH_TABLE_SIZE;
503 r = journal_file_append_object(f,
504 OBJECT_DATA_HASH_TABLE,
505 offsetof(Object, hash_table.items) + s,
510 memset(o->hash_table.items, 0, s);
/* The header points at the items array, not at the object header. */
512 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
513 f->header->data_hash_table_size = htole64(s);
/* Append a zero-filled field hash table object of the default size and
 * record the offset and size of its item array in the header. */
518 static int journal_file_setup_field_hash_table(JournalFile *f) {
525 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
526 r = journal_file_append_object(f,
527 OBJECT_FIELD_HASH_TABLE,
528 offsetof(Object, hash_table.items) + s,
533 memset(o->hash_table.items, 0, s);
/* The header points at the items array, not at the object header. */
535 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
536 f->header->field_hash_table_size = htole64(s);
/* Map the data hash table described by the header (offset and size) into
 * its dedicated window and remember the pointer in f->data_hash_table. */
541 static int journal_file_map_data_hash_table(JournalFile *f) {
548 p = le64toh(f->header->data_hash_table_offset);
549 s = le64toh(f->header->data_hash_table_size);
551 r = journal_file_move_to(f,
552 WINDOW_DATA_HASH_TABLE,
558 f->data_hash_table = t;
/* Map the field hash table described by the header (offset and size) into
 * its dedicated window and remember the pointer in f->field_hash_table. */
562 static int journal_file_map_field_hash_table(JournalFile *f) {
569 p = le64toh(f->header->field_hash_table_offset);
570 s = le64toh(f->header->field_hash_table_size);
572 r = journal_file_move_to(f,
573 WINDOW_FIELD_HASH_TABLE,
579 f->field_hash_table = t;
/* Hook a freshly appended data object at `offset` into the data hash table
 * bucket selected by `hash`.  The object's link fields are reset, then the
 * object becomes either the bucket's head or the successor of the previous
 * tail object, and finally the bucket's new tail. */
583 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
590 assert(o->object.type == OBJECT_DATA);
592 /* This might alter the window we are looking at */
594 o->data.next_hash_offset = o->data.next_field_offset = 0;
595 o->data.entry_offset = o->data.entry_array_offset = 0;
596 o->data.n_entries = 0;
/* Bucket index: hash modulo the number of HashItem slots in the table. */
598 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
599 p = le64toh(f->data_hash_table[h].tail_hash_offset);
601 /* Only entry in the hash table is easy */
602 f->data_hash_table[h].head_hash_offset = htole64(offset);
604 /* Move back to the previous data object, to patch in its next_hash_offset */
607 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
611 o->data.next_hash_offset = htole64(offset);
614 f->data_hash_table[h].tail_hash_offset = htole64(offset);
/* Search the data hash table for an object whose payload equals
 * data[0..size) given its precomputed hash.  The bucket chain is walked via
 * next_hash_offset; objects with a different stored hash are passed over,
 * compressed objects are decompressed into f->compress_buffer before the
 * comparison, and uncompressed ones are compared in place. */
619 int journal_file_find_data_object_with_hash(
621 const void *data, uint64_t size, uint64_t hash,
622 Object **ret, uint64_t *offset) {
624 uint64_t p, osize, h;
628 assert(data || size == 0);
/* Total object size an uncompressed match would have. */
630 osize = offsetof(Object, data.payload) + size;
632 if (f->header->data_hash_table_size == 0)
635 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
636 p = le64toh(f->data_hash_table[h].head_hash_offset);
641 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
645 if (le64toh(o->data.hash) != hash)
648 if (o->object.flags & OBJECT_COMPRESSED) {
652 l = le64toh(o->object.size);
/* A compressed object must have a non-empty payload. */
653 if (l <= offsetof(Object, data.payload))
656 l -= offsetof(Object, data.payload);
658 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
662 memcmp(f->compress_buffer, data, size) == 0) {
673 return -EPROTONOSUPPORT;
676 } else if (le64toh(o->object.size) == osize &&
677 memcmp(o->data.payload, data, size) == 0) {
/* Advance to the next object in this bucket's chain. */
689 p = le64toh(o->data.next_hash_offset);
/* Convenience wrapper: hash data[0..size) with hash64() and delegate to
 * journal_file_find_data_object_with_hash(). */
695 int journal_file_find_data_object(
697 const void *data, uint64_t size,
698 Object **ret, uint64_t *offset) {
703 assert(data || size == 0);
705 hash = hash64(data, size);
707 return journal_file_find_data_object_with_hash(f,
/* Store data[0..size) as a data object.  The file is first searched for an
 * existing object with the same hash and payload; otherwise a new
 * OBJECT_DATA is appended, optionally compressed, filled in and linked into
 * the data hash table. */
712 static int journal_file_append_data(
714 const void *data, uint64_t size,
715 Object **ret, uint64_t *offset) {
721 bool compressed = false;
724 assert(data || size == 0);
726 hash = hash64(data, size);
728 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
/* The object is allocated at its uncompressed size. */
742 osize = offsetof(Object, data.payload) + size;
743 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
747 o->data.hash = htole64(hash);
751 size >= COMPRESSION_SIZE_THRESHOLD) {
754 compressed = compress_blob(data, size, o->data.payload, &rsize);
/* Shrink the recorded object size to the compressed length and flag both
 * the object and the file header as using compression. */
757 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
758 o->object.flags |= OBJECT_COMPRESSED;
760 f->header->incompatible_flags = htole32(le32toh(f->header->incompatible_flags) | HEADER_INCOMPATIBLE_COMPRESSED);
762 log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
768 memcpy(o->data.payload, data, size);
770 r = journal_file_link_data(f, o, p, hash);
774 /* The linking might have altered the window, so let's
775 * refresh our pointer */
776 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* Number of EntryItem slots in an entry object, derived from the object
 * size minus the fixed part that precedes the items array. */
789 uint64_t journal_file_entry_n_items(Object *o) {
791 assert(o->object.type == OBJECT_ENTRY);
793 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
/* Number of 64-bit offset slots in an entry array object, derived from the
 * object size minus the fixed part that precedes the items array. */
796 static uint64_t journal_file_entry_array_n_items(Object *o) {
798 assert(o->object.type == OBJECT_ENTRY_ARRAY);
800 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
/* Store entry offset `p` at logical index *idx of a chain of entry array
 * objects and increment *idx.  The chain is followed via
 * next_entry_array_offset; when no existing array has room, a new array
 * object is appended and linked behind the last one visited. */
803 static int link_entry_into_array(JournalFile *f,
808 uint64_t n = 0, ap = 0, q, i, a, hidx;
/* *idx is stored little-endian; hidx keeps the host-order value. */
817 i = hidx = le64toh(*idx);
820 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
824 n = journal_file_entry_array_n_items(o);
826 o->entry_array.items[i] = htole64(p);
827 *idx = htole64(hidx + 1);
833 a = le64toh(o->entry_array.next_entry_array_offset);
/* Append a new array object sized for n items. */
844 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
845 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
850 o->entry_array.items[i] = htole64(p);
/* Go back to the array at `ap` and point it at the new array at `q`. */
855 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
859 o->entry_array.next_entry_array_offset = htole64(q);
862 *idx = htole64(hidx + 1);
/* Variant of link_entry_into_array() for lists that keep one entry outside
 * the array chain: the array index passed down is *idx - 1, and *idx itself
 * is incremented afterwards. */
867 static int link_entry_into_array_plus_one(JournalFile *f,
886 i = htole64(le64toh(*idx) - 1);
887 r = link_entry_into_array(f, first, &i, p);
892 *idx = htole64(le64toh(*idx) + 1);
/* Link the entry at `offset` into the entry list of the data object
 * referenced by item `i` of entry object `o`.  Note that `o` is reused to
 * point at the data object after the move. */
896 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
903 p = le64toh(o->entry.items[i].object_offset);
907 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
911 return link_entry_into_array_plus_one(f,
912 &o->data.entry_offset,
913 &o->data.entry_array_offset,
/* Make a freshly written entry object reachable: add it to the file-wide
 * entry array, update the head/tail timestamps in the header, and link it
 * into the entry list of each data object it references. */
918 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
925 assert(o->object.type == OBJECT_ENTRY);
/* Full memory barrier before the entry is linked. */
927 __sync_synchronize();
929 /* Link up the entry itself */
930 r = link_entry_into_array(f,
931 &f->header->entry_array_offset,
932 &f->header->n_entries,
937 /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
/* head_entry_realtime is only set while it is still 0. */
939 if (f->header->head_entry_realtime == 0)
940 f->header->head_entry_realtime = o->entry.realtime;
942 f->header->tail_entry_realtime = o->entry.realtime;
943 f->header->tail_entry_monotonic = o->entry.monotonic;
945 f->tail_entry_monotonic_valid = true;
947 /* Link up the items */
948 n = journal_file_entry_n_items(o);
949 for (i = 0; i < n; i++) {
950 r = journal_file_link_entry_item(f, o, offset, i);
/* Append an entry object holding `n_items` prepared EntryItems, fill in its
 * sequence number, timestamps, xor hash and boot id, and link it into the
 * file via journal_file_link_entry(). */
958 static int journal_file_append_entry_internal(
960 const dual_timestamp *ts,
962 const EntryItem items[], unsigned n_items,
964 Object **ret, uint64_t *offset) {
971 assert(items || n_items == 0);
974 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
976 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
980 o->entry.seqnum = htole64(journal_file_seqnum(f, seqnum));
981 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
982 o->entry.realtime = htole64(ts->realtime);
983 o->entry.monotonic = htole64(ts->monotonic);
984 o->entry.xor_hash = htole64(xor_hash);
/* The boot id is copied verbatim from the file header. */
985 o->entry.boot_id = f->header->boot_id;
987 r = journal_file_link_entry(f, o, np);
/* Notify inotify watchers that the file changed.  The file is truncated to
 * the size recorded in f->last_stat, i.e. its length is left unchanged; a
 * failure is only logged. */
1000 void journal_file_post_change(JournalFile *f) {
1003 /* inotify() does not receive IN_MODIFY events from file
1004 * accesses done via mmap(). After each access we hence
1005 * trigger IN_MODIFY by truncating the journal file to its
1006 * current size which triggers IN_MODIFY. */
/* Full memory barrier before the truncate call. */
1008 __sync_synchronize();
1010 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1011 log_error("Failed to truncate file to its own size: %m");
/* Append a log entry made of `n_iovec` fields.  Each field is stored (or
 * found) as a data object, the per-field hashes are XORed together, and the
 * resulting item list is written as an entry object.  Afterwards
 * journal_file_post_change() is called. */
1014 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1018 uint64_t xor_hash = 0;
1019 struct dual_timestamp _ts;
1022 assert(iovec || n_iovec == 0);
/* _ts is filled with the current time; presumably used when the caller
 * passes no timestamp (the guarding condition is not visible here). */
1028 dual_timestamp_get(&_ts);
/* Compare the new monotonic timestamp with the file's tail entry, when
 * that tail value is known to be valid. */
1032 if (f->tail_entry_monotonic_valid &&
1033 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
/* The item list lives on the stack; its size grows with n_iovec. */
1036 items = alloca(sizeof(EntryItem) * n_iovec);
1038 for (i = 0; i < n_iovec; i++) {
1042 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1046 xor_hash ^= le64toh(o->data.hash);
1047 items[i].object_offset = htole64(p);
1048 items[i].hash = o->data.hash;
1051 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1053 journal_file_post_change(f);
/* Fetch the entry at logical index `i` of a chain of entry array objects:
 * walk the chain via next_entry_array_offset, read the stored entry offset
 * and move to that entry object. */
1058 static int generic_array_get(JournalFile *f,
1061 Object **ret, uint64_t *offset) {
1073 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1077 n = journal_file_entry_array_n_items(o);
1079 p = le64toh(o->entry_array.items[i]);
1084 a = le64toh(o->entry_array.next_entry_array_offset);
/* Both values are unsigned, so "<= 0" is the same as "== 0". */
1087 if (a <= 0 || p <= 0)
1090 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Like generic_array_get(), for lists that keep one entry (`extra`) outside
 * the array chain: the visible code either moves to the extra entry or
 * looks up index i-1 in the chain. */
1103 static int generic_array_get_plus_one(JournalFile *f,
1107 Object **ret, uint64_t *offset) {
1116 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1129 return generic_array_get(f, first, i-1, ret, offset);
/* Bisect a chain of entry array objects for `needle`.  test_object()
 * classifies a candidate entry offset as TEST_FOUND, TEST_LEFT or
 * TEST_RIGHT; `direction` decides how an exact match is treated.  On
 * success the matching entry object is mapped and its logical index is
 * stored in *idx. */
1138 static int generic_array_bisect(JournalFile *f,
1142 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1143 direction_t direction,
1148 uint64_t a, p, t = 0, i = 0, last_p = 0;
1149 bool subtract_one = false;
1150 Object *o, *array = NULL;
1154 assert(test_object);
1158 uint64_t left, right, k, lp;
1160 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1164 k = journal_file_entry_array_n_items(array);
/* Probe a single item of the current array before bisecting inside it. */
1170 lp = p = le64toh(array->entry_array.items[i]);
1174 r = test_object(f, p, needle);
/* An exact match is folded into TEST_RIGHT or TEST_LEFT depending on the
 * search direction. */
1178 if (r == TEST_FOUND)
1179 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1181 if (r == TEST_RIGHT) {
1185 if (left == right) {
1186 if (direction == DIRECTION_UP)
1187 subtract_one = true;
1193 assert(left < right);
/* Classic midpoint probe. */
1195 i = (left + right) / 2;
1196 p = le64toh(array->entry_array.items[i]);
1200 r = test_object(f, p, needle);
1204 if (r == TEST_FOUND)
1205 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1207 if (r == TEST_RIGHT)
1221 a = le64toh(array->entry_array.next_entry_array_offset);
/* subtract_one selects the item just before index i; the cases where i is
 * 0 are handled separately. */
1227 if (subtract_one && t == 0 && i == 0)
1230 if (subtract_one && i == 0)
1232 else if (subtract_one)
1233 p = le64toh(array->entry_array.items[i-1]);
1235 p = le64toh(array->entry_array.items[i]);
1237 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Logical index = t + index inside the current array, minus the
 * subtract_one correction. */
1248 *idx = t + i - (subtract_one ? 1 : 0);
1253 static int generic_array_bisect_plus_one(JournalFile *f,
1258 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1259 direction_t direction,
1267 assert(test_object);
1272 /* This bisects the array in object 'first', but first checks
1274 r = test_object(f, extra, needle);
1277 else if (r == TEST_FOUND) {
1280 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1294 } else if (r == TEST_RIGHT)
1297 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
/* Bisection predicate: compare the seqnum of the entry at offset `p` with
 * `needle`. */
1305 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1312 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1316 if (le64toh(o->entry.seqnum) == needle)
1318 else if (le64toh(o->entry.seqnum) < needle)
/* Bisect the file-wide entry array (entry_array_offset, n_entries from the
 * header) via generic_array_bisect(). */
1324 int journal_file_move_to_entry_by_seqnum(
1327 direction_t direction,
1331 return generic_array_bisect(f,
1332 le64toh(f->header->entry_array_offset),
1333 le64toh(f->header->n_entries),
/* Bisection predicate: compare the realtime timestamp of the entry at
 * offset `p` with `needle`. */
1340 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1347 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1351 if (le64toh(o->entry.realtime) == needle)
1353 else if (le64toh(o->entry.realtime) < needle)
/* Bisect the file-wide entry array using test_object_realtime() as the
 * predicate. */
1359 int journal_file_move_to_entry_by_realtime(
1362 direction_t direction,
1366 return generic_array_bisect(f,
1367 le64toh(f->header->entry_array_offset),
1368 le64toh(f->header->n_entries),
1370 test_object_realtime,
/* Bisection predicate: compare the monotonic timestamp of the entry at
 * offset `p` with `needle`. */
1375 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1382 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1386 if (le64toh(o->entry.monotonic) == needle)
1388 else if (le64toh(o->entry.monotonic) < needle)
/* Seek to an entry by monotonic timestamp within one boot.  The data object
 * for "_BOOT_ID=<boot id>" is looked up first, then that object's entry
 * list is bisected with test_object_monotonic(). */
1394 int journal_file_move_to_entry_by_monotonic(
1398 direction_t direction,
/* "_BOOT_ID=" is 9 characters long; the buffer holds the prefix, the
 * 32-character id string and the terminating NUL. */
1402 char t[9+32+1] = "_BOOT_ID=";
/* Format the id directly behind the '=' so that the key has the
 * "_BOOT_ID=<id>" form. */
1406 sd_id128_to_string(boot_id, t + 9);
1408 r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1414 return generic_array_bisect_plus_one(f,
1415 le64toh(o->data.entry_offset),
1416 le64toh(o->data.entry_array_offset),
1417 le64toh(o->data.n_entries),
1419 test_object_monotonic,
/* Bisection predicate that compares the entry offset `p` itself with
 * `needle`; no object needs to be mapped for this. */
1424 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1430 else if (p < needle)
/* Step to the entry adjacent to entry `o` at offset `p` in the given
 * direction.  Without a current entry, the start index is the first entry
 * (DIRECTION_DOWN) or the last one (otherwise). */
1436 int journal_file_next_entry(
1438 Object *o, uint64_t p,
1439 direction_t direction,
1440 Object **ret, uint64_t *offset) {
/* A current entry object must come with a non-zero offset. */
1446 assert(p > 0 || !o);
1448 n = le64toh(f->header->n_entries);
1453 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1455 if (o->object.type != OBJECT_ENTRY)
/* Bisect the file-wide entry array; the remaining arguments are not
 * visible here. */
1458 r = generic_array_bisect(f,
1459 le64toh(f->header->entry_array_offset),
1460 le64toh(f->header->n_entries),
1469 if (direction == DIRECTION_DOWN) {
1482 /* And jump to it */
1483 return generic_array_get(f,
1484 le64toh(f->header->entry_array_offset),
/* Move `skip` entries away from the current entry: a negative `skip` lowers
 * the index, a positive one raises it.  The target entry is then fetched
 * from the file-wide entry array. */
1489 int journal_file_skip_entry(
1491 Object *o, uint64_t p,
1493 Object **ret, uint64_t *offset) {
1502 if (o->object.type != OBJECT_ENTRY)
1505 r = generic_array_bisect(f,
1506 le64toh(f->header->entry_array_offset),
1507 le64toh(f->header->n_entries),
1516 /* Calculate new index */
/* Guard against moving backwards past index 0. */
1518 if ((uint64_t) -skip >= i)
1521 i = i - (uint64_t) -skip;
1523 i += (uint64_t) skip;
1525 n = le64toh(f->header->n_entries);
1532 return generic_array_get(f,
1533 le64toh(f->header->entry_array_offset),
/* Like journal_file_next_entry(), but restricted to the entries linked
 * from the data object at `data_offset` (its entry_offset,
 * entry_array_offset and n_entries fields). */
1538 int journal_file_next_entry_for_data(
1540 Object *o, uint64_t p,
1541 uint64_t data_offset,
1542 direction_t direction,
1543 Object **ret, uint64_t *offset) {
1550 assert(p > 0 || !o);
1552 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1556 n = le64toh(d->data.n_entries);
1561 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1563 if (o->object.type != OBJECT_ENTRY)
1566 r = generic_array_bisect_plus_one(f,
1567 le64toh(d->data.entry_offset),
1568 le64toh(d->data.entry_array_offset),
1569 le64toh(d->data.n_entries),
1579 if (direction == DIRECTION_DOWN) {
1593 return generic_array_get_plus_one(f,
1594 le64toh(d->data.entry_offset),
1595 le64toh(d->data.entry_array_offset),
/* Bisect the entry list of the data object at `data_offset` via
 * generic_array_bisect_plus_one(). */
1600 int journal_file_move_to_entry_by_seqnum_for_data(
1602 uint64_t data_offset,
1604 direction_t direction,
1605 Object **ret, uint64_t *offset) {
1610 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1614 return generic_array_bisect_plus_one(f,
1615 le64toh(d->data.entry_offset),
1616 le64toh(d->data.entry_array_offset),
1617 le64toh(d->data.n_entries),
/* Bisect the entry list of the data object at `data_offset` using
 * test_object_realtime() as the predicate. */
1624 int journal_file_move_to_entry_by_realtime_for_data(
1626 uint64_t data_offset,
1628 direction_t direction,
1629 Object **ret, uint64_t *offset) {
1634 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1638 return generic_array_bisect_plus_one(f,
1639 le64toh(d->data.entry_offset),
1640 le64toh(d->data.entry_array_offset),
1641 le64toh(d->data.n_entries),
1643 test_object_realtime,
/* Print a human-readable summary of the file to stdout: header fields
 * first, then one line per object, walking from the end of the header up to
 * the tail object. */
1648 void journal_file_dump(JournalFile *f) {
/* 33 bytes each: buffers for the string form of the three ids below. */
1649 char a[33], b[33], c[33];
1656 printf("File Path: %s\n"
1660 "Arena size: %llu\n"
1664 sd_id128_to_string(f->header->file_id, a),
1665 sd_id128_to_string(f->header->machine_id, b),
1666 sd_id128_to_string(f->header->boot_id, c),
1667 (unsigned long long) le64toh(f->header->arena_size),
1668 (unsigned long) le64toh(f->header->n_objects),
1669 (unsigned long) le64toh(f->header->n_entries));
/* The first object starts right behind the header. */
1671 p = le64toh(f->header->header_size);
1673 r = journal_file_move_to_object(f, -1, p, &o);
1677 switch (o->object.type) {
1680 printf("Type: OBJECT_UNUSED\n");
1684 printf("Type: OBJECT_DATA\n");
1688 printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1689 (unsigned long long) le64toh(o->entry.seqnum),
1690 (unsigned long long) le64toh(o->entry.monotonic),
1691 (unsigned long long) le64toh(o->entry.realtime));
1694 case OBJECT_FIELD_HASH_TABLE:
1695 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1698 case OBJECT_DATA_HASH_TABLE:
1699 printf("Type: OBJECT_DATA_HASH_TABLE\n");
1702 case OBJECT_ENTRY_ARRAY:
1703 printf("Type: OBJECT_ENTRY_ARRAY\n");
1706 case OBJECT_SIGNATURE:
1707 printf("Type: OBJECT_SIGNATURE\n");
1711 if (o->object.flags & OBJECT_COMPRESSED)
1712 printf("Flags: COMPRESSED\n");
/* The tail object is the last one to visit. */
1714 if (p == le64toh(f->header->tail_object_offset))
/* Objects are laid out back to back at 8-byte aligned offsets. */
1717 p = p + ALIGN64(le64toh(o->object.size));
1722 log_error("File corrupt");
/* Open (and, for an empty writable file, initialise) a journal file.  The
 * visible steps validate the access mode and the ".journal" suffix,
 * allocate the JournalFile, inherit metrics and compression settings from
 * the template, open and stat the file, write a header into an empty
 * writable file, map the header, verify it for pre-existing files, refresh
 * it, and set up or map the two hash tables.  journal_file_close() is
 * called from a path whose trigger is not visible here. */
1725 int journal_file_open(
1729 JournalFile *template,
1730 JournalFile **ret) {
1734 bool newly_created = false;
/* Only read-only and read-write access modes are accepted. */
1738 if ((flags & O_ACCMODE) != O_RDONLY &&
1739 (flags & O_ACCMODE) != O_RDWR)
1742 if (!endswith(fname, ".journal"))
1745 f = new0(JournalFile, 1);
1752 f->writable = (flags & O_ACCMODE) != O_RDONLY;
1753 f->prot = prot_from_flags(flags);
1756 f->metrics = template->metrics;
1757 f->compress = template->compress;
1760 f->path = strdup(fname);
1766 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
1772 if (fstat(f->fd, &f->last_stat) < 0) {
/* An empty writable file is treated as new and gets a fresh header. */
1777 if (f->last_stat.st_size == 0 && f->writable) {
1778 newly_created = true;
1780 r = journal_file_init_header(f, template);
1784 if (fstat(f->fd, &f->last_stat) < 0) {
/* The file must at least contain a complete header. */
1790 if (f->last_stat.st_size < (off_t) sizeof(Header)) {
1795 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
1796 if (f->header == MAP_FAILED) {
1802 if (!newly_created) {
1803 r = journal_file_verify_header(f);
1809 r = journal_file_refresh_header(f);
1814 if (newly_created) {
1816 r = journal_file_setup_field_hash_table(f);
1820 r = journal_file_setup_data_hash_table(f);
1825 r = journal_file_map_field_hash_table(f);
1829 r = journal_file_map_data_hash_table(f);
1839 journal_file_close(f);
/* Rotate a writable journal file: rename it to an archive name built from
 * its path (without ".journal"), the seqnum id, the current seqnum and the
 * tail realtime timestamp, mark it STATE_ARCHIVED, open a new file under
 * the original path using the old file as template, and close the old one. */
1844 int journal_file_rotate(JournalFile **f) {
1847 JournalFile *old_file, *new_file = NULL;
1855 if (!old_file->writable)
1858 if (!endswith(old_file->path, ".journal"))
1861 l = strlen(old_file->path);
/* Room for the path plus 1 + 32 + 1 + 16 + 1 + 16 + 1 further bytes. */
1863 p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
/* Copy the path without its 8-character ".journal" suffix. */
1867 memcpy(p, old_file->path, l - 8);
1869 sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
1870 snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
1871 "-%016llx-%016llx.journal",
1872 (unsigned long long) le64toh((*f)->header->seqnum),
1873 (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
1875 r = rename(old_file->path, p);
1881 old_file->header->state = STATE_ARCHIVED;
1883 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, old_file, &new_file);
1884 journal_file_close(old_file);
/* Open a journal file, recovering from a damaged or foreign one.  When the
 * first journal_file_open() fails with one of the listed error codes, the
 * existing file is renamed to a "...journal~" name, a warning is logged and
 * the open is retried once.  The read-only and O_CREAT checks are visible,
 * but what they return is not. */
1890 int journal_file_open_reliably(
1894 JournalFile *template,
1895 JournalFile **ret) {
1901 r = journal_file_open(fname, flags, mode, template, ret);
1902 if (r != -EBADMSG && /* corrupted */
1903 r != -ENODATA && /* truncated */
1904 r != -EHOSTDOWN && /* other machine */
1905 r != -EPROTONOSUPPORT) /* incompatible feature */
1908 if ((flags & O_ACCMODE) == O_RDONLY)
1911 if (!(flags & O_CREAT))
1914 /* The file is corrupted. Rotate it away and try it again (but only once) */
1917 if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
1919 (unsigned long long) now(CLOCK_REALTIME),
1923 r = rename(fname, p);
1928 log_warning("File %s corrupted, renaming and replacing.", fname);
1930 return journal_file_open(fname, flags, mode, template, ret);
/* Per-file record collected by journal_directory_vacuum().  Besides
 * seqnum_id, the code in this file also accesses filename, usage, seqnum,
 * realtime and have_seqnum members. */
1933 struct vacuum_info {
1938 sd_id128_t seqnum_id;
/* qsort() comparator for struct vacuum_info.  Files that share a seqnum id
 * are ordered by seqnum; otherwise files are ordered by realtime, then by
 * seqnum id when both have one, and finally by filename. */
1944 static int vacuum_compare(const void *_a, const void *_b) {
1945 const struct vacuum_info *a, *b;
1950 if (a->have_seqnum && b->have_seqnum &&
1951 sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
1952 if (a->seqnum < b->seqnum)
1954 else if (a->seqnum > b->seqnum)
1960 if (a->realtime < b->realtime)
1962 else if (a->realtime > b->realtime)
1964 else if (a->have_seqnum && b->have_seqnum)
1965 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
1967 return strcmp(a->filename, b->filename);
/* Delete archived and corrupted journal files in `directory`, following the
 * order given by vacuum_compare(), until total usage is at most `max_use`
 * and the file system has at least `min_free` bytes available.  Only
 * regular files whose names match the archive ("...@<id>-<seqnum>-<time>
 * .journal") or corrupted ("...@<time>-<random>.journal~") pattern are
 * considered. */
1970 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
1973 struct vacuum_info *list = NULL;
1974 unsigned n_list = 0, n_allocated = 0, i;
1982 d = opendir(directory);
1988 struct dirent buf, *de;
1992 unsigned long long seqnum = 0, realtime;
1993 sd_id128_t seqnum_id;
1996 k = readdir_r(d, &buf, &de);
/* Symlinks are not followed when examining directory entries. */
2005 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
2008 if (!S_ISREG(st.st_mode))
2011 q = strlen(de->d_name);
2013 if (endswith(de->d_name, ".journal")) {
2015 /* Vacuum archived files */
/* Minimum length of "@" + 32 + "-" + 16 + "-" + 16 + ".journal". */
2017 if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
/* Check the separator characters at their fixed positions from the end. */
2020 if (de->d_name[q-8-16-1] != '-' ||
2021 de->d_name[q-8-16-1-16-1] != '-' ||
2022 de->d_name[q-8-16-1-16-1-32-1] != '@')
/* Keep a copy of the name before it is modified in place below. */
2025 p = strdup(de->d_name);
/* Cut the name behind the 32-character id so it can be parsed alone. */
2031 de->d_name[q-8-16-1-16-1] = 0;
2032 if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
2037 if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
2044 } else if (endswith(de->d_name, ".journal~")) {
2045 unsigned long long tmp;
2047 /* Vacuum corrupted files */
2049 if (q < 1 + 16 + 1 + 16 + 8 + 1)
2052 if (de->d_name[q-1-8-16-1] != '-' ||
2053 de->d_name[q-1-8-16-1-16-1] != '@')
2056 p = strdup(de->d_name);
2062 if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
2067 have_seqnum = false;
/* Grow the list geometrically, starting at 8 slots. */
2071 if (n_list >= n_allocated) {
2072 struct vacuum_info *j;
2074 n_allocated = MAX(n_allocated * 2U, 8U);
2075 j = realloc(list, n_allocated * sizeof(struct vacuum_info));
2085 list[n_list].filename = p;
/* Disk usage is computed as st_blocks times 512 bytes. */
2086 list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
2087 list[n_list].seqnum = seqnum;
2088 list[n_list].realtime = realtime;
2089 list[n_list].seqnum_id = seqnum_id;
2090 list[n_list].have_seqnum = have_seqnum;
2092 sum += list[n_list].usage;
2097 qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
2099 for(i = 0; i < n_list; i++) {
/* Re-read the file system statistics before every deletion decision. */
2102 if (fstatvfs(dirfd(d), &ss) < 0) {
2107 if (sum <= max_use &&
2108 (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
2111 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
2112 log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
2113 sum -= list[i].usage;
/* A file that has already vanished is not worth a warning. */
2114 } else if (errno != ENOENT)
2115 log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
2119 for (i = 0; i < n_list; i++)
2120 free(list[i].filename);
/* Copy entry `o` (at offset `p` in `from`) into `to`.  Each referenced data
 * object is read from the source, decompressed if necessary, and appended
 * to the destination; the entry is then written with the original
 * timestamps. */
2130 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2132 uint64_t q, xor_hash = 0;
2145 ts.monotonic = le64toh(o->entry.monotonic);
2146 ts.realtime = le64toh(o->entry.realtime);
/* Compare the entry's timestamps with the destination's tail entry. */
2148 if (to->tail_entry_monotonic_valid &&
2149 ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2152 if (ts.realtime < le64toh(to->header->tail_entry_realtime))
2155 n = journal_file_entry_n_items(o);
/* The item list lives on the stack; its size grows with n. */
2156 items = alloca(sizeof(EntryItem) * n);
2158 for (i = 0; i < n; i++) {
2165 q = le64toh(o->entry.items[i].object_offset);
2166 le_hash = o->entry.items[i].hash;
/* From here on `o` points at the data object, not at the entry. */
2168 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
/* The hash cached in the entry item must match the data object's hash. */
2172 if (le_hash != o->data.hash)
2175 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2178 /* We hit the limit on 32bit machines */
2179 if ((uint64_t) t != l)
2182 if (o->object.flags & OBJECT_COMPRESSED) {
2186 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2189 data = from->compress_buffer;
2192 return -EPROTONOSUPPORT;
2195 data = o->data.payload;
2197 r = journal_file_append_data(to, data, l, &u, &h);
2201 xor_hash ^= le64toh(u->data.hash);
2202 items[i].object_offset = htole64(h);
2203 items[i].hash = u->data.hash;
/* Move back to the source entry so the next item can be read. */
2205 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2210 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
/* Fill in unset (== (uint64_t) -1) members of *m with defaults and bring
 * the values into a consistent state.  max_use and keep_free are derived
 * from the size of the file system behind `fd` when it can be determined;
 * max_size is derived from max_use.  The visible code page-aligns the
 * values and enforces the lower and upper bounds defined at the top of
 * this file, then logs the resulting settings. */
2213 void journal_default_metrics(JournalMetrics *m, int fd) {
2214 uint64_t fs_size = 0;
2216 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
/* Widen both operands before multiplying so the product cannot wrap
 * where the statvfs fields are 32 bit wide (same form as the free-space
 * computation in journal_directory_vacuum()). */
2221 if (fstatvfs(fd, &ss) >= 0)
2222 fs_size = (uint64_t) ss.f_frsize * (uint64_t) ss.f_blocks;
2224 if (m->max_use == (uint64_t) -1) {
2227 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2229 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2230 m->max_use = DEFAULT_MAX_USE_UPPER;
2232 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2233 m->max_use = DEFAULT_MAX_USE_LOWER;
2235 m->max_use = DEFAULT_MAX_USE_LOWER;
2237 m->max_use = PAGE_ALIGN(m->max_use);
/* max_use must leave room for at least two minimum-size files. */
2239 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2240 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2243 if (m->max_size == (uint64_t) -1) {
2244 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2246 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2247 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2249 m->max_size = PAGE_ALIGN(m->max_size);
2251 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2252 m->max_size = JOURNAL_FILE_SIZE_MIN;
/* max_use is raised so that it covers at least two files of max_size. */
2254 if (m->max_size*2 > m->max_use)
2255 m->max_use = m->max_size*2;
2257 if (m->min_size == (uint64_t) -1)
2258 m->min_size = JOURNAL_FILE_SIZE_MIN;
2260 m->min_size = PAGE_ALIGN(m->min_size);
2262 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2263 m->min_size = JOURNAL_FILE_SIZE_MIN;
/* min_size wins over max_size when the two conflict. */
2265 if (m->min_size > m->max_size)
2266 m->max_size = m->min_size;
2269 if (m->keep_free == (uint64_t) -1) {
2272 m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2274 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2275 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2278 m->keep_free = DEFAULT_KEEP_FREE;
2281 log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2282 format_bytes(a, sizeof(a), m->max_use),
2283 format_bytes(b, sizeof(b), m->max_size),
2284 format_bytes(c, sizeof(c), m->min_size),
2285 format_bytes(d, sizeof(d), m->keep_free));