1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
30 #include "journal-def.h"
31 #include "journal-file.h"
/* Sizes in bytes of the item arrays of the data and field hash table
 * objects that are set up in a newly created journal file. */
35 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*16ULL)
36 #define DEFAULT_FIELD_HASH_TABLE_SIZE (2047ULL*16ULL)
/* Requests smaller than this are widened to this size when a window is
 * (re)mapped. */
38 #define DEFAULT_WINDOW_SIZE (8ULL*1024ULL*1024ULL)
/* Payloads of at least this many bytes are candidates for compression. */
40 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
42 /* This is the minimum journal file size */
43 #define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL) /* 64 KiB */
45 /* These are the lower and upper bounds if we deduce the max_use value
46 * from the file system size */
47 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
48 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
50 /* This is the upper bound if we deduce max_size from max_use */
51 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
53 /* This is the upper bound if we deduce the keep_free value from the file system size */
55 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
57 /* This is the keep_free value when we can't determine the file system size */
59 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
/* Eight magic bytes stored in, and expected at the start of, every
 * journal file header. */
61 static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };
/* Rounds x up to the next multiple of 8. */
63 #define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
/* Releases a JournalFile: sets the header state to STATE_OFFLINE, unmaps
 * the header and every mapped window, closes the file descriptor and
 * frees the compression buffer. */
65 void journal_file_close(JournalFile *f) {
72 f->header->state = STATE_OFFLINE;
74 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
/* Drop the mapping of each window type that currently has one. */
77 for (t = 0; t < _WINDOW_MAX; t++)
78 if (f->windows[t].ptr)
79 munmap(f->windows[t].ptr, f->windows[t].size);
82 close_nointr_nofail(f->fd);
87 free(f->compress_buffer);
/* Fills in a new Header (signature, 8-byte-aligned header size, random
 * file id) and writes it to the start of f->fd with pwrite(). The seqnum
 * id and seqnum are copied from the template's header, or the seqnum id
 * is set to the new file id -- presumably depending on whether a template
 * was passed; confirm against the branch condition. */
93 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
101 memcpy(h.signature, signature, 8);
102 h.header_size = htole64(ALIGN64(sizeof(h)));
104 r = sd_id128_randomize(&h.file_id);
109 h.seqnum_id = template->header->seqnum_id;
110 h.seqnum = template->header->seqnum;
112 h.seqnum_id = h.file_id;
114 k = pwrite(f->fd, &h, sizeof(h), 0);
/* Updates the mapped header for the current machine and boot: stores the
 * machine id, marks the tail monotonic timestamp as valid if the header
 * already carries the current boot id, stores the current boot id and
 * switches the state to STATE_ONLINE. */
124 static int journal_file_refresh_header(JournalFile *f) {
130 r = sd_id128_get_machine(&f->header->machine_id);
134 r = sd_id128_get_boot(&boot_id);
138 if (sd_id128_equal(boot_id, f->header->boot_id))
139 f->tail_entry_monotonic_valid = true;
141 f->header->boot_id = boot_id;
143 f->header->state = STATE_ONLINE;
/* Full memory barrier after the header updates. */
145 __sync_synchronize();
/* Sanity-checks the mapped header of an existing journal file: the
 * signature, the incompatible feature flags, the header size, that the
 * file is at least header_size + arena_size bytes long, the machine id,
 * and finally the recorded state (an unexpected state is only logged). */
150 static int journal_file_verify_header(JournalFile *f) {
153 if (memcmp(f->header, signature, 8))
/* incompatible_flags is written as a 32-bit little-endian value
 * (htole32()/le32toh() in journal_file_append_data()), so it has to be
 * decoded with le32toh() here as well; le64toh() yields a wrong value on
 * big-endian hosts. */
157 if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
158 return -EPROTONOSUPPORT;
160 if (f->header->incompatible_flags != 0)
161 return -EPROTONOSUPPORT;
164 if (f->header->header_size != htole64(ALIGN64(sizeof(*(f->header)))))
167 if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
172 sd_id128_t machine_id;
175 r = sd_id128_get_machine(&machine_id);
179 if (!sd_id128_equal(machine_id, f->header->machine_id))
182 state = f->header->state;
184 if (state == STATE_ONLINE)
185 log_debug("Journal file %s is already online. Assuming unclean closing. Ignoring.", f->path);
186 else if (state == STATE_ARCHIVED)
188 else if (state != STATE_OFFLINE)
189 log_debug("Journal file %s has unknown state %u. Ignoring.", f->path, state);
/* Makes sure the file is large enough to hold size bytes at offset. The
 * current size is taken from the header (header_size + arena_size); the
 * wanted size is page-aligned and never smaller than the header. Growth
 * is checked against metrics.max_size and, above metrics.min_size,
 * against the free disk space minus metrics.keep_free. On growth the
 * space is allocated with posix_fallocate(), last_stat is refreshed and
 * arena_size is updated in the header. */
195 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
196 uint64_t old_size, new_size;
201 /* We assume that this file is not sparse, and we know that
202 * for sure, since we always call posix_fallocate() */
206 le64toh(f->header->header_size) +
207 le64toh(f->header->arena_size);
209 new_size = PAGE_ALIGN(offset + size);
210 if (new_size < le64toh(f->header->header_size))
211 new_size = le64toh(f->header->header_size);
213 if (new_size <= old_size)
216 if (f->metrics.max_size > 0 &&
217 new_size > f->metrics.max_size)
220 if (new_size > f->metrics.min_size &&
221 f->metrics.keep_free > 0) {
224 if (fstatvfs(f->fd, &svfs) >= 0) {
/* Widen both factors first: the statvfs fields may be 32 bits wide, in
 * which case the product would wrap before being stored. */
227 available = (uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize;
229 if (available >= f->metrics.keep_free)
230 available -= f->metrics.keep_free;
234 if (new_size - old_size > available)
239 /* Note that the glibc fallocate() fallback is very
240 inefficient, hence we try to minimize the allocation area */
242 r = posix_fallocate(f->fd, old_size, new_size - old_size);
246 if (fstat(f->fd, &f->last_stat) < 0)
249 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
/* Maps a page-aligned window of the file that contains the range
 * [offset, offset + size) and stores in *ret a pointer to the byte at
 * offset inside that window. */
254 static int journal_file_map(
263 uint64_t woffset, wsize;
/* Round the start down to a page boundary and grow the length by the
 * same amount, then round the length up to whole pages. */
270 woffset = offset & ~((uint64_t) page_size() - 1ULL);
271 wsize = size + (offset - woffset);
272 wsize = PAGE_ALIGN(wsize);
274 /* Avoid SIGBUS on invalid accesses */
275 if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size))
276 return -EADDRNOTAVAIL;
278 window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
279 if (window == MAP_FAILED)
291 *ret = (uint8_t*) window + (offset - woffset);
/* Makes the byte range [offset, offset + size) accessible through the
 * window of type wt and returns a pointer to it in *ret. Ranges beyond
 * the file size (re-checked with fstat()) yield -EADDRNOTAVAIL. A window
 * that already covers the range is reused; otherwise the old mapping is
 * dropped and a new one is created via journal_file_map(). */
296 static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
305 assert(wt < _WINDOW_MAX);
307 if (offset + size > (uint64_t) f->last_stat.st_size) {
308 /* Hmm, out of range? Let's refresh the fstat() data
309 * first, before we trust that check. */
311 if (fstat(f->fd, &f->last_stat) < 0 ||
312 offset + size > (uint64_t) f->last_stat.st_size)
313 return -EADDRNOTAVAIL;
/* Fast path: the current window already contains the requested range. */
318 if (_likely_(w->ptr &&
319 w->offset <= offset &&
320 w->offset + w->size >= offset + size)) {
322 *ret = (uint8_t*) w->ptr + (offset - w->offset);
327 if (munmap(w->ptr, w->size) < 0)
331 w->size = w->offset = 0;
334 if (size < DEFAULT_WINDOW_SIZE) {
335 /* If the default window size is larger than what was
336 * asked for extend the mapping a bit in the hope to
337 * minimize needed remappings later on. We add half
338 * the window space before and half behind the
339 * requested mapping */
341 delta = (DEFAULT_WINDOW_SIZE - size) / 2;
347 size = DEFAULT_WINDOW_SIZE;
/* Never let the (possibly widened) window extend past the end of the file. */
351 if (offset + size > (uint64_t) f->last_stat.st_size)
352 size = (uint64_t) f->last_stat.st_size - offset;
355 return -EADDRNOTAVAIL;
357 r = journal_file_map(f,
359 &w->ptr, &w->offset, &w->size,
/* Skip the extra bytes mapped in front of the requested range. */
365 *ret = (uint8_t*) p + delta;
/* For an uncompressed data object or a field object, loads the hash
 * stored in the object (h1) and recomputes hash64() over its payload
 * (h2). The payload length is the object size minus the payload's offset
 * within Object. */
369 static bool verify_hash(Object *o) {
374 if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
375 h1 = le64toh(o->data.hash);
376 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
377 } else if (o->object.type == OBJECT_FIELD) {
378 h1 = le64toh(o->field.hash);
379 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
/* Maps the object located at offset. Only the ObjectHeader is mapped at
 * first, to learn the object's size; the size is checked against
 * sizeof(ObjectHeader) and, if type >= 0, the object type against type.
 * Objects larger than the header are then mapped in full through the
 * window of their own type. A negative type uses WINDOW_UNKNOWN for the
 * initial mapping. */
386 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
394 assert(type < _OBJECT_TYPE_MAX);
396 r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
401 s = le64toh(o->object.size);
403 if (s < sizeof(ObjectHeader))
406 if (type >= 0 && o->object.type != type)
409 if (s > sizeof(ObjectHeader)) {
410 r = journal_file_move_to(f, o->object.type, offset, s, &t);
/* Computes the next sequence number (header seqnum + 1), stores it in the
 * header and, if first_seqnum is still 0, records it there as well. */
424 static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
429 r = le64toh(f->header->seqnum) + 1;
432 /* If an external seqnum counter was passed, we update
433 * both the local and the external one, and set it to
434 * the maximum of both */
442 f->header->seqnum = htole64(r);
444 if (f->header->first_seqnum == 0)
445 f->header->first_seqnum = htole64(r);
/* Appends a new object of the given type and size to the file. The new
 * offset is derived from the tail object (its offset plus its 8-byte
 * aligned size) or from header_size. The space is allocated and mapped,
 * the object's type and size are filled in, and tail_object_offset and
 * n_objects in the header are updated. */
450 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
457 assert(size >= sizeof(ObjectHeader));
461 p = le64toh(f->header->tail_object_offset);
463 p = le64toh(f->header->header_size);
465 r = journal_file_move_to_object(f, -1, p, &tail);
469 p += ALIGN64(le64toh(tail->object.size));
472 r = journal_file_allocate(f, p, size);
476 r = journal_file_move_to(f, type, p, size, &t);
483 o->object.type = type;
484 o->object.size = htole64(size);
486 f->header->tail_object_offset = htole64(p);
487 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
/* Appends an OBJECT_DATA_HASH_TABLE object with
 * DEFAULT_DATA_HASH_TABLE_SIZE bytes of items, zeroes the items and
 * records the offset and size of the item array in the header. */
495 static int journal_file_setup_data_hash_table(JournalFile *f) {
502 s = DEFAULT_DATA_HASH_TABLE_SIZE;
503 r = journal_file_append_object(f,
504 OBJECT_DATA_HASH_TABLE,
505 offsetof(Object, hash_table.items) + s,
510 memset(o->hash_table.items, 0, s);
/* The header points at the item array, not at the object header. */
512 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
513 f->header->data_hash_table_size = htole64(s);
/* Appends an OBJECT_FIELD_HASH_TABLE object with
 * DEFAULT_FIELD_HASH_TABLE_SIZE bytes of items, zeroes the items and
 * records the offset and size of the item array in the header. */
518 static int journal_file_setup_field_hash_table(JournalFile *f) {
525 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
526 r = journal_file_append_object(f,
527 OBJECT_FIELD_HASH_TABLE,
528 offsetof(Object, hash_table.items) + s,
533 memset(o->hash_table.items, 0, s);
/* The header points at the item array, not at the object header. */
535 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
536 f->header->field_hash_table_size = htole64(s);
/* Maps the data hash table item array, whose offset and size are read
 * from the header, through the WINDOW_DATA_HASH_TABLE window and caches
 * the pointer in f->data_hash_table. */
541 static int journal_file_map_data_hash_table(JournalFile *f) {
548 p = le64toh(f->header->data_hash_table_offset);
549 s = le64toh(f->header->data_hash_table_size);
551 r = journal_file_move_to(f,
552 WINDOW_DATA_HASH_TABLE,
558 f->data_hash_table = t;
/* Maps the field hash table item array, whose offset and size are read
 * from the header, through the WINDOW_FIELD_HASH_TABLE window and caches
 * the pointer in f->field_hash_table. */
562 static int journal_file_map_field_hash_table(JournalFile *f) {
569 p = le64toh(f->header->field_hash_table_offset);
570 s = le64toh(f->header->field_hash_table_size);
572 r = journal_file_move_to(f,
573 WINDOW_FIELD_HASH_TABLE,
579 f->field_hash_table = t;
/* Links the data object o, located at offset, into the data hash table
 * bucket selected by hash. The object's chain and entry bookkeeping
 * fields are reset first. It then becomes the bucket head, or is hooked
 * into next_hash_offset of another data object of the bucket, and is
 * recorded as the bucket tail. */
583 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
590 assert(o->object.type == OBJECT_DATA);
592 /* This might alter the window we are looking at */
594 o->data.next_hash_offset = o->data.next_field_offset = 0;
595 o->data.entry_offset = o->data.entry_array_offset = 0;
596 o->data.n_entries = 0;
/* Bucket index: hash modulo the number of HashItem slots in the table. */
598 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
599 p = le64toh(f->data_hash_table[h].head_hash_offset);
601 /* Only entry in the hash table is easy */
602 f->data_hash_table[h].head_hash_offset = htole64(offset);
604 /* Move back to the previous data object, to patch in
607 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
611 o->data.next_hash_offset = htole64(offset);
614 f->data_hash_table[h].tail_hash_offset = htole64(offset);
/* Looks up a data object whose payload equals the size bytes at data,
 * given the payload's precomputed hash. The bucket chain selected by hash
 * is walked via next_hash_offset. Objects with a different stored hash
 * are skipped. Compressed objects are uncompressed into
 * f->compress_buffer before comparing; others are compared directly when
 * their object size matches. */
619 int journal_file_find_data_object_with_hash(
621 const void *data, uint64_t size, uint64_t hash,
622 Object **ret, uint64_t *offset) {
624 uint64_t p, osize, h;
628 assert(data || size == 0);
630 osize = offsetof(Object, data.payload) + size;
/* A table smaller than one HashItem would make the modulus below zero
 * (division by zero), so treat it like an empty table. */
632 if (le64toh(f->header->data_hash_table_size) < sizeof(HashItem))
635 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
636 p = le64toh(f->data_hash_table[h].head_hash_offset);
641 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
645 if (le64toh(o->data.hash) != hash)
648 if (o->object.flags & OBJECT_COMPRESSED) {
652 l = le64toh(o->object.size);
653 if (l <= offsetof(Object, data.payload))
656 l -= offsetof(Object, data.payload);
658 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
662 memcmp(f->compress_buffer, data, size) == 0) {
673 return -EPROTONOSUPPORT;
676 } else if (le64toh(o->object.size) == osize &&
677 memcmp(o->data.payload, data, size) == 0) {
689 p = le64toh(o->data.next_hash_offset);
/* Convenience wrapper: hashes the payload with hash64() and delegates to
 * journal_file_find_data_object_with_hash(). */
695 int journal_file_find_data_object(
697 const void *data, uint64_t size,
698 Object **ret, uint64_t *offset) {
703 assert(data || size == 0);
705 hash = hash64(data, size);
707 return journal_file_find_data_object_with_hash(f,
/* Stores a payload as an OBJECT_DATA object. The payload is hashed and
 * looked up first. A new object sized for the uncompressed payload is
 * then appended and its hash is set. Payloads of at least
 * COMPRESSION_SIZE_THRESHOLD bytes may be compressed in place; otherwise
 * the payload is copied as is. Finally the object is linked into the data
 * hash table. */
712 static int journal_file_append_data(
714 const void *data, uint64_t size,
715 Object **ret, uint64_t *offset) {
721 bool compressed = false;
724 assert(data || size == 0);
726 hash = hash64(data, size);
728 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
742 osize = offsetof(Object, data.payload) + size;
743 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
747 o->data.hash = htole64(hash);
751 size >= COMPRESSION_SIZE_THRESHOLD) {
754 compressed = compress_blob(data, size, o->data.payload, &rsize);
/* Shrink the recorded object size to the compressed length and flag both
 * the object and the file header as using compression. */
757 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
758 o->object.flags |= OBJECT_COMPRESSED;
760 f->header->incompatible_flags = htole32(le32toh(f->header->incompatible_flags) | HEADER_INCOMPATIBLE_COMPRESSED);
762 log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
768 memcpy(o->data.payload, data, size);
770 r = journal_file_link_data(f, o, p, hash);
774 /* The linking might have altered the window, so let's
775 * refresh our pointer */
776 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* Returns the number of EntryItem slots of an entry object, derived from
 * the object size minus the offset of the item array. */
789 uint64_t journal_file_entry_n_items(Object *o) {
791 assert(o->object.type == OBJECT_ENTRY);
793 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
/* Returns the number of 64-bit slots of an entry array object, derived
 * from the object size minus the offset of the item array. */
796 static uint64_t journal_file_entry_array_n_items(Object *o) {
798 assert(o->object.type == OBJECT_ENTRY_ARRAY);
800 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
/* Stores the entry offset p at index *idx of a chain of entry array
 * objects. The chain is followed via next_entry_array_offset. If the
 * index falls into an existing array the slot is written there;
 * otherwise a new array object is appended, the offset is stored in it
 * and it is hooked into the previous array of the chain. In both cases
 * *idx (kept little-endian) is incremented. */
803 static int link_entry_into_array(JournalFile *f,
808 uint64_t n = 0, ap = 0, q, i, a, hidx;
817 i = hidx = le64toh(*idx);
820 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
824 n = journal_file_entry_array_n_items(o);
826 o->entry_array.items[i] = htole64(p);
827 *idx = htole64(hidx + 1);
833 a = le64toh(o->entry_array.next_entry_array_offset);
844 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
845 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
850 o->entry_array.items[i] = htole64(p);
/* Re-map the previous array (at ap) to chain the new array (at q) behind it. */
855 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
859 o->entry_array.next_entry_array_offset = htole64(q);
862 *idx = htole64(hidx + 1);
/* Variant of link_entry_into_array() that addresses the array with index
 * *idx - 1 and increments *idx itself afterwards. NOTE(review): presumably
 * the first entry lives in a separate slot outside the array -- confirm. */
867 static int link_entry_into_array_plus_one(JournalFile *f,
886 i = htole64(le64toh(*idx) - 1);
887 r = link_entry_into_array(f, first, &i, p);
892 *idx = htole64(le64toh(*idx) + 1);
/* Links the entry at offset into the data object referenced by item i of
 * entry object o: the data object is mapped (o is re-pointed to it) and
 * its entry_offset / entry_array_offset bookkeeping is handed to
 * link_entry_into_array_plus_one(). */
896 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
903 p = le64toh(o->entry.items[i].object_offset);
907 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
911 return link_entry_into_array_plus_one(f,
912 &o->data.entry_offset,
913 &o->data.entry_array_offset,
/* Makes a freshly written entry object reachable. It is added to the
 * global entry array of the header, the head/tail timestamps in the
 * header are updated, and the entry is linked into each data object it
 * references. */
918 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
925 assert(o->object.type == OBJECT_ENTRY);
/* Full memory barrier before the entry gets linked. */
927 __sync_synchronize();
929 /* Link up the entry itself */
930 r = link_entry_into_array(f,
931 &f->header->entry_array_offset,
932 &f->header->n_entries,
937 /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
/* head_entry_realtime is only set while it is still 0. */
939 if (f->header->head_entry_realtime == 0)
940 f->header->head_entry_realtime = o->entry.realtime;
942 f->header->tail_entry_realtime = o->entry.realtime;
943 f->header->tail_entry_monotonic = o->entry.monotonic;
945 f->tail_entry_monotonic_valid = true;
947 /* Link up the items */
948 n = journal_file_entry_n_items(o);
949 for (i = 0; i < n; i++) {
950 r = journal_file_link_entry_item(f, o, offset, i);
/* Appends an OBJECT_ENTRY object holding n_items items. It fills in the
 * sequence number, the item array, both timestamps from ts, the xor hash
 * and the boot id from the header, then links the entry via
 * journal_file_link_entry(). */
958 static int journal_file_append_entry_internal(
960 const dual_timestamp *ts,
962 const EntryItem items[], unsigned n_items,
964 Object **ret, uint64_t *offset) {
971 assert(items || n_items == 0);
974 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
976 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
980 o->entry.seqnum = htole64(journal_file_seqnum(f, seqnum));
981 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
982 o->entry.realtime = htole64(ts->realtime);
983 o->entry.monotonic = htole64(ts->monotonic);
984 o->entry.xor_hash = htole64(xor_hash);
985 o->entry.boot_id = f->header->boot_id;
987 r = journal_file_link_entry(f, o, np);
/* Notifies inotify watchers that the file changed: issues a full memory
 * barrier and then truncates the file to the size recorded in last_stat,
 * which leaves its length alone. A failure is only logged. */
1000 void journal_file_post_change(JournalFile *f) {
1003 /* inotify() does not receive IN_MODIFY events from file
1004 * accesses done via mmap(). After each access we hence
1005 * trigger IN_MODIFY by truncating the journal file to its
1006 * current size which triggers IN_MODIFY. */
1008 __sync_synchronize();
1010 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1011 log_error("Failed to truncate file to its own size: %m");
/* Appends one log entry made of n_iovec fields. Each iovec is stored (or
 * found) as a data object via journal_file_append_data(); its offset and
 * hash go into a stack-allocated item array and its hash is folded into
 * the entry's xor hash. The entry object itself is then written by
 * journal_file_append_entry_internal(), followed by a change
 * notification. A local timestamp is obtained with dual_timestamp_get(),
 * and the monotonic timestamp is compared against the header's tail value
 * when that is valid. */
1014 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1018 uint64_t xor_hash = 0;
1019 struct dual_timestamp _ts;
1022 assert(iovec || n_iovec == 0);
1028 dual_timestamp_get(&_ts);
1032 if (f->tail_entry_monotonic_valid &&
1033 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
/* NOTE(review): alloca() scales with the caller-supplied n_iovec. */
1036 items = alloca(sizeof(EntryItem) * n_iovec);
1038 for (i = 0; i < n_iovec; i++) {
1042 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1046 xor_hash ^= le64toh(o->data.hash);
1047 items[i].object_offset = htole64(p);
1048 items[i].hash = o->data.hash;
1051 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1053 journal_file_post_change(f);
/* Fetches the entry stored at index i of a chain of entry array objects.
 * The chain is followed via next_entry_array_offset, the entry offset is
 * read from the array containing the index, and the entry object at that
 * offset is mapped. */
1058 static int generic_array_get(JournalFile *f,
1061 Object **ret, uint64_t *offset) {
1073 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1077 n = journal_file_entry_array_n_items(o);
1079 p = le64toh(o->entry_array.items[i]);
1084 a = le64toh(o->entry_array.next_entry_array_offset);
/* Offset 0 for the array or the entry means the index was not found. */
1087 if (a <= 0 || p <= 0)
1090 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Like generic_array_get(), but with one additional entry at offset
 * 'extra' in front of the array: array lookups use index i-1.
 * NOTE(review): presumably index 0 maps to 'extra' -- confirm. */
1103 static int generic_array_get_plus_one(JournalFile *f,
1107 Object **ret, uint64_t *offset) {
1116 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1129 return generic_array_get(f, first, i-1, ret, offset);
/* Searches a chain of entry arrays for needle using the comparison
 * callback test_object. For each array in the chain one item is tested
 * first; if the answer is TEST_RIGHT the array is bisected between left
 * and right. An exact hit (TEST_FOUND) is turned into TEST_RIGHT or
 * TEST_LEFT depending on direction. The entry finally selected is mapped
 * and its overall index, t + i (minus one when subtract_one is set), is
 * stored in *idx. */
1138 static int generic_array_bisect(JournalFile *f,
1142 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1143 direction_t direction,
1148 uint64_t a, p, t = 0, i = 0, last_p = 0;
1149 bool subtract_one = false;
1150 Object *o, *array = NULL;
1154 assert(test_object);
1158 uint64_t left, right, k, lp;
1160 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1164 k = journal_file_entry_array_n_items(array);
1170 lp = p = le64toh(array->entry_array.items[i]);
1174 r = test_object(f, p, needle);
1178 if (r == TEST_FOUND)
1179 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1181 if (r == TEST_RIGHT) {
1185 if (left == right) {
1186 if (direction == DIRECTION_UP)
1187 subtract_one = true;
1193 assert(left < right);
1195 i = (left + right) / 2;
1196 p = le64toh(array->entry_array.items[i]);
1200 r = test_object(f, p, needle);
1204 if (r == TEST_FOUND)
1205 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1207 if (r == TEST_RIGHT)
1221 a = le64toh(array->entry_array.next_entry_array_offset);
1227 if (subtract_one && t == 0 && i == 0)
1230 if (subtract_one && i == 0)
1232 else if (subtract_one)
1233 p = le64toh(array->entry_array.items[i-1]);
1235 p = le64toh(array->entry_array.items[i]);
1237 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1248 *idx = t + i - (subtract_one ? 1 : 0);
/* Variant of generic_array_bisect() for arrays preceded by one additional
 * entry at offset 'extra'. That entry is tested first; depending on the
 * result it is returned directly or the search continues in the array
 * with n-1 entries. */
1253 static int generic_array_bisect_plus_one(JournalFile *f,
1258 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1259 direction_t direction,
1267 assert(test_object);
1272 /* This bisects the array in object 'first', but first checks
1274 r = test_object(f, extra, needle);
1277 else if (r == TEST_FOUND) {
1280 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1294 } else if (r == TEST_RIGHT)
1297 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
/* Bisection callback: maps the entry at p and compares its sequence
 * number with needle. */
1305 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1312 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1316 if (le64toh(o->entry.seqnum) == needle)
1318 else if (le64toh(o->entry.seqnum) < needle)
/* Locates an entry by bisecting the global entry array, whose offset and
 * entry count are read from the header. */
1324 int journal_file_move_to_entry_by_seqnum(
1327 direction_t direction,
1331 return generic_array_bisect(f,
1332 le64toh(f->header->entry_array_offset),
1333 le64toh(f->header->n_entries),
/* Bisection callback: maps the entry at p and compares its realtime
 * timestamp with needle. */
1340 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1347 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1351 if (le64toh(o->entry.realtime) == needle)
1353 else if (le64toh(o->entry.realtime) < needle)
/* Locates an entry by realtime timestamp by bisecting the global entry
 * array (offset and count from the header) with test_object_realtime(). */
1359 int journal_file_move_to_entry_by_realtime(
1362 direction_t direction,
1366 return generic_array_bisect(f,
1367 le64toh(f->header->entry_array_offset),
1368 le64toh(f->header->n_entries),
1370 test_object_realtime,
/* Bisection callback: maps the entry at p and compares its monotonic
 * timestamp with needle. */
1375 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1382 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1386 if (le64toh(o->entry.monotonic) == needle)
1388 else if (le64toh(o->entry.monotonic) < needle)
/* Locates an entry by monotonic timestamp within one boot. The data
 * object "_BOOT_ID=<32 hex digits>" for boot_id is looked up, and the
 * entries linked to that data object are bisected with
 * test_object_monotonic(). */
1394 int journal_file_move_to_entry_by_monotonic(
1398 direction_t direction,
/* "_BOOT_ID=" is 9 characters long: the formatted id has to start at
 * t + 9 so that the '=' survives, and the buffer needs 9 + 32 + 1 bytes. */
1402 char t[9+32+1] = "_BOOT_ID=";
1406 sd_id128_to_string(boot_id, t + 9);
1408 r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1414 return generic_array_bisect_plus_one(f,
1415 le64toh(o->data.entry_offset),
1416 le64toh(o->data.entry_array_offset),
1417 le64toh(o->data.n_entries),
1419 test_object_monotonic,
/* Bisection callback that compares the entry offset p itself with needle;
 * no object is mapped. */
1424 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1430 else if (p < needle)
/* Moves to the entry next to o (at offset p) in the given direction
 * within the global entry array. For the starting position the first
 * index (DIRECTION_DOWN) or the last index (otherwise) is chosen; o, when
 * used, must be an OBJECT_ENTRY and its index is found by bisection. The
 * resulting index is resolved with generic_array_get(). */
1436 int journal_file_next_entry(
1438 Object *o, uint64_t p,
1439 direction_t direction,
1440 Object **ret, uint64_t *offset) {
1446 assert(p > 0 || !o);
1448 n = le64toh(f->header->n_entries);
1453 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1455 if (o->object.type != OBJECT_ENTRY)
1458 r = generic_array_bisect(f,
1459 le64toh(f->header->entry_array_offset),
1460 le64toh(f->header->n_entries),
1469 if (direction == DIRECTION_DOWN) {
1482 /* And jump to it */
1483 return generic_array_get(f,
1484 le64toh(f->header->entry_array_offset),
/* Moves 'skip' entries away from entry o (at offset p) in the global
 * entry array. The index of o is found by bisection, adjusted by skip
 * (negative values move backwards), and resolved with
 * generic_array_get(). */
1489 int journal_file_skip_entry(
1491 Object *o, uint64_t p,
1493 Object **ret, uint64_t *offset) {
1502 if (o->object.type != OBJECT_ENTRY)
1505 r = generic_array_bisect(f,
1506 le64toh(f->header->entry_array_offset),
1507 le64toh(f->header->n_entries),
1516 /* Calculate new index */
1518 if ((uint64_t) -skip >= i)
1521 i = i - (uint64_t) -skip;
1523 i += (uint64_t) skip;
1525 n = le64toh(f->header->n_entries);
1532 return generic_array_get(f,
1533 le64toh(f->header->entry_array_offset),
/* Like journal_file_next_entry(), but restricted to the entries linked to
 * the data object at data_offset. The data object's entry_offset,
 * entry_array_offset and n_entries describe the list, which is accessed
 * through the *_plus_one helpers. */
1538 int journal_file_next_entry_for_data(
1540 Object *o, uint64_t p,
1541 uint64_t data_offset,
1542 direction_t direction,
1543 Object **ret, uint64_t *offset) {
1550 assert(p > 0 || !o);
1552 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1556 n = le64toh(d->data.n_entries);
1561 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1563 if (o->object.type != OBJECT_ENTRY)
1566 r = generic_array_bisect_plus_one(f,
1567 le64toh(d->data.entry_offset),
1568 le64toh(d->data.entry_array_offset),
1569 le64toh(d->data.n_entries),
1579 if (direction == DIRECTION_DOWN) {
1593 return generic_array_get_plus_one(f,
1594 le64toh(d->data.entry_offset),
1595 le64toh(d->data.entry_array_offset),
/* Locates an entry among those linked to the data object at data_offset,
 * by bisecting that object's entry list. */
1600 int journal_file_move_to_entry_by_seqnum_for_data(
1602 uint64_t data_offset,
1604 direction_t direction,
1605 Object **ret, uint64_t *offset) {
1610 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1614 return generic_array_bisect_plus_one(f,
1615 le64toh(d->data.entry_offset),
1616 le64toh(d->data.entry_array_offset),
1617 le64toh(d->data.n_entries),
/* Locates an entry by realtime timestamp among those linked to the data
 * object at data_offset, by bisecting that object's entry list with
 * test_object_realtime(). */
1624 int journal_file_move_to_entry_by_realtime_for_data(
1626 uint64_t data_offset,
1628 direction_t direction,
1629 Object **ret, uint64_t *offset) {
1634 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1638 return generic_array_bisect_plus_one(f,
1639 le64toh(d->data.entry_offset),
1640 le64toh(d->data.entry_array_offset),
1641 le64toh(d->data.n_entries),
1643 test_object_realtime,
/* Prints a human-readable dump to stdout: header information first, then
 * one line per object, walking from header_size to tail_object_offset in
 * steps of the 8-byte-aligned object size. "File corrupt" is logged on
 * the error path. */
1648 void journal_file_dump(JournalFile *f) {
/* 32 hex digits plus NUL for each formatted 128-bit id. */
1649 char a[33], b[33], c[33];
1656 printf("File Path: %s\n"
1660 "Arena size: %llu\n"
1664 sd_id128_to_string(f->header->file_id, a),
1665 sd_id128_to_string(f->header->machine_id, b),
1666 sd_id128_to_string(f->header->boot_id, c),
1667 (unsigned long long) le64toh(f->header->arena_size),
/* NOTE(review): the two counters below are 64-bit values cast to
 * unsigned long, which truncates where long is 32 bits wide. */
1668 (unsigned long) le64toh(f->header->n_objects),
1669 (unsigned long) le64toh(f->header->n_entries));
1671 p = le64toh(f->header->header_size);
1673 r = journal_file_move_to_object(f, -1, p, &o);
1677 switch (o->object.type) {
1680 printf("Type: OBJECT_UNUSED\n");
1684 printf("Type: OBJECT_DATA\n");
1688 printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1689 (unsigned long long) le64toh(o->entry.seqnum),
1690 (unsigned long long) le64toh(o->entry.monotonic),
1691 (unsigned long long) le64toh(o->entry.realtime));
1694 case OBJECT_FIELD_HASH_TABLE:
1695 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1698 case OBJECT_DATA_HASH_TABLE:
1699 printf("Type: OBJECT_DATA_HASH_TABLE\n");
1702 case OBJECT_ENTRY_ARRAY:
1703 printf("Type: OBJECT_ENTRY_ARRAY\n");
1707 if (o->object.flags & OBJECT_COMPRESSED)
1708 printf("Flags: COMPRESSED\n");
/* The tail object is the last one. */
1710 if (p == le64toh(f->header->tail_object_offset))
1713 p = p + ALIGN64(le64toh(o->object.size));
1718 log_error("File corrupt");
/* Opens (and, for a writable empty file, initializes) a journal file.
 * Only O_RDONLY and O_RDWR access modes and names ending in ".journal"
 * are checked for. Metrics and the compress setting come from the
 * template. An empty writable file gets a fresh header; the header is
 * then mapped, verified for pre-existing files, and refreshed. New files
 * get their field and data hash tables created; both tables are mapped.
 * On failure the partially set up file is closed. */
1721 int journal_file_open(
1725 JournalFile *template,
1726 JournalFile **ret) {
1730 bool newly_created = false;
1734 if ((flags & O_ACCMODE) != O_RDONLY &&
1735 (flags & O_ACCMODE) != O_RDWR)
1738 if (!endswith(fname, ".journal"))
1741 f = new0(JournalFile, 1);
1748 f->writable = (flags & O_ACCMODE) != O_RDONLY;
1749 f->prot = prot_from_flags(flags);
1752 f->metrics = template->metrics;
1753 f->compress = template->compress;
1756 f->path = strdup(fname);
1762 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
1768 if (fstat(f->fd, &f->last_stat) < 0) {
/* A zero-length writable file is treated as newly created. */
1773 if (f->last_stat.st_size == 0 && f->writable) {
1774 newly_created = true;
1776 r = journal_file_init_header(f, template);
1780 if (fstat(f->fd, &f->last_stat) < 0) {
1786 if (f->last_stat.st_size < (off_t) sizeof(Header)) {
1791 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
1792 if (f->header == MAP_FAILED) {
1798 if (!newly_created) {
1799 r = journal_file_verify_header(f);
1805 r = journal_file_refresh_header(f);
1810 if (newly_created) {
1812 r = journal_file_setup_field_hash_table(f);
1816 r = journal_file_setup_data_hash_table(f);
1821 r = journal_file_map_field_hash_table(f);
1825 r = journal_file_map_data_hash_table(f);
1835 journal_file_close(f);
/* Rotates a writable journal file. The current file is renamed to an
 * archive name built from its path without ".journal", the seqnum id, the
 * current seqnum and the tail realtime timestamp. It is marked
 * STATE_ARCHIVED, a new file is opened under the original path with the
 * old file as template, and the old file is closed. */
1840 int journal_file_rotate(JournalFile **f) {
1843 JournalFile *old_file, *new_file = NULL;
1851 if (!old_file->writable)
1854 if (!endswith(old_file->path, ".journal"))
1857 l = strlen(old_file->path);
/* Room for the old path plus a separator, the 32-digit id and two
 * "-%016llx" fields; the 8 bytes of the stripped ".journal" suffix are
 * reused for the new suffix. */
1859 p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
1863 memcpy(p, old_file->path, l - 8);
1865 sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
1866 snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
1867 "-%016llx-%016llx.journal",
1868 (unsigned long long) le64toh((*f)->header->seqnum),
1869 (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
1871 r = rename(old_file->path, p);
1877 old_file->header->state = STATE_ARCHIVED;
1879 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, old_file, &new_file);
1880 journal_file_close(old_file);
/* Wrapper around journal_file_open() that recovers from unusable files.
 * The result of the first open is examined: -EBADMSG, -ENODATA,
 * -EHOSTDOWN and -EPROTONOSUPPORT are the cases singled out for recovery;
 * read-only opens and opens without O_CREAT are checked separately.
 * Recovery renames the file to "<name>@<realtime>-<...>.journal~", logs a
 * warning and opens the path once more. */
1886 int journal_file_open_reliably(
1890 JournalFile *template,
1891 JournalFile **ret) {
1897 r = journal_file_open(fname, flags, mode, template, ret);
1898 if (r != -EBADMSG && /* corrupted */
1899 r != -ENODATA && /* truncated */
1900 r != -EHOSTDOWN && /* other machine */
1901 r != -EPROTONOSUPPORT) /* incompatible feature */
1904 if ((flags & O_ACCMODE) == O_RDONLY)
1907 if (!(flags & O_CREAT))
1910 /* The file is corrupted. Rotate it away and try it again (but only once) */
1913 if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
1915 (unsigned long long) now(CLOCK_REALTIME),
1919 r = rename(fname, p);
1924 log_warning("File %s corrupted, renaming and replacing.", fname);
1926 return journal_file_open(fname, flags, mode, template, ret);
/* One vacuuming candidate collected by journal_directory_vacuum(), which
 * fills in filename, usage, seqnum, realtime, seqnum_id and have_seqnum. */
1929 struct vacuum_info {
1934 sd_id128_t seqnum_id;
/* qsort() comparator for struct vacuum_info. Two files that both carry a
 * sequence number from the same seqnum id are ordered by sequence number.
 * Otherwise the order is by realtime timestamp, then by seqnum id (when
 * both have one), then by file name. */
1940 static int vacuum_compare(const void *_a, const void *_b) {
1941 const struct vacuum_info *a, *b;
1946 if (a->have_seqnum && b->have_seqnum &&
1947 sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
1948 if (a->seqnum < b->seqnum)
1950 else if (a->seqnum > b->seqnum)
1956 if (a->realtime < b->realtime)
1958 else if (a->realtime > b->realtime)
1960 else if (a->have_seqnum && b->have_seqnum)
1961 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
1963 return strcmp(a->filename, b->filename);
/* Deletes old journal files from a directory until the accounted disk
 * usage is at most max_use and the file system has at least min_free
 * bytes available. Regular files named like archived journals
 * ("...@<id>-<seqnum>-<realtime>.journal") or set-aside corrupted ones
 * ("...@<realtime>-<...>.journal~") are collected with their usage,
 * sorted with vacuum_compare() and unlinked in that order. */
1966 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
1969 struct vacuum_info *list = NULL;
1970 unsigned n_list = 0, n_allocated = 0, i;
1978 d = opendir(directory);
1984 struct dirent buf, *de;
1988 unsigned long long seqnum = 0, realtime;
1989 sd_id128_t seqnum_id;
1992 k = readdir_r(d, &buf, &de);
2001 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
2004 if (!S_ISREG(st.st_mode))
2007 q = strlen(de->d_name);
2009 if (endswith(de->d_name, ".journal")) {
2011 /* Vacuum archived files */
2013 if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
2016 if (de->d_name[q-8-16-1] != '-' ||
2017 de->d_name[q-8-16-1-16-1] != '-' ||
2018 de->d_name[q-8-16-1-16-1-32-1] != '@')
/* Keep an untouched copy of the name; d_name is cut below for parsing. */
2021 p = strdup(de->d_name);
2027 de->d_name[q-8-16-1-16-1] = 0;
2028 if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
2033 if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
2040 } else if (endswith(de->d_name, ".journal~")) {
2041 unsigned long long tmp;
2043 /* Vacuum corrupted files */
2045 if (q < 1 + 16 + 1 + 16 + 8 + 1)
2048 if (de->d_name[q-1-8-16-1] != '-' ||
2049 de->d_name[q-1-8-16-1-16-1] != '@')
2052 p = strdup(de->d_name);
2058 if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
2063 have_seqnum = false;
/* Grow the candidate list geometrically, starting at 8 slots. */
2067 if (n_list >= n_allocated) {
2068 struct vacuum_info *j;
2070 n_allocated = MAX(n_allocated * 2U, 8U);
2071 j = realloc(list, n_allocated * sizeof(struct vacuum_info));
2081 list[n_list].filename = p;
/* Usage is accounted as st_blocks times 512 bytes. */
2082 list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
2083 list[n_list].seqnum = seqnum;
2084 list[n_list].realtime = realtime;
2085 list[n_list].seqnum_id = seqnum_id;
2086 list[n_list].have_seqnum = have_seqnum;
2088 sum += list[n_list].usage;
2093 qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
2095 for(i = 0; i < n_list; i++) {
/* Re-check free space before every deletion. */
2098 if (fstatvfs(dirfd(d), &ss) < 0) {
2103 if (sum <= max_use &&
2104 (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
2107 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
2108 log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
2109 sum -= list[i].usage;
2110 } else if (errno != ENOENT)
2111 log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
2115 for (i = 0; i < n_list; i++)
2116 free(list[i].filename);
/* Copies entry o (at offset p in 'from') into journal file 'to'. The
 * entry's timestamps are compared against the tail timestamps of 'to'.
 * Each referenced data object is mapped from 'from', its stored hash is
 * compared with the hash recorded in the entry item, it is uncompressed
 * if necessary and appended to 'to'. The new offsets and hashes form the
 * item array of the entry that is finally appended to 'to'. */
2126 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2128 uint64_t q, xor_hash = 0;
2141 ts.monotonic = le64toh(o->entry.monotonic);
2142 ts.realtime = le64toh(o->entry.realtime);
2144 if (to->tail_entry_monotonic_valid &&
2145 ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2148 if (ts.realtime < le64toh(to->header->tail_entry_realtime))
2151 n = journal_file_entry_n_items(o);
2152 items = alloca(sizeof(EntryItem) * n);
2154 for (i = 0; i < n; i++) {
2161 q = le64toh(o->entry.items[i].object_offset);
2162 le_hash = o->entry.items[i].hash;
/* From here on o points at the data object, not at the entry. */
2164 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2168 if (le_hash != o->data.hash)
2171 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2174 /* We hit the limit on 32bit machines */
2175 if ((uint64_t) t != l)
2178 if (o->object.flags & OBJECT_COMPRESSED) {
2182 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2185 data = from->compress_buffer;
2188 return -EPROTONOSUPPORT;
2191 data = o->data.payload;
2193 r = journal_file_append_data(to, data, l, &u, &h);
2197 xor_hash ^= le64toh(u->data.hash);
2198 items[i].object_offset = htole64(h);
2199 items[i].hash = u->data.hash;
/* Re-map the source entry so that o is an entry again for the next item. */
2201 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2206 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
/* Fills in every JournalMetrics field that is still (uint64_t) -1 and
 * normalizes the rest, based on the size of the file system behind fd:
 *   max_use   - 10% of the file system, clamped to
 *               [DEFAULT_MAX_USE_LOWER, DEFAULT_MAX_USE_UPPER], or
 *               DEFAULT_MAX_USE_LOWER as fallback; otherwise page-aligned
 *               and at least 2 * JOURNAL_FILE_SIZE_MIN.
 *   max_size  - max_use / 8, capped at DEFAULT_MAX_SIZE_UPPER; otherwise
 *               page-aligned, at least JOURNAL_FILE_SIZE_MIN, and max_use
 *               is raised to 2 * max_size if needed.
 *   min_size  - JOURNAL_FILE_SIZE_MIN; otherwise page-aligned, at least
 *               JOURNAL_FILE_SIZE_MIN, and max_size is raised to it.
 *   keep_free - 5% of the file system, capped at DEFAULT_KEEP_FREE_UPPER,
 *               or DEFAULT_KEEP_FREE as fallback.
 * The resulting values are logged. */
2209 void journal_default_metrics(JournalMetrics *m, int fd) {
2210 uint64_t fs_size = 0;
2212 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2217 if (fstatvfs(fd, &ss) >= 0)
/* Widen both factors first: the statvfs fields may be 32 bits wide, in
 * which case the product would wrap for larger file systems. */
2218 fs_size = (uint64_t) ss.f_frsize * (uint64_t) ss.f_blocks;
2220 if (m->max_use == (uint64_t) -1) {
2223 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2225 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2226 m->max_use = DEFAULT_MAX_USE_UPPER;
2228 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2229 m->max_use = DEFAULT_MAX_USE_LOWER;
2231 m->max_use = DEFAULT_MAX_USE_LOWER;
2233 m->max_use = PAGE_ALIGN(m->max_use);
2235 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2236 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2239 if (m->max_size == (uint64_t) -1) {
2240 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2242 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2243 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2245 m->max_size = PAGE_ALIGN(m->max_size);
2247 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2248 m->max_size = JOURNAL_FILE_SIZE_MIN;
2250 if (m->max_size*2 > m->max_use)
2251 m->max_use = m->max_size*2;
2253 if (m->min_size == (uint64_t) -1)
2254 m->min_size = JOURNAL_FILE_SIZE_MIN;
2256 m->min_size = PAGE_ALIGN(m->min_size);
2258 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2259 m->min_size = JOURNAL_FILE_SIZE_MIN;
2261 if (m->min_size > m->max_size)
2262 m->max_size = m->min_size;
2265 if (m->keep_free == (uint64_t) -1) {
2268 m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2270 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2271 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2274 m->keep_free = DEFAULT_KEEP_FREE;
2277 log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2278 format_bytes(a, sizeof(a), m->max_use),
2279 format_bytes(b, sizeof(b), m->max_size),
2280 format_bytes(c, sizeof(c), m->min_size),
2281 format_bytes(d, sizeof(d), m->keep_free));