1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
30 #include "journal-def.h"
31 #include "journal-file.h"
// Byte sizes of the data and field hash tables created for a new file.
// The setup functions use these values as the byte length of the items
// array that is appended and zeroed.
35 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*16ULL)
36 #define DEFAULT_FIELD_HASH_TABLE_SIZE (2047ULL*16ULL)
// Mapping requests smaller than this (8 MiB) are widened to this size by
// journal_file_move_to() to reduce later remapping.
38 #define DEFAULT_WINDOW_SIZE (8ULL*1024ULL*1024ULL)
// Payloads of at least this many bytes are candidates for compression
// in journal_file_append_data().
40 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
42 /* This is the minimum journal file size */
43 #define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL) /* 64 KiB */
45 /* These are the lower and upper bounds if we deduce the max_use value
46 * from the file system size */
47 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
48 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
50 /* This is the upper bound if we deduce max_size from max_use */
51 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
53 /* This is the upper bound if we deduce the keep_free value from the
54 * file system size */
55 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
57 /* This is the keep_free value when we can't determine the system
58 * size */
59 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
// Eight magic bytes stored at the start of the header. They are copied in
// by journal_file_init_header() and compared by
// journal_file_verify_header(). Not NUL-terminated.
61 static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };
// Round x up to the next multiple of 8.
63 #define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
// Tear down an open journal file.
//
// The header state is set to STATE_OFFLINE before the header mapping is
// dropped, then every mapped window is unmapped, the file descriptor is
// closed and the compression scratch buffer is freed.
65 void journal_file_close(JournalFile *f) {
72 f->header->state = STATE_OFFLINE;
// The header mapping covers sizeof(Header) rounded up to whole pages.
74 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
// Drop each per-type window that currently holds a mapping.
77 for (t = 0; t < _WINDOW_MAX; t++)
78 if (f->windows[t].ptr)
79 munmap(f->windows[t].ptr, f->windows[t].size);
82 close_nointr_nofail(f->fd);
87 free(f->compress_buffer);
// Build a fresh header in a local Header value and write it at file
// offset 0 with pwrite().
//
// The header receives the magic signature, an arena offset equal to
// sizeof(Header) rounded up to 8 bytes (stored little-endian) and a
// random file id. The sequence number id and counter are taken from the
// template header or, alternatively, the sequence number id is set to
// the new file id (presumably when no template is passed; confirm).
93 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
101 memcpy(h.signature, signature, 8);
102 h.arena_offset = htole64(ALIGN64(sizeof(h)));
104 r = sd_id128_randomize(&h.file_id);
109 h.seqnum_id = template->header->seqnum_id;
110 h.seqnum = template->header->seqnum;
112 h.seqnum_id = h.file_id;
114 k = pwrite(f->fd, &h, sizeof(h), 0);
// Update the mapped header for the current machine and boot and mark the
// file online.
//
// The machine id is written straight into the header. The current boot id
// is compared with the stored one: when they match, the stored tail
// monotonic timestamp is flagged as valid for this boot. The boot id is
// then stored and the state set to STATE_ONLINE.
124 static int journal_file_refresh_header(JournalFile *f) {
130 r = sd_id128_get_machine(&f->header->machine_id);
134 r = sd_id128_get_boot(&boot_id);
138 if (sd_id128_equal(boot_id, f->header->boot_id))
139 f->tail_entry_monotonic_valid = true;
141 f->header->boot_id = boot_id;
143 f->header->state = STATE_ONLINE;
// Full memory barrier after the header stores.
145 __sync_synchronize();
// Sanity-check the mapped header of an existing file.
//
// Checks performed: the magic signature, the incompatible feature flags
// (-EPROTONOSUPPORT when unsupported bits are set), that the file is at
// least arena_offset + arena_size bytes long, and that the stored machine
// id equals the local one. The stored state is then inspected: an online
// or unknown state is only logged.
//
// Two alternative flag checks are present, one tolerating
// HEADER_INCOMPATIBLE_COMPRESSED and one rejecting any flag; presumably
// they are selected at build time depending on compression support.
//
// NOTE(review): the first flag check reads incompatible_flags with
// le64toh() while journal_file_append_data() updates the same field with
// le32toh()/htole32(). Confirm the field width and use one conversion.
150 static int journal_file_verify_header(JournalFile *f) {
153 if (memcmp(f->header, signature, 8))
157 if ((le64toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
158 return -EPROTONOSUPPORT;
160 if (f->header->incompatible_flags != 0)
161 return -EPROTONOSUPPORT;
// The arena described by the header must fit inside the file.
164 if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->arena_offset) + le64toh(f->header->arena_size)))
169 sd_id128_t machine_id;
172 r = sd_id128_get_machine(&machine_id);
176 if (!sd_id128_equal(machine_id, f->header->machine_id))
179 state = f->header->state;
181 if (state == STATE_ONLINE)
182 log_debug("Journal file %s is already online. Assuming unclean closing. Ignoring.", f->path);
183 else if (state == STATE_ARCHIVED)
185 else if (state != STATE_OFFLINE)
186 log_debug("Journal file %s has unknown state %u. Ignoring.", f->path, state);
// Make sure the file is allocated up to offset + size.
//
// old_size is the current end of the arena (arena_offset + arena_size).
// new_size is offset + size rounded up to a page, but never less than
// arena_offset. Nothing is done when new_size does not exceed old_size.
// Growth is refused when it would exceed metrics.max_size (if non-zero).
// When new_size is above metrics.min_size and metrics.keep_free is
// non-zero, the free space reported by fstatvfs(), less keep_free, is
// compared with the requested growth. Only the range between old_size
// and new_size is passed to posix_fallocate(). Afterwards the cached
// stat data is refreshed and arena_size is updated in the header.
//
// NOTE(review): free space is computed as f_bfree * f_bsize. POSIX
// expresses f_bfree in units of f_frsize, and journal_directory_vacuum()
// uses f_bavail instead; confirm which figure is intended.
// NOTE(review): offset + size is not checked for wrap-around here.
192 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
193 uint64_t old_size, new_size;
198 /* We assume that this file is not sparse, and we know that
199 * for sure, since we always call posix_fallocate()
203 le64toh(f->header->arena_offset) +
204 le64toh(f->header->arena_size);
206 new_size = PAGE_ALIGN(offset + size);
207 if (new_size < le64toh(f->header->arena_offset))
208 new_size = le64toh(f->header->arena_offset);
210 if (new_size <= old_size)
213 if (f->metrics.max_size > 0 &&
214 new_size > f->metrics.max_size)
217 if (new_size > f->metrics.min_size &&
218 f->metrics.keep_free > 0) {
221 if (fstatvfs(f->fd, &svfs) >= 0) {
224 available = svfs.f_bfree * svfs.f_bsize;
226 if (available >= f->metrics.keep_free)
227 available -= f->metrics.keep_free;
231 if (new_size - old_size > available)
236 /* Note that the glibc fallocate() fallback is very
237 inefficient, hence we try to minimize the allocation area
239 r = posix_fallocate(f->fd, old_size, new_size - old_size);
243 if (fstat(f->fd, &f->last_stat) < 0)
246 f->header->arena_size = htole64(new_size - le64toh(f->header->arena_offset));
// Map a byte range of the file with mmap().
//
// The window start is the requested offset rounded down to a page
// boundary. The window size is the requested size plus that slack,
// rounded up to whole pages. A window reaching beyond the page-aligned
// cached file size is rejected with -EADDRNOTAVAIL. The mapping is
// MAP_SHARED with the protection stored in f->prot. The pointer handed
// back through ret addresses the requested offset, not the window start.
251 static int journal_file_map(
260 uint64_t woffset, wsize;
267 woffset = offset & ~((uint64_t) page_size() - 1ULL);
268 wsize = size + (offset - woffset);
269 wsize = PAGE_ALIGN(wsize);
271 /* Avoid SIGBUS on invalid accesses */
272 if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size))
273 return -EADDRNOTAVAIL;
275 window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
276 if (window == MAP_FAILED)
288 *ret = (uint8_t*) window + (offset - woffset);
// Return a pointer to size bytes at the given file offset, using the
// cached mapping window selected by wt.
//
// The range is first checked against the cached file size. On a miss the
// stat data is refreshed once and -EADDRNOTAVAIL is returned if the range
// is still out of bounds. If the window for wt already covers the range,
// a pointer into it is returned. Otherwise the old window is unmapped and
// a new one is created through journal_file_map(). Requests smaller than
// DEFAULT_WINDOW_SIZE are widened to that size and clamped to the end of
// the file; delta is the distance from the window start to the requested
// offset.
//
// NOTE(review): offset + size is not checked for wrap-around.
293 static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
302 assert(wt < _WINDOW_MAX);
304 if (offset + size > (uint64_t) f->last_stat.st_size) {
305 /* Hmm, out of range? Let's refresh the fstat() data
306 * first, before we trust that check. */
308 if (fstat(f->fd, &f->last_stat) < 0 ||
309 offset + size > (uint64_t) f->last_stat.st_size)
310 return -EADDRNOTAVAIL;
// Fast path: the existing window already spans the requested range.
315 if (_likely_(w->ptr &&
316 w->offset <= offset &&
317 w->offset + w->size >= offset + size)) {
319 *ret = (uint8_t*) w->ptr + (offset - w->offset);
324 if (munmap(w->ptr, w->size) < 0)
328 w->size = w->offset = 0;
331 if (size < DEFAULT_WINDOW_SIZE) {
332 /* If the default window size is larger than what was
333 * asked for extend the mapping a bit in the hope to
334 * minimize needed remappings later on. We add half
335 * the window space before and half behind the
336 * requested mapping */
338 delta = (DEFAULT_WINDOW_SIZE - size) / 2;
344 size = DEFAULT_WINDOW_SIZE;
// Never map past the end of the file.
348 if (offset + size > (uint64_t) f->last_stat.st_size)
349 size = (uint64_t) f->last_stat.st_size - offset;
352 return -EADDRNOTAVAIL;
354 r = journal_file_map(f,
356 &w->ptr, &w->offset, &w->size,
362 *ret = (uint8_t*) p + delta;
// Recompute the payload hash of an object and compare it with the stored
// one.
//
// Only uncompressed OBJECT_DATA objects and OBJECT_FIELD objects are
// hashed. h1 is the stored hash. h2 is hash64() over the payload, whose
// length is the object size minus the offset of the payload member.
366 static bool verify_hash(Object *o) {
371 if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
372 h1 = le64toh(o->data.hash);
373 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
374 } else if (o->object.type == OBJECT_FIELD) {
375 h1 = le64toh(o->field.hash);
376 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
// Map the object at the given offset and hand it back through ret.
//
// type is either an object type to require or a negative value meaning
// any type. Only the ObjectHeader is mapped at first (in the window named
// after type, or WINDOW_UNKNOWN). The stored size must be at least
// sizeof(ObjectHeader) and, when a type was requested, the stored type
// must match. Objects larger than the header are then mapped again in
// full, using the window that corresponds to the stored type.
383 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
391 assert(type < _OBJECT_TYPE_MAX);
393 r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
398 s = le64toh(o->object.size);
400 if (s < sizeof(ObjectHeader))
403 if (type >= 0 && o->object.type != type)
406 if (s > sizeof(ObjectHeader)) {
407 r = journal_file_move_to(f, o->object.type, offset, s, &t);
// Pick the sequence number for the next entry and record it in the
// header.
//
// The candidate is the header sequence number plus one. An optional
// external counter is reconciled with it as described in the comment
// below. The chosen value is stored as the header seqnum, and also as
// first_seqnum while that field is still zero.
421 static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
426 r = le64toh(f->header->seqnum) + 1;
429 /* If an external seqnum counter was passed, we update
430 * both the local and the external one, and set it to
431 * the maximum of both */
439 f->header->seqnum = htole64(r);
441 if (f->header->first_seqnum == 0)
442 f->header->first_seqnum = htole64(r);
// Append a new object of the given type and size at the end of the
// arena.
//
// The insertion point p is derived from the header tail_object_offset.
// It is either arena_offset itself (presumably when no object exists
// yet) or the tail object offset advanced by that object size rounded up
// to 8 bytes. The file is grown as needed, the new range is mapped, the
// object type and little-endian size are filled in, and the header tail
// pointer and object counter are updated.
447 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
454 assert(size >= sizeof(ObjectHeader));
458 p = le64toh(f->header->tail_object_offset);
460 p = le64toh(f->header->arena_offset);
462 r = journal_file_move_to_object(f, -1, p, &tail);
466 p += ALIGN64(le64toh(tail->object.size));
469 r = journal_file_allocate(f, p, size);
473 r = journal_file_move_to(f, type, p, size, &t);
480 o->object.type = type;
481 o->object.size = htole64(size);
483 f->header->tail_object_offset = htole64(p);
484 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
// Create the data hash table object in a new file.
//
// An OBJECT_DATA_HASH_TABLE object with DEFAULT_DATA_HASH_TABLE_SIZE
// bytes of items is appended and zeroed. The header records the file
// offset of the items array (not of the object itself) and its size in
// bytes.
492 static int journal_file_setup_data_hash_table(JournalFile *f) {
499 s = DEFAULT_DATA_HASH_TABLE_SIZE;
500 r = journal_file_append_object(f,
501 OBJECT_DATA_HASH_TABLE,
502 offsetof(Object, hash_table.items) + s,
507 memset(o->hash_table.items, 0, s);
509 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
510 f->header->data_hash_table_size = htole64(s);
// Create the field hash table object in a new file.
//
// An OBJECT_FIELD_HASH_TABLE object with DEFAULT_FIELD_HASH_TABLE_SIZE
// bytes of items is appended and zeroed. The header records the file
// offset of the items array (not of the object itself) and its size in
// bytes.
515 static int journal_file_setup_field_hash_table(JournalFile *f) {
522 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
523 r = journal_file_append_object(f,
524 OBJECT_FIELD_HASH_TABLE,
525 offsetof(Object, hash_table.items) + s,
530 memset(o->hash_table.items, 0, s);
532 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
533 f->header->field_hash_table_size = htole64(s);
// Map the data hash table items and cache the pointer in
// f->data_hash_table. Offset and size come from the header; the mapping
// lives in the WINDOW_DATA_HASH_TABLE window.
538 static int journal_file_map_data_hash_table(JournalFile *f) {
545 p = le64toh(f->header->data_hash_table_offset);
546 s = le64toh(f->header->data_hash_table_size);
548 r = journal_file_move_to(f,
549 WINDOW_DATA_HASH_TABLE,
555 f->data_hash_table = t;
// Map the field hash table items and cache the pointer in
// f->field_hash_table. Offset and size come from the header; the mapping
// lives in the WINDOW_FIELD_HASH_TABLE window.
559 static int journal_file_map_field_hash_table(JournalFile *f) {
566 p = le64toh(f->header->field_hash_table_offset);
567 s = le64toh(f->header->field_hash_table_size);
569 r = journal_file_move_to(f,
570 WINDOW_FIELD_HASH_TABLE,
576 f->field_hash_table = t;
// Insert a freshly appended data object into the data hash table.
//
// The link and entry bookkeeping fields of the object are reset first.
// The bucket index is hash modulo the number of HashItem slots (table
// byte size divided by sizeof(HashItem)). Either the bucket head is set
// to the new offset, or the next_hash_offset of an existing object is
// patched to point at the new one. The bucket tail always ends up at the
// new offset. Moving to the other object may remap the data window, so
// the caller must not rely on its object pointer afterwards.
580 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
587 assert(o->object.type == OBJECT_DATA);
589 /* This might alter the window we are looking at */
591 o->data.next_hash_offset = o->data.next_field_offset = 0;
592 o->data.entry_offset = o->data.entry_array_offset = 0;
593 o->data.n_entries = 0;
595 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
596 p = le64toh(f->data_hash_table[h].head_hash_offset);
598 /* Only entry in the hash table is easy */
599 f->data_hash_table[h].head_hash_offset = htole64(offset);
601 /* Move back to the previous data object, to patch in
604 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
608 o->data.next_hash_offset = htole64(offset);
611 f->data_hash_table[h].tail_hash_offset = htole64(offset);
// Look up a data object by payload, given its precomputed hash.
//
// A zero data_hash_table_size short-circuits the search. Otherwise the
// bucket chain for the hash is walked via next_hash_offset. Objects with
// a different stored hash are skipped. Compressed objects are
// decompressed into f->compress_buffer and compared with memcmp().
// Uncompressed objects must have exactly the expected object size and an
// identical payload. -EPROTONOSUPPORT is returned on a path that
// presumably corresponds to builds without compression support.
616 int journal_file_find_data_object_with_hash(
618 const void *data, uint64_t size, uint64_t hash,
619 Object **ret, uint64_t *offset) {
621 uint64_t p, osize, h;
625 assert(data || size == 0);
// Object size an uncompressed match must have.
627 osize = offsetof(Object, data.payload) + size;
629 if (f->header->data_hash_table_size == 0)
632 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
633 p = le64toh(f->data_hash_table[h].head_hash_offset);
638 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
642 if (le64toh(o->data.hash) != hash)
645 if (o->object.flags & OBJECT_COMPRESSED) {
649 l = le64toh(o->object.size);
650 if (l <= offsetof(Object, data.payload))
653 l -= offsetof(Object, data.payload);
655 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
659 memcmp(f->compress_buffer, data, size) == 0) {
670 return -EPROTONOSUPPORT;
673 } else if (le64toh(o->object.size) == osize &&
674 memcmp(o->data.payload, data, size) == 0) {
686 p = le64toh(o->data.next_hash_offset);
// Convenience wrapper: hash the payload with hash64() and delegate to
// journal_file_find_data_object_with_hash().
692 int journal_file_find_data_object(
694 const void *data, uint64_t size,
695 Object **ret, uint64_t *offset) {
700 assert(data || size == 0);
702 hash = hash64(data, size);
704 return journal_file_find_data_object_with_hash(f,
// Return the data object holding the given payload, appending one if
// needed.
//
// The payload is hashed and looked up first. For a new object, space for
// the uncompressed payload is appended and the hash stored. Payloads of
// at least COMPRESSION_SIZE_THRESHOLD bytes are offered to
// compress_blob(). If compression succeeds the object size is shrunk to
// the compressed length, OBJECT_COMPRESSED is set on the object and
// HEADER_INCOMPATIBLE_COMPRESSED on the header; otherwise the payload is
// copied verbatim. The object is then linked into the hash table and
// mapped again.
709 static int journal_file_append_data(
711 const void *data, uint64_t size,
712 Object **ret, uint64_t *offset) {
718 bool compressed = false;
721 assert(data || size == 0);
723 hash = hash64(data, size);
725 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
739 osize = offsetof(Object, data.payload) + size;
740 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
744 o->data.hash = htole64(hash);
748 size >= COMPRESSION_SIZE_THRESHOLD) {
751 compressed = compress_blob(data, size, o->data.payload, &rsize);
754 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
755 o->object.flags |= OBJECT_COMPRESSED;
757 f->header->incompatible_flags = htole32(le32toh(f->header->incompatible_flags) | HEADER_INCOMPATIBLE_COMPRESSED);
759 log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
765 memcpy(o->data.payload, data, size);
767 r = journal_file_link_data(f, o, p, hash);
771 /* The linking might have altered the window, so let's
772 * refresh our pointer */
773 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
// Number of EntryItem slots in an entry object: the object size minus
// the offset of the items member, divided by sizeof(EntryItem).
786 uint64_t journal_file_entry_n_items(Object *o) {
788 assert(o->object.type == OBJECT_ENTRY);
790 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
// Number of 64-bit offsets an entry array object can hold: the object
// size minus the offset of the items member, divided by sizeof(uint64_t).
793 static uint64_t journal_file_entry_array_n_items(Object *o) {
795 assert(o->object.type == OBJECT_ENTRY_ARRAY);
797 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
// Store entry offset p at logical index *idx of a chained list of entry
// array objects, then increment *idx.
//
// *idx is stored little-endian. The chain is followed through
// next_entry_array_offset. If the index falls inside an existing array
// the slot is written there. Otherwise a new OBJECT_ENTRY_ARRAY with room
// for n items is appended at offset q, the entry is written into it, and
// the previous array at ap gets its next pointer set to q.
800 static int link_entry_into_array(JournalFile *f,
805 uint64_t n = 0, ap = 0, q, i, a, hidx;
// hidx keeps the original index; i is consumed while walking the chain.
814 i = hidx = le64toh(*idx);
817 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
821 n = journal_file_entry_array_n_items(o);
823 o->entry_array.items[i] = htole64(p);
824 *idx = htole64(hidx + 1);
830 a = le64toh(o->entry_array.next_entry_array_offset);
841 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
842 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
847 o->entry_array.items[i] = htole64(p);
852 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
856 o->entry_array.next_entry_array_offset = htole64(q);
859 *idx = htole64(hidx + 1);
// Variant of link_entry_into_array() for lists that keep one entry
// outside the array chain. The chain is addressed with a temporary index
// of *idx minus one, and *idx itself is incremented afterwards.
864 static int link_entry_into_array_plus_one(JournalFile *f,
883 i = htole64(le64toh(*idx) - 1);
884 r = link_entry_into_array(f, first, &i, p);
889 *idx = htole64(le64toh(*idx) + 1);
// Link an entry into the per-data entry list of the data object
// referenced by item i of that entry. The data object offset is read
// from the entry, the object is mapped, and its entry_offset and
// entry_array_offset fields are passed to
// link_entry_into_array_plus_one().
893 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
900 p = le64toh(o->entry.items[i].object_offset);
904 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
908 return link_entry_into_array_plus_one(f,
909 &o->data.entry_offset,
910 &o->data.entry_array_offset,
// Make a fully written entry object reachable.
//
// After a memory barrier the entry is added to the global entry array
// tracked by the header (entry_array_offset, n_entries). The header head
// realtime is set if still zero, and the tail realtime and monotonic
// timestamps are overwritten with those of this entry. The entry is then
// linked into the entry list of each data object it references.
915 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
922 assert(o->object.type == OBJECT_ENTRY);
924 __sync_synchronize();
926 /* Link up the entry itself */
927 r = link_entry_into_array(f,
928 &f->header->entry_array_offset,
929 &f->header->n_entries,
934 /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
936 if (f->header->head_entry_realtime == 0)
937 f->header->head_entry_realtime = o->entry.realtime;
939 f->header->tail_entry_realtime = o->entry.realtime;
940 f->header->tail_entry_monotonic = o->entry.monotonic;
// The tail monotonic timestamp now belongs to the current boot.
942 f->tail_entry_monotonic_valid = true;
944 /* Link up the items */
945 n = journal_file_entry_n_items(o);
946 for (i = 0; i < n; i++) {
947 r = journal_file_link_entry_item(f, o, offset, i);
// Append an entry object whose items already reference existing data
// objects.
//
// The object is sized for n_items EntryItem slots. It receives the next
// sequence number, a copy of the items, the realtime and monotonic
// timestamps from ts, the combined xor_hash and the boot id currently
// stored in the header. It is then linked via journal_file_link_entry().
955 static int journal_file_append_entry_internal(
957 const dual_timestamp *ts,
959 const EntryItem items[], unsigned n_items,
961 Object **ret, uint64_t *offset) {
968 assert(items || n_items == 0);
971 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
973 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
977 o->entry.seqnum = htole64(journal_file_seqnum(f, seqnum));
978 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
979 o->entry.realtime = htole64(ts->realtime);
980 o->entry.monotonic = htole64(ts->monotonic);
981 o->entry.xor_hash = htole64(xor_hash);
982 o->entry.boot_id = f->header->boot_id;
984 r = journal_file_link_entry(f, o, np);
// Notify inotify watchers that the file changed. After a memory barrier
// the file is truncated to its cached current size, which leaves the
// content unchanged. A failure is only logged.
//
// NOTE(review): the log message below contains a doubled word. It is
// runtime text and therefore left unchanged here.
997 void journal_file_post_change(JournalFile *f) {
1000 /* inotify() does not receive IN_MODIFY events from file
1001 * accesses done via mmap(). After each access we hence
1002 * trigger IN_MODIFY by truncating the journal file to its
1003 * current size which triggers IN_MODIFY. */
1005 __sync_synchronize();
1007 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1008 log_error("Failed to to truncate file to its own size: %m");
// Append one log entry made of n_iovec fields.
//
// A local timestamp obtained from dual_timestamp_get() is available as a
// fallback (presumably used when ts is NULL; confirm). The monotonic
// timestamp is checked against the header tail monotonic timestamp when
// that value is valid for this boot. Each iovec is stored as a data
// object, reusing an identical existing one. The item array records the
// object offset and hash of each, and the hashes are XORed into
// xor_hash. The entry object is then appended and
// journal_file_post_change() is called.
//
// NOTE(review): the item array is taken from the stack with alloca() and
// its size scales with n_iovec; confirm callers bound n_iovec.
1011 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1015 uint64_t xor_hash = 0;
1016 struct dual_timestamp _ts;
1019 assert(iovec || n_iovec == 0);
1025 dual_timestamp_get(&_ts);
1029 if (f->tail_entry_monotonic_valid &&
1030 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1033 items = alloca(sizeof(EntryItem) * n_iovec);
1035 for (i = 0; i < n_iovec; i++) {
1039 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1043 xor_hash ^= le64toh(o->data.hash);
1044 items[i].object_offset = htole64(p);
1045 items[i].hash = o->data.hash;
1048 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1050 journal_file_post_change(f);
// Fetch the entry stored at a logical index of a chained entry array
// list.
//
// The chain is followed through next_entry_array_offset. Once the index
// falls inside an array, the entry offset p is read from it. A zero
// array offset or entry offset ends the lookup without a result.
// Otherwise the entry object at p is mapped.
1055 static int generic_array_get(JournalFile *f,
1058 Object **ret, uint64_t *offset) {
1070 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1074 n = journal_file_entry_array_n_items(o);
1076 p = le64toh(o->entry_array.items[i]);
1081 a = le64toh(o->entry_array.next_entry_array_offset);
1084 if (a <= 0 || p <= 0)
1087 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
// Variant of generic_array_get() for lists that keep one entry (extra)
// outside the array chain. The extra entry can be mapped directly; any
// other index is shifted down by one and resolved through
// generic_array_get().
1100 static int generic_array_get_plus_one(JournalFile *f,
1104 Object **ret, uint64_t *offset) {
1113 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1126 return generic_array_get(f, first, i-1, ret, offset);
// Search a chained entry array list for needle, using the test_object
// callback to classify entries.
//
// Arrays are visited in chain order. For each array an item is tested
// first (index i) to decide whether the search must continue in this
// array. If so, the array is bisected between left and right. A
// TEST_FOUND result is rewritten to TEST_RIGHT for DIRECTION_DOWN and
// TEST_LEFT otherwise, so the bisection keeps narrowing. t is added to
// the in-array index i to form the overall index. subtract_one marks the
// DIRECTION_UP case where the entry before the bisection point is
// wanted. The selected entry is mapped and its overall index is stored
// in *idx.
1135 static int generic_array_bisect(JournalFile *f,
1139 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1140 direction_t direction,
1145 uint64_t a, p, t = 0, i = 0, last_p = 0;
1146 bool subtract_one = false;
1147 Object *o, *array = NULL;
1151 assert(test_object);
1155 uint64_t left, right, k, lp;
1157 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
// k is the capacity of the current array.
1161 k = journal_file_entry_array_n_items(array);
1167 lp = p = le64toh(array->entry_array.items[i]);
1171 r = test_object(f, p, needle);
1175 if (r == TEST_FOUND)
1176 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1178 if (r == TEST_RIGHT) {
1182 if (left == right) {
1183 if (direction == DIRECTION_UP)
1184 subtract_one = true;
1190 assert(left < right);
1192 i = (left + right) / 2;
1193 p = le64toh(array->entry_array.items[i]);
1197 r = test_object(f, p, needle);
1201 if (r == TEST_FOUND)
1202 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1204 if (r == TEST_RIGHT)
1218 a = le64toh(array->entry_array.next_entry_array_offset);
1224 if (subtract_one && t == 0 && i == 0)
1227 if (subtract_one && i == 0)
1229 else if (subtract_one)
1230 p = le64toh(array->entry_array.items[i-1]);
1232 p = le64toh(array->entry_array.items[i]);
1234 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1245 *idx = t + i - (subtract_one ? 1 : 0);
// Variant of generic_array_bisect() for lists that keep one entry
// (extra) outside the array chain. The extra entry is classified with
// test_object first. On TEST_FOUND it can be mapped directly. Otherwise
// the remaining n-1 entries in the chain starting at first are bisected.
1250 static int generic_array_bisect_plus_one(JournalFile *f,
1255 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1256 direction_t direction,
1264 assert(test_object);
1269 /* This bisects the array in object 'first', but first checks
1271 r = test_object(f, extra, needle);
1274 else if (r == TEST_FOUND) {
1277 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1291 } else if (r == TEST_RIGHT)
1294 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
// Bisection callback: map the entry at p and compare its sequence number
// with needle (equal, less than, or otherwise).
1302 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1309 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1313 if (le64toh(o->entry.seqnum) == needle)
1315 else if (le64toh(o->entry.seqnum) < needle)
// Seek within the global entry array of the file. The search delegates
// to generic_array_bisect() over the header entry_array_offset and
// n_entries.
1321 int journal_file_move_to_entry_by_seqnum(
1324 direction_t direction,
1328 return generic_array_bisect(f,
1329 le64toh(f->header->entry_array_offset),
1330 le64toh(f->header->n_entries),
// Bisection callback: map the entry at p and compare its realtime
// timestamp with needle (equal, less than, or otherwise).
1337 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1344 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1348 if (le64toh(o->entry.realtime) == needle)
1350 else if (le64toh(o->entry.realtime) < needle)
// Seek within the global entry array of the file by realtime timestamp,
// using test_object_realtime as the comparison callback.
1356 int journal_file_move_to_entry_by_realtime(
1359 direction_t direction,
1363 return generic_array_bisect(f,
1364 le64toh(f->header->entry_array_offset),
1365 le64toh(f->header->n_entries),
1367 test_object_realtime,
// Bisection callback: map the entry at p and compare its monotonic
// timestamp with needle (equal, less than, or otherwise).
1372 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1379 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1383 if (le64toh(o->entry.monotonic) == needle)
1385 else if (le64toh(o->entry.monotonic) < needle)
// Seek by monotonic timestamp within the entries of one boot.
//
// A field string is built from the _BOOT_ID= prefix and the textual boot
// id. The matching data object is looked up, and its per-data entry list
// is bisected with test_object_monotonic.
//
// NOTE(review): the prefix _BOOT_ID= is 9 characters long, but the id
// text is written at t + 8, which overwrites the = sign, and the buffer
// is sized 8+32+1. The lookup key therefore seems to lack the separator.
// Verify; an offset of 9 and a size of 9+32+1 look like the intent.
1391 int journal_file_move_to_entry_by_monotonic(
1395 direction_t direction,
1399 char t[8+32+1] = "_BOOT_ID=";
1403 sd_id128_to_string(boot_id, t + 8);
1405 r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1411 return generic_array_bisect_plus_one(f,
1412 le64toh(o->data.entry_offset),
1413 le64toh(o->data.entry_array_offset),
1414 le64toh(o->data.n_entries),
1416 test_object_monotonic,
// Bisection callback that compares the entry offset p itself with
// needle; no object is mapped.
1421 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1427 else if (p < needle)
// Step to the entry adjacent to (o, p) in the global entry array.
//
// Without a current entry, the index starts at 0 for DIRECTION_DOWN and
// at n - 1 otherwise. With a current entry, which must be of type
// OBJECT_ENTRY, its index is located by bisecting the global entry
// array, and the index is then adjusted according to direction. The
// entry at the resulting index is fetched with generic_array_get().
1433 int journal_file_next_entry(
1435 Object *o, uint64_t p,
1436 direction_t direction,
1437 Object **ret, uint64_t *offset) {
1443 assert(p > 0 || !o);
1445 n = le64toh(f->header->n_entries);
1450 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1452 if (o->object.type != OBJECT_ENTRY)
1455 r = generic_array_bisect(f,
1456 le64toh(f->header->entry_array_offset),
1457 le64toh(f->header->n_entries),
1466 if (direction == DIRECTION_DOWN) {
1479 /* And jump to it */
1480 return generic_array_get(f,
1481 le64toh(f->header->entry_array_offset),
// Move a signed number of entries away from the current entry in the
// global entry array.
//
// The current entry must be of type OBJECT_ENTRY. Its index i is found
// by bisecting the global entry array. A negative skip is subtracted
// from i, with a guard for the case where its magnitude is at least i. A
// positive skip is added. The result is checked against n_entries and
// the entry is fetched with generic_array_get().
1486 int journal_file_skip_entry(
1488 Object *o, uint64_t p,
1490 Object **ret, uint64_t *offset) {
1499 if (o->object.type != OBJECT_ENTRY)
1502 r = generic_array_bisect(f,
1503 le64toh(f->header->entry_array_offset),
1504 le64toh(f->header->n_entries),
1513 /* Calculate new index */
1515 if ((uint64_t) -skip >= i)
1518 i = i - (uint64_t) -skip;
1520 i += (uint64_t) skip;
1522 n = le64toh(f->header->n_entries);
1529 return generic_array_get(f,
1530 le64toh(f->header->entry_array_offset),
// Step to the adjacent entry within the entry list of one data object.
//
// The data object at data_offset is mapped and its n_entries read.
// Without a current entry, the index starts at 0 for DIRECTION_DOWN and
// at n - 1 otherwise. With a current entry (type OBJECT_ENTRY), its
// index is located by bisecting the per-data list, and the index is
// adjusted according to direction. The entry is then fetched with
// generic_array_get_plus_one().
1535 int journal_file_next_entry_for_data(
1537 Object *o, uint64_t p,
1538 uint64_t data_offset,
1539 direction_t direction,
1540 Object **ret, uint64_t *offset) {
1547 assert(p > 0 || !o);
1549 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1553 n = le64toh(d->data.n_entries);
1558 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1560 if (o->object.type != OBJECT_ENTRY)
1563 r = generic_array_bisect_plus_one(f,
1564 le64toh(d->data.entry_offset),
1565 le64toh(d->data.entry_array_offset),
1566 le64toh(d->data.n_entries),
1576 if (direction == DIRECTION_DOWN) {
1590 return generic_array_get_plus_one(f,
1591 le64toh(d->data.entry_offset),
1592 le64toh(d->data.entry_array_offset),
// Seek within the entry list of the data object at data_offset. The data
// object is mapped, and its entry_offset, entry_array_offset and
// n_entries are passed to generic_array_bisect_plus_one().
1597 int journal_file_move_to_entry_by_seqnum_for_data(
1599 uint64_t data_offset,
1601 direction_t direction,
1602 Object **ret, uint64_t *offset) {
1607 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1611 return generic_array_bisect_plus_one(f,
1612 le64toh(d->data.entry_offset),
1613 le64toh(d->data.entry_array_offset),
1614 le64toh(d->data.n_entries),
// Seek by realtime timestamp within the entry list of the data object at
// data_offset, using test_object_realtime as the comparison callback.
1621 int journal_file_move_to_entry_by_realtime_for_data(
1623 uint64_t data_offset,
1625 direction_t direction,
1626 Object **ret, uint64_t *offset) {
1631 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1635 return generic_array_bisect_plus_one(f,
1636 le64toh(d->data.entry_offset),
1637 le64toh(d->data.entry_array_offset),
1638 le64toh(d->data.n_entries),
1640 test_object_realtime,
// Print a human-readable summary of the file to standard output.
//
// Header fields are printed first; the three ids are formatted into
// 33-byte buffers. Every object is then visited, starting at
// arena_offset and advancing by the object size rounded up to 8 bytes.
// The walk ends at the object whose offset equals tail_object_offset.
// The type of each object is printed (entries also show seqnum,
// monotonic and realtime), plus a flag line for compressed objects.
// File corrupt is logged on the error path.
//
// NOTE(review): n_objects and n_entries are 64-bit values printed
// through an unsigned long cast, which may truncate where unsigned long
// is 32 bits wide.
1645 void journal_file_dump(JournalFile *f) {
1646 char a[33], b[33], c[33];
1653 printf("File Path: %s\n"
1657 "Arena size: %llu\n"
1661 sd_id128_to_string(f->header->file_id, a),
1662 sd_id128_to_string(f->header->machine_id, b),
1663 sd_id128_to_string(f->header->boot_id, c),
1664 (unsigned long long) le64toh(f->header->arena_size),
1665 (unsigned long) le64toh(f->header->n_objects),
1666 (unsigned long) le64toh(f->header->n_entries));
1668 p = le64toh(f->header->arena_offset);
1670 r = journal_file_move_to_object(f, -1, p, &o);
1674 switch (o->object.type) {
1677 printf("Type: OBJECT_UNUSED\n");
1681 printf("Type: OBJECT_DATA\n");
1685 printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1686 (unsigned long long) le64toh(o->entry.seqnum),
1687 (unsigned long long) le64toh(o->entry.monotonic),
1688 (unsigned long long) le64toh(o->entry.realtime));
1691 case OBJECT_FIELD_HASH_TABLE:
1692 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1695 case OBJECT_DATA_HASH_TABLE:
1696 printf("Type: OBJECT_DATA_HASH_TABLE\n");
1699 case OBJECT_ENTRY_ARRAY:
1700 printf("Type: OBJECT_ENTRY_ARRAY\n");
1704 if (o->object.flags & OBJECT_COMPRESSED)
1705 printf("Flags: COMPRESSED\n");
1707 if (p == le64toh(f->header->tail_object_offset))
1710 p = p + ALIGN64(le64toh(o->object.size));
1715 log_error("File corrupt");
// Open or create a journal file and return a JournalFile through ret.
//
// Only O_RDONLY and O_RDWR access modes are accepted, and the name must
// end in .journal. Metrics and the compress setting are copied from the
// template when one is used. The file is opened with O_CLOEXEC. An empty
// file opened writable counts as newly created and gets a fresh header.
// A file shorter than sizeof(Header) is rejected. The header is mapped
// MAP_SHARED. Existing files have their header verified. The header is
// refreshed on a path that presumably applies to writable files only.
// New files get both hash tables created, after which both tables are
// mapped. The failure path releases everything through
// journal_file_close().
1718 int journal_file_open(
1722 JournalFile *template,
1723 JournalFile **ret) {
1727 bool newly_created = false;
1731 if ((flags & O_ACCMODE) != O_RDONLY &&
1732 (flags & O_ACCMODE) != O_RDWR)
1735 if (!endswith(fname, ".journal"))
1738 f = new0(JournalFile, 1);
1745 f->writable = (flags & O_ACCMODE) != O_RDONLY;
1746 f->prot = prot_from_flags(flags);
1749 f->metrics = template->metrics;
1750 f->compress = template->compress;
1753 f->path = strdup(fname);
1759 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
1765 if (fstat(f->fd, &f->last_stat) < 0) {
1770 if (f->last_stat.st_size == 0 && f->writable) {
1771 newly_created = true;
1773 r = journal_file_init_header(f, template);
// Pick up the new file size after the header was written.
1777 if (fstat(f->fd, &f->last_stat) < 0) {
1783 if (f->last_stat.st_size < (off_t) sizeof(Header)) {
1788 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
1789 if (f->header == MAP_FAILED) {
1795 if (!newly_created) {
1796 r = journal_file_verify_header(f);
1802 r = journal_file_refresh_header(f);
1807 if (newly_created) {
1809 r = journal_file_setup_field_hash_table(f);
1813 r = journal_file_setup_data_hash_table(f);
1818 r = journal_file_map_field_hash_table(f);
1822 r = journal_file_map_data_hash_table(f);
1832 journal_file_close(f);
// Archive the current journal file and start a new one under the same
// path.
//
// Only writable files whose path ends in .journal are rotated. The
// archive name is built from the old path without its 8-character
// .journal suffix, followed by the seqnum id, the header seqnum and the
// tail realtime timestamp (both as 16 hex digits), and the .journal
// suffix again. After the rename the old header is marked
// STATE_ARCHIVED. A new file is opened at the original path with the old
// file as template, and the old file is closed.
1837 int journal_file_rotate(JournalFile **f) {
1840 JournalFile *old_file, *new_file = NULL;
1848 if (!old_file->writable)
1851 if (!endswith(old_file->path, ".journal"))
1854 l = strlen(old_file->path);
// Room for the old path plus the id and the two hex fields; the suffix
// being replaced is already counted in l.
1856 p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
1860 memcpy(p, old_file->path, l - 8);
1862 sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
1863 snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
1864 "-%016llx-%016llx.journal",
1865 (unsigned long long) le64toh((*f)->header->seqnum),
1866 (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
1868 r = rename(old_file->path, p);
1874 old_file->header->state = STATE_ARCHIVED;
1876 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, old_file, &new_file);
1877 journal_file_close(old_file);
// Open a journal file, and if it turns out to be unusable, move it aside
// and create a fresh one.
//
// The first attempt goes straight to journal_file_open(). Only the
// errors listed below (corrupted, truncated, other machine, incompatible
// feature) trigger recovery, and only when the file was opened for
// writing with O_CREAT. In that case the file is renamed to a name
// ending in .journal~ that embeds the current realtime clock, a warning
// is logged and the open is retried exactly once.
1883 int journal_file_open_reliably(
1887 JournalFile *template,
1888 JournalFile **ret) {
1894 r = journal_file_open(fname, flags, mode, template, ret);
1895 if (r != -EBADMSG && /* corrupted */
1896 r != -ENODATA && /* truncated */
1897 r != -EHOSTDOWN && /* other machine */
1898 r != -EPROTONOSUPPORT) /* incompatible feature */
1901 if ((flags & O_ACCMODE) == O_RDONLY)
1904 if (!(flags & O_CREAT))
1907 /* The file is corrupted. Rotate it away and try it again (but only once) */
1910 if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
1912 (unsigned long long) now(CLOCK_REALTIME),
1916 r = rename(fname, p);
1921 log_warning("File %s corrupted, renaming and replacing.", fname);
1923 return journal_file_open(fname, flags, mode, template, ret);
// One candidate file collected by journal_directory_vacuum(). That
// function also fills filename, usage, seqnum, realtime and have_seqnum
// members of this record.
1926 struct vacuum_info {
// Sequence number id parsed from the file name (archived files only).
1931 sd_id128_t seqnum_id;
// qsort() comparator for struct vacuum_info records.
//
// Two records that both carry a sequence number from the same seqnum id
// are ordered by sequence number. Otherwise records are ordered by
// realtime timestamp. Ties are broken by the raw bytes of the seqnum ids
// when both have one, and finally by file name.
1937 static int vacuum_compare(const void *_a, const void *_b) {
1938 const struct vacuum_info *a, *b;
1943 if (a->have_seqnum && b->have_seqnum &&
1944 sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
1945 if (a->seqnum < b->seqnum)
1947 else if (a->seqnum > b->seqnum)
1953 if (a->realtime < b->realtime)
1955 else if (a->realtime > b->realtime)
1957 else if (a->have_seqnum && b->have_seqnum)
1958 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
1960 return strcmp(a->filename, b->filename);
// Delete old archived and corrupted journal files from a directory until
// the disk usage and free space limits are met.
//
// Only regular files are considered. Archived files end in .journal and
// must contain an @, a 32-character id and two 16-digit hex fields
// separated by dashes. Corrupted files end in .journal~ and carry an @
// and two 16-digit hex fields. For every match the name is duplicated
// and recorded together with its disk usage (st_blocks times 512); usage
// is accumulated in sum. The list grows by doubling, starting at 8
// records. After sorting with vacuum_compare, files are unlinked in
// order until sum is at most max_use and the available space reported by
// fstatvfs() is at least min_free. Unlink failures other than ENOENT are
// logged as warnings. All duplicated names are freed at the end.
//
// NOTE(review): readdir_r() is deprecated in current glibc; readdir() is
// the recommended replacement.
1963 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
1966 struct vacuum_info *list = NULL;
1967 unsigned n_list = 0, n_allocated = 0, i;
1975 d = opendir(directory);
1981 struct dirent buf, *de;
1985 unsigned long long seqnum = 0, realtime;
1986 sd_id128_t seqnum_id;
1989 k = readdir_r(d, &buf, &de);
1998 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
2001 if (!S_ISREG(st.st_mode))
2004 q = strlen(de->d_name);
2006 if (endswith(de->d_name, ".journal")) {
2008 /* Vacuum archived files */
2010 if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
2013 if (de->d_name[q-8-16-1] != '-' ||
2014 de->d_name[q-8-16-1-16-1] != '-' ||
2015 de->d_name[q-8-16-1-16-1-32-1] != '@')
2018 p = strdup(de->d_name);
// Terminate the name after the id so it can be parsed on its own. The
// unmodified name was duplicated into p just above.
2024 de->d_name[q-8-16-1-16-1] = 0;
2025 if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
2030 if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
2037 } else if (endswith(de->d_name, ".journal~")) {
2038 unsigned long long tmp;
2040 /* Vacuum corrupted files */
2042 if (q < 1 + 16 + 1 + 16 + 8 + 1)
2045 if (de->d_name[q-1-8-16-1] != '-' ||
2046 de->d_name[q-1-8-16-1-16-1] != '@')
2049 p = strdup(de->d_name);
2055 if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
2060 have_seqnum = false;
2064 if (n_list >= n_allocated) {
2065 struct vacuum_info *j;
2067 n_allocated = MAX(n_allocated * 2U, 8U);
2068 j = realloc(list, n_allocated * sizeof(struct vacuum_info));
2078 list[n_list].filename = p;
2079 list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
2080 list[n_list].seqnum = seqnum;
2081 list[n_list].realtime = realtime;
2082 list[n_list].seqnum_id = seqnum_id;
2083 list[n_list].have_seqnum = have_seqnum;
2085 sum += list[n_list].usage;
2090 qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
2092 for(i = 0; i < n_list; i++) {
2095 if (fstatvfs(dirfd(d), &ss) < 0) {
// Stop as soon as both limits are satisfied.
2100 if (sum <= max_use &&
2101 (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
2104 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
2105 log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
2106 sum -= list[i].usage;
2107 } else if (errno != ENOENT)
2108 log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
2112 for (i = 0; i < n_list; i++)
2113 free(list[i].filename);
// Copy the entry o (located at offset p in from) into the journal file
// to.
//
// The entry timestamps are checked against the tail timestamps of the
// destination. For every item the referenced data object is mapped in
// the source and its hash is compared with the hash recorded in the
// item. The payload length is checked to fit the narrower type of t.
// Compressed payloads are decompressed into the source compress buffer;
// the -EPROTONOSUPPORT path presumably covers builds without
// compression. The payload is appended to the destination, and the new
// offset and hash are recorded. Because mapping data objects reuses the
// variable o, the source entry is mapped again on each pass. Finally the
// entry itself is appended to the destination.
//
// NOTE(review): the item array is taken from the stack with alloca() and
// its size comes from the object size stored in the file.
2123 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2125 uint64_t q, xor_hash = 0;
2138 ts.monotonic = le64toh(o->entry.monotonic);
2139 ts.realtime = le64toh(o->entry.realtime);
2141 if (to->tail_entry_monotonic_valid &&
2142 ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2145 if (ts.realtime < le64toh(to->header->tail_entry_realtime))
2148 n = journal_file_entry_n_items(o);
2149 items = alloca(sizeof(EntryItem) * n);
2151 for (i = 0; i < n; i++) {
2158 q = le64toh(o->entry.items[i].object_offset);
2159 le_hash = o->entry.items[i].hash;
2161 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2165 if (le_hash != o->data.hash)
2168 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2171 /* We hit the limit on 32bit machines */
2172 if ((uint64_t) t != l)
2175 if (o->object.flags & OBJECT_COMPRESSED) {
2179 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2182 data = from->compress_buffer;
2185 return -EPROTONOSUPPORT;
2188 data = o->data.payload;
2190 r = journal_file_append_data(to, data, l, &u, &h);
2194 xor_hash ^= le64toh(u->data.hash);
2195 items[i].object_offset = htole64(h);
2196 items[i].hash = u->data.hash;
2198 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2203 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
// Fill in unset journal size limits and clamp the rest to sane bounds.
//
// A field equal to (uint64_t) -1 counts as unset. The file system size
// is taken from fstatvfs() on fd as f_frsize times f_blocks and stays 0
// if that call fails.
//
// max_use: when unset, 10 percent of the file system size, clamped to
// the DEFAULT_MAX_USE_LOWER and DEFAULT_MAX_USE_UPPER bounds, or
// DEFAULT_MAX_USE_LOWER as a fallback. When set, page aligned and raised
// to at least twice JOURNAL_FILE_SIZE_MIN.
// max_size: when unset, one eighth of max_use capped at
// DEFAULT_MAX_SIZE_UPPER. When set, page aligned. In both cases it is
// raised to at least JOURNAL_FILE_SIZE_MIN, and max_use is raised to at
// least twice max_size.
// min_size: when unset, JOURNAL_FILE_SIZE_MIN. When set, page aligned and
// raised to at least JOURNAL_FILE_SIZE_MIN. max_size is raised to
// min_size if smaller.
// keep_free: when unset, 5 percent of the file system size capped at
// DEFAULT_KEEP_FREE_UPPER, or DEFAULT_KEEP_FREE as a fallback.
//
// The resulting values are logged at info level.
2206 void journal_default_metrics(JournalMetrics *m, int fd) {
2207 uint64_t fs_size = 0;
2209 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2214 if (fstatvfs(fd, &ss) >= 0)
2215 fs_size = ss.f_frsize * ss.f_blocks;
2217 if (m->max_use == (uint64_t) -1) {
2220 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2222 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2223 m->max_use = DEFAULT_MAX_USE_UPPER;
2225 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2226 m->max_use = DEFAULT_MAX_USE_LOWER;
2228 m->max_use = DEFAULT_MAX_USE_LOWER;
2230 m->max_use = PAGE_ALIGN(m->max_use);
2232 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2233 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2236 if (m->max_size == (uint64_t) -1) {
2237 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2239 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2240 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2242 m->max_size = PAGE_ALIGN(m->max_size);
2244 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2245 m->max_size = JOURNAL_FILE_SIZE_MIN;
2247 if (m->max_size*2 > m->max_use)
2248 m->max_use = m->max_size*2;
2250 if (m->min_size == (uint64_t) -1)
2251 m->min_size = JOURNAL_FILE_SIZE_MIN;
2253 m->min_size = PAGE_ALIGN(m->min_size);
2255 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2256 m->min_size = JOURNAL_FILE_SIZE_MIN;
2258 if (m->min_size > m->max_size)
2259 m->max_size = m->min_size;
2262 if (m->keep_free == (uint64_t) -1) {
2265 m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2267 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2268 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2271 m->keep_free = DEFAULT_KEEP_FREE;
2274 log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2275 format_bytes(a, sizeof(a), m->max_use),
2276 format_bytes(b, sizeof(b), m->max_size),
2277 format_bytes(c, sizeof(c), m->min_size),
2278 format_bytes(d, sizeof(d), m->keep_free));