1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
30 #include "journal-def.h"
31 #include "journal-file.h"
/* Sizes in bytes of the data and field hash tables set up in newly created files */
35 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*16ULL)
36 #define DEFAULT_FIELD_HASH_TABLE_SIZE (2047ULL*16ULL)
/* Mapping requests smaller than this many bytes are widened to it in journal_file_move_to() */
38 #define DEFAULT_WINDOW_SIZE (8ULL*1024ULL*1024ULL)
/* Data payloads of at least this many bytes are candidates for compression */
40 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
42 /* This is the minimum journal file size */
43 #define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL) /* 64 KiB */
45 /* These are the lower and upper bounds if we deduce the max_use value
46 * from the file system size */
47 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
48 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
50 /* This is the upper bound if we deduce max_size from max_use */
51 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
53 /* This is the upper bound if we deduce the keep_free value from the
55 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
57 /* This is the keep_free value when we can't determine the system
59 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MiB */
/* Magic bytes of a journal file: copied into new headers and compared on verification */
61 static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };
/* Rounds x up to the next multiple of 8 */
63 #define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
/* Tears down an open journal file. As far as visible here: a writable file with
 * a mapped header is flagged STATE_OFFLINE, every mapped window is unmapped,
 * the file descriptor is closed and the compression buffer is freed. */
65 void journal_file_close(JournalFile *f) {
70 if (f->header && f->writable)
71 f->header->state = STATE_OFFLINE;
74 for (t = 0; t < _WINDOW_MAX; t++)
75 if (f->windows[t].ptr)
76 munmap(f->windows[t].ptr, f->windows[t].size);
79 close_nointr_nofail(f->fd);
84 free(f->compress_buffer);
/* Writes an initial header to offset 0 of the file with pwrite(). The header
 * gets the magic signature, a 64-bit aligned arena offset and a randomized
 * file id. The sequence number id and sequence number are either copied from
 * the header of template or the sequence number id is set to the new file id;
 * presumably the choice depends on whether a template was passed (TODO
 * confirm). */
90 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
98 memcpy(h.signature, signature, 8);
99 h.arena_offset = htole64(ALIGN64(sizeof(h)));
101 r = sd_id128_randomize(&h.file_id);
106 h.seqnum_id = template->header->seqnum_id;
107 h.seqnum = template->header->seqnum;
109 h.seqnum_id = h.file_id;
111 k = pwrite(f->fd, &h, sizeof(h), 0);
/* Updates the mapped header for the current session: stores the machine id,
 * compares the current boot id with the one already in the header (a match
 * marks the tail monotonic timestamp as valid), stores the current boot id,
 * sets the state to STATE_ONLINE and finally issues a full memory barrier. */
121 static int journal_file_refresh_header(JournalFile *f) {
127 r = sd_id128_get_machine(&f->header->machine_id);
131 r = sd_id128_get_boot(&boot_id);
135 if (sd_id128_equal(boot_id, f->header->boot_id))
136 f->tail_entry_monotonic_valid = true;
138 f->header->boot_id = boot_id;
140 f->header->state = STATE_ONLINE;
142 __sync_synchronize();
/* Sanity-checks the mapped header of an existing file. Visible checks: the
 * magic signature, the incompatible feature flags (-EPROTONOSUPPORT on unknown
 * flags), that arena offset plus arena size fit into the file size recorded in
 * last_stat, that the machine id matches the local one, and the stored state,
 * where an online or unknown state is only logged.
 *
 * Two variants of the flag check are visible; presumably one is selected at
 * build time depending on compression support (TODO confirm).
 *
 * NOTE(review): incompatible_flags is converted with le64toh() here, while
 * journal_file_append_data() uses le32toh()/htole32() on the same field.
 * Confirm the field width and make both sites agree. */
147 static int journal_file_verify_header(JournalFile *f) {
150 if (memcmp(f->header, signature, 8))
154 if ((le64toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
155 return -EPROTONOSUPPORT;
157 if (f->header->incompatible_flags != 0)
158 return -EPROTONOSUPPORT;
161 if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->arena_offset) + le64toh(f->header->arena_size)))
166 sd_id128_t machine_id;
169 r = sd_id128_get_machine(&machine_id);
173 if (!sd_id128_equal(machine_id, f->header->machine_id))
176 state = f->header->state;
178 if (state == STATE_ONLINE)
179 log_debug("Journal file %s is already online. Assuming unclean closing. Ignoring.", f->path);
180 else if (state == STATE_ARCHIVED)
182 else if (state != STATE_OFFLINE)
183 log_debug("Journal file %s has unknown state %u. Ignoring.", f->path, state);
/* Makes sure the file is large enough to hold size bytes at offset. The new
 * size is page-aligned and never smaller than the arena offset; nothing is
 * done when it does not exceed the current arena end. Growth is checked
 * against metrics.max_size and, above metrics.min_size, against the free
 * space reported by fstatvfs() minus metrics.keep_free. The additional range
 * is then reserved with posix_fallocate(), last_stat is refreshed and the
 * arena size in the header is updated.
 *
 * NOTE(review): posix_fallocate() reports failure by returning a positive
 * error number, not -1, so the comparison with < 0 below cannot detect a
 * failed allocation. Confirm and test for a non-zero result instead. */
189 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
190 uint64_t old_size, new_size;
194 /* We assume that this file is not sparse, and we know that
195 * for sure, since we always call posix_fallocate()
199 le64toh(f->header->arena_offset) +
200 le64toh(f->header->arena_size);
202 new_size = PAGE_ALIGN(offset + size);
203 if (new_size < le64toh(f->header->arena_offset))
204 new_size = le64toh(f->header->arena_offset);
206 if (new_size <= old_size)
209 if (f->metrics.max_size > 0 &&
210 new_size > f->metrics.max_size)
213 if (new_size > f->metrics.min_size &&
214 f->metrics.keep_free > 0) {
217 if (fstatvfs(f->fd, &svfs) >= 0) {
220 available = svfs.f_bfree * svfs.f_bsize;
222 if (available >= f->metrics.keep_free)
223 available -= f->metrics.keep_free;
227 if (new_size - old_size > available)
232 /* Note that the glibc fallocate() fallback is very
233 inefficient, hence we try to minimize the allocation area
235 if (posix_fallocate(f->fd, old_size, new_size - old_size) < 0)
238 if (fstat(f->fd, &f->last_stat) < 0)
241 f->header->arena_size = htole64(new_size - le64toh(f->header->arena_offset));
// Maps the page-aligned region covering size bytes at offset with mmap(),
// using the protection stored in f->prot, and hands back through ret a
// pointer to the requested offset inside that mapping. Returns -EADDRNOTAVAIL
// when the window would reach past the page-aligned file size in last_stat.
246 static int journal_file_map(
255 uint64_t woffset, wsize;
262 woffset = offset & ~((uint64_t) page_size() - 1ULL);
263 wsize = size + (offset - woffset);
264 wsize = PAGE_ALIGN(wsize);
266 /* Avoid SIGBUS on invalid accesses */
267 if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size))
268 return -EADDRNOTAVAIL;
270 window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
271 if (window == MAP_FAILED)
283 *ret = (uint8_t*) window + (offset - woffset);
/* Returns in *ret a pointer to size bytes at file offset offset, using the
 * cached mapping window with index wt. A range beyond the known file size
 * triggers one fstat() refresh before failing with -EADDRNOTAVAIL. If the
 * cached window already covers the range it is reused; otherwise it is
 * unmapped and a new one is created through journal_file_map(). Requests
 * smaller than DEFAULT_WINDOW_SIZE are widened to that size and clamped to
 * the end of the file. */
288 static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
297 assert(wt < _WINDOW_MAX);
299 if (offset + size > (uint64_t) f->last_stat.st_size) {
300 /* Hmm, out of range? Let's refresh the fstat() data
301 * first, before we trust that check. */
303 if (fstat(f->fd, &f->last_stat) < 0 ||
304 offset + size > (uint64_t) f->last_stat.st_size)
305 return -EADDRNOTAVAIL;
310 if (_likely_(w->ptr &&
311 w->offset <= offset &&
312 w->offset + w->size >= offset + size)) {
314 *ret = (uint8_t*) w->ptr + (offset - w->offset);
319 if (munmap(w->ptr, w->size) < 0)
323 w->size = w->offset = 0;
326 if (size < DEFAULT_WINDOW_SIZE) {
327 /* If the default window size is larger than what was
328 * asked for extend the mapping a bit in the hope to
329 * minimize needed remappings later on. We add half
330 * the window space before and half behind the
331 * requested mapping */
333 delta = (DEFAULT_WINDOW_SIZE - size) / 2;
339 size = DEFAULT_WINDOW_SIZE;
343 if (offset + size > (uint64_t) f->last_stat.st_size)
344 size = (uint64_t) f->last_stat.st_size - offset;
347 return -EADDRNOTAVAIL;
349 r = journal_file_map(f,
351 &w->ptr, &w->offset, &w->size,
357 *ret = (uint8_t*) p + delta;
/* Recomputes the payload hash of an object with hash64() next to the hash
 * stored in it. Handled are uncompressed data objects and field objects; the
 * payload length is the object size minus the offset of the payload member.
 * Presumably the result tells whether both hashes agree (TODO confirm). */
361 static bool verify_hash(Object *o) {
366 if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
367 h1 = le64toh(o->data.hash);
368 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
369 } else if (o->object.type == OBJECT_FIELD) {
370 h1 = le64toh(o->field.hash);
371 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
/* Maps the object located at offset. First only the object header is mapped
 * (through the window of the expected type, or WINDOW_UNKNOWN when type is
 * negative) so that its size can be read. The size must be at least
 * sizeof(ObjectHeader) and, when type is non-negative, the stored type must
 * match. Objects larger than the bare header are then mapped a second time
 * with their full size. */
378 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
386 assert(type < _OBJECT_TYPE_MAX);
388 r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
393 s = le64toh(o->object.size);
395 if (s < sizeof(ObjectHeader))
398 if (type >= 0 && o->object.type != type)
401 if (s > sizeof(ObjectHeader)) {
402 r = journal_file_move_to(f, o->object.type, offset, s, &t);
/* Produces the next entry sequence number: one more than the value stored in
 * the header, adjusted against the optional external counter seqnum as the
 * comment below describes. The result is written back to the header and also
 * becomes first_seqnum when that field is still zero. */
416 static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
421 r = le64toh(f->header->seqnum) + 1;
424 /* If an external seqnum counter was passed, we update
425 * both the local and the external one, and set it to
426 * the maximum of both */
434 f->header->seqnum = htole64(r);
436 if (f->header->first_seqnum == 0)
437 f->header->first_seqnum = htole64(r);
/* Appends a new object of the given type and size (at least
 * sizeof(ObjectHeader)) to the file. The position is derived from the tail
 * object offset in the header: either the start of the arena, or the 64-bit
 * aligned end of the current tail object. Space is reserved with
 * journal_file_allocate(), the object is mapped, its type and size are
 * filled in, and the header tail offset and object counter are updated. */
442 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
449 assert(size >= sizeof(ObjectHeader));
453 p = le64toh(f->header->tail_object_offset);
455 p = le64toh(f->header->arena_offset);
457 r = journal_file_move_to_object(f, -1, p, &tail);
461 p += ALIGN64(le64toh(tail->object.size));
464 r = journal_file_allocate(f, p, size);
468 r = journal_file_move_to(f, type, p, size, &t);
475 o->object.type = type;
476 o->object.size = htole64(size);
478 f->header->tail_object_offset = htole64(p);
479 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
/* Creates the data hash table of a new file: appends an
 * OBJECT_DATA_HASH_TABLE object with DEFAULT_DATA_HASH_TABLE_SIZE bytes of
 * items, zeroes the items and records offset and size of the item area in the
 * header. */
487 static int journal_file_setup_data_hash_table(JournalFile *f) {
494 s = DEFAULT_DATA_HASH_TABLE_SIZE;
495 r = journal_file_append_object(f,
496 OBJECT_DATA_HASH_TABLE,
497 offsetof(Object, hash_table.items) + s,
502 memset(o->hash_table.items, 0, s);
504 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
505 f->header->data_hash_table_size = htole64(s);
/* Creates the field hash table of a new file: appends an
 * OBJECT_FIELD_HASH_TABLE object with DEFAULT_FIELD_HASH_TABLE_SIZE bytes of
 * items, zeroes the items and records offset and size of the item area in the
 * header. */
510 static int journal_file_setup_field_hash_table(JournalFile *f) {
517 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
518 r = journal_file_append_object(f,
519 OBJECT_FIELD_HASH_TABLE,
520 offsetof(Object, hash_table.items) + s,
525 memset(o->hash_table.items, 0, s);
527 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
528 f->header->field_hash_table_size = htole64(s);
/* Maps the data hash table, whose offset and size are read from the header,
 * through the WINDOW_DATA_HASH_TABLE window and caches the resulting pointer
 * in f->data_hash_table. */
533 static int journal_file_map_data_hash_table(JournalFile *f) {
540 p = le64toh(f->header->data_hash_table_offset);
541 s = le64toh(f->header->data_hash_table_size);
543 r = journal_file_move_to(f,
544 WINDOW_DATA_HASH_TABLE,
550 f->data_hash_table = t;
/* Maps the field hash table, whose offset and size are read from the header,
 * through the WINDOW_FIELD_HASH_TABLE window and caches the resulting pointer
 * in f->field_hash_table. */
554 static int journal_file_map_field_hash_table(JournalFile *f) {
561 p = le64toh(f->header->field_hash_table_offset);
562 s = le64toh(f->header->field_hash_table_size);
564 r = journal_file_move_to(f,
565 WINDOW_FIELD_HASH_TABLE,
571 f->field_hash_table = t;
/* Hooks a freshly appended data object (o, located at offset) into the data
 * hash table. The link fields of the object are reset, the bucket is chosen
 * as hash modulo the number of HashItem slots, and the object either becomes
 * the bucket head or is chained behind an existing object through
 * next_hash_offset. The bucket tail pointer is set to the new object.
 *
 * NOTE(review): the existing chain is entered through head_hash_offset, yet
 * the comment speaks of the previous object and it is tail_hash_offset that
 * gets updated afterwards. With more than two objects in a bucket this would
 * overwrite the link of the head object; reading tail_hash_offset looks
 * intended. Confirm. */
575 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
582 assert(o->object.type == OBJECT_DATA);
584 /* This might alter the window we are looking at */
586 o->data.next_hash_offset = o->data.next_field_offset = 0;
587 o->data.entry_offset = o->data.entry_array_offset = 0;
588 o->data.n_entries = 0;
590 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
591 p = le64toh(f->data_hash_table[h].head_hash_offset);
593 /* Only entry in the hash table is easy */
594 f->data_hash_table[h].head_hash_offset = htole64(offset);
596 /* Move back to the previous data object, to patch in
599 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
603 o->data.next_hash_offset = htole64(offset);
606 f->data_hash_table[h].tail_hash_offset = htole64(offset);
// Looks up the data object whose payload equals the size bytes at data, given
// the precomputed hash of that payload. The bucket chain is walked from
// head_hash_offset along next_hash_offset. Objects with a different stored
// hash are skipped. Compressed objects are uncompressed into
// f->compress_buffer before the payload is compared; uncompressed ones are
// compared directly after a size check. One path returns -EPROTONOSUPPORT,
// presumably when compression support is not built in (TODO confirm).
611 int journal_file_find_data_object_with_hash(
613 const void *data, uint64_t size, uint64_t hash,
614 Object **ret, uint64_t *offset) {
616 uint64_t p, osize, h;
620 assert(data || size == 0);
622 osize = offsetof(Object, data.payload) + size;
624 if (f->header->data_hash_table_size == 0)
627 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
628 p = le64toh(f->data_hash_table[h].head_hash_offset);
633 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
637 if (le64toh(o->data.hash) != hash)
640 if (o->object.flags & OBJECT_COMPRESSED) {
644 l = le64toh(o->object.size);
645 if (l <= offsetof(Object, data.payload))
648 l -= offsetof(Object, data.payload);
650 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
654 memcmp(f->compress_buffer, data, size) == 0) {
665 return -EPROTONOSUPPORT;
668 } else if (le64toh(o->object.size) == osize &&
669 memcmp(o->data.payload, data, size) == 0) {
681 p = le64toh(o->data.next_hash_offset);
// Convenience wrapper: computes hash64() over the payload and delegates the
// lookup to journal_file_find_data_object_with_hash().
687 int journal_file_find_data_object(
689 const void *data, uint64_t size,
690 Object **ret, uint64_t *offset) {
695 assert(data || size == 0);
697 hash = hash64(data, size);
699 return journal_file_find_data_object_with_hash(f,
// Stores a data payload in the file. The payload is hashed and first looked
// up with journal_file_find_data_object_with_hash(). A new OBJECT_DATA object
// sized for the uncompressed payload is appended and its hash is stored.
// Payloads of at least COMPRESSION_SIZE_THRESHOLD bytes may be compressed, in
// which case the object size is shrunk, OBJECT_COMPRESSED is set on the object
// and HEADER_INCOMPATIBLE_COMPRESSED on the file header; otherwise the payload
// is copied verbatim. The object is then linked into the hash table and mapped
// again because linking may have moved the window.
704 static int journal_file_append_data(
706 const void *data, uint64_t size,
707 Object **ret, uint64_t *offset) {
713 bool compressed = false;
716 assert(data || size == 0);
718 hash = hash64(data, size);
720 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
734 osize = offsetof(Object, data.payload) + size;
735 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
739 o->data.hash = htole64(hash);
743 size >= COMPRESSION_SIZE_THRESHOLD) {
746 compressed = compress_blob(data, size, o->data.payload, &rsize);
749 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
750 o->object.flags |= OBJECT_COMPRESSED;
752 f->header->incompatible_flags = htole32(le32toh(f->header->incompatible_flags) | HEADER_INCOMPATIBLE_COMPRESSED);
754 log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
760 memcpy(o->data.payload, data, size);
762 r = journal_file_link_data(f, o, p, hash);
766 /* The linking might have altered the window, so let's
767 * refresh our pointer */
768 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* Returns how many EntryItem slots an entry object holds, derived from the
 * object size minus the offset of the items member. */
781 uint64_t journal_file_entry_n_items(Object *o) {
783 assert(o->object.type == OBJECT_ENTRY);
785 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
/* Returns how many 64-bit offsets an entry array object can hold, derived from
 * the object size minus the offset of the items member. */
788 static uint64_t journal_file_entry_array_n_items(Object *o) {
790 assert(o->object.type == OBJECT_ENTRY_ARRAY);
792 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
/* Records entry offset p at logical index *idx of a chain of entry array
 * objects. The chain is followed through next_entry_array_offset; when an
 * existing array holds the slot, the offset is stored there. Otherwise a new
 * OBJECT_ENTRY_ARRAY object with room for n items is appended, the offset is
 * stored in it and the previous array (at ap) is pointed at the new one. In
 * both cases *idx is incremented by one. */
795 static int link_entry_into_array(JournalFile *f,
800 uint64_t n = 0, ap = 0, q, i, a, hidx;
809 i = hidx = le64toh(*idx);
812 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
816 n = journal_file_entry_array_n_items(o);
818 o->entry_array.items[i] = htole64(p);
819 *idx = htole64(hidx + 1);
825 a = le64toh(o->entry_array.next_entry_array_offset);
836 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
837 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
842 o->entry_array.items[i] = htole64(p);
847 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
851 o->entry_array.next_entry_array_offset = htole64(q);
854 *idx = htole64(hidx + 1);
/* Variant of link_entry_into_array() for lists whose index is shifted by one:
 * the array chain is addressed with *idx - 1 through a local copy, and *idx
 * itself is incremented afterwards. Presumably the first element of such a
 * list lives outside the array chain (TODO confirm). */
859 static int link_entry_into_array_plus_one(JournalFile *f,
878 i = htole64(le64toh(*idx) - 1);
879 r = link_entry_into_array(f, first, &i, p);
884 *idx = htole64(le64toh(*idx) + 1);
/* Links item i of entry o into the entry list of the data object that the
 * item refers to: the data object is mapped by its offset and the list rooted
 * at its entry_offset and entry_array_offset fields is extended. */
888 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
895 p = le64toh(o->entry.items[i].object_offset);
899 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
903 return link_entry_into_array_plus_one(f,
904 &o->data.entry_offset,
905 &o->data.entry_array_offset,
/* Makes a newly written entry reachable. After a memory barrier the entry is
 * added to the global entry array rooted in the header, the head and tail
 * timestamps in the header are updated (the head realtime only if still
 * zero), the tail monotonic timestamp is marked valid, and every item of the
 * entry is linked to its data object. */
910 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
917 assert(o->object.type == OBJECT_ENTRY);
919 __sync_synchronize();
921 /* Link up the entry itself */
922 r = link_entry_into_array(f,
923 &f->header->entry_array_offset,
924 &f->header->n_entries,
929 /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
931 if (f->header->head_entry_realtime == 0)
932 f->header->head_entry_realtime = o->entry.realtime;
934 f->header->tail_entry_realtime = o->entry.realtime;
935 f->header->tail_entry_monotonic = o->entry.monotonic;
937 f->tail_entry_monotonic_valid = true;
939 /* Link up the items */
940 n = journal_file_entry_n_items(o);
941 for (i = 0; i < n; i++) {
942 r = journal_file_link_entry_item(f, o, offset, i);
/* Appends an OBJECT_ENTRY object sized for n_items items, fills in sequence
 * number, items, both timestamps, the xor hash and the boot id taken from the
 * header, and links the entry with journal_file_link_entry(). */
950 static int journal_file_append_entry_internal(
952 const dual_timestamp *ts,
954 const EntryItem items[], unsigned n_items,
956 Object **ret, uint64_t *offset) {
963 assert(items || n_items == 0);
966 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
968 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
972 o->entry.seqnum = htole64(journal_file_seqnum(f, seqnum));
973 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
974 o->entry.realtime = htole64(ts->realtime);
975 o->entry.monotonic = htole64(ts->monotonic);
976 o->entry.xor_hash = htole64(xor_hash);
977 o->entry.boot_id = f->header->boot_id;
979 r = journal_file_link_entry(f, o, np);
/* Notifies inotify watchers about modifications made through the memory
 * mapping: after a memory barrier the file is truncated to the size recorded
 * in last_stat, which leaves the content alone. Failure is only logged. */
992 void journal_file_post_change(JournalFile *f) {
995 /* inotify() does not receive IN_MODIFY events from file
996 * accesses done via mmap(). After each access we hence
997 * trigger IN_MODIFY by truncating the journal file to its
998 * current size which triggers IN_MODIFY. */
1000 __sync_synchronize();
1002 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1003 log_error("Failed to to truncate file to its own size: %m");
/* Appends one entry made of n_iovec fields. A local timestamp is taken with
 * dual_timestamp_get(), presumably when the caller passed none (TODO
 * confirm). A monotonic timestamp older than the tail entry of the file is
 * checked for. Each iovec is stored as a data object; its offset and hash go
 * into a stack-allocated item array and the hashes are xor-ed together. The
 * entry object itself is then written by journal_file_append_entry_internal()
 * and journal_file_post_change() is called. */
1006 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1010 uint64_t xor_hash = 0;
1011 struct dual_timestamp _ts;
1014 assert(iovec || n_iovec == 0);
1020 dual_timestamp_get(&_ts);
1024 if (f->tail_entry_monotonic_valid &&
1025 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1028 items = alloca(sizeof(EntryItem) * n_iovec);
1030 for (i = 0; i < n_iovec; i++) {
1034 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1038 xor_hash ^= le64toh(o->data.hash);
1039 items[i].object_offset = htole64(p);
1040 items[i].hash = o->data.hash;
1043 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1045 journal_file_post_change(f);
/* Fetches the entry stored at a logical index of an entry array chain. The
 * chain is followed through next_entry_array_offset until the array holding
 * the index is found, the entry offset is read from it and the entry object
 * at that offset is mapped. */
1050 static int generic_array_get(JournalFile *f,
1053 Object **ret, uint64_t *offset) {
1065 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1069 n = journal_file_entry_array_n_items(o);
1071 p = le64toh(o->entry_array.items[i]);
1076 a = le64toh(o->entry_array.next_entry_array_offset);
1079 if (a <= 0 || p <= 0)
1082 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Variant of generic_array_get() for lists with one element (extra) kept
 * outside the array chain: that element is mapped directly as an entry
 * object, any other index is looked up in the chain at i-1. */
1095 static int generic_array_get_plus_one(JournalFile *f,
1099 Object **ret, uint64_t *offset) {
1108 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1121 return generic_array_get(f, first, i-1, ret, offset);
/* Searches an entry array chain for needle using the comparison callback
 * test_object. Array by array, one item is tested first and, depending on the
 * result, the array is bisected between left and right or the search moves on
 * to the next array. A TEST_FOUND result is folded into TEST_RIGHT or
 * TEST_LEFT depending on the search direction. For DIRECTION_UP the match may
 * be the item before the position found (subtract_one). The entry finally
 * selected is mapped, and the index written to *idx is t + i, less one when
 * subtract_one is set. */
1130 static int generic_array_bisect(JournalFile *f,
1134 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1135 direction_t direction,
1140 uint64_t a, p, t = 0, i = 0, last_p = 0;
1141 bool subtract_one = false;
1142 Object *o, *array = NULL;
1146 assert(test_object);
1150 uint64_t left, right, k, lp;
1152 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1156 k = journal_file_entry_array_n_items(array);
1162 lp = p = le64toh(array->entry_array.items[i]);
1166 r = test_object(f, p, needle);
1170 if (r == TEST_FOUND)
1171 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1173 if (r == TEST_RIGHT) {
1177 if (left == right) {
1178 if (direction == DIRECTION_UP)
1179 subtract_one = true;
1185 assert(left < right);
1187 i = (left + right) / 2;
1188 p = le64toh(array->entry_array.items[i]);
1192 r = test_object(f, p, needle);
1196 if (r == TEST_FOUND)
1197 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1199 if (r == TEST_RIGHT)
1213 a = le64toh(array->entry_array.next_entry_array_offset);
1219 if (subtract_one && t == 0 && i == 0)
1222 if (subtract_one && i == 0)
1224 else if (subtract_one)
1225 p = le64toh(array->entry_array.items[i-1]);
1227 p = le64toh(array->entry_array.items[i]);
1229 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1240 *idx = t + i - (subtract_one ? 1 : 0);
/* Variant of generic_array_bisect() for lists with one element (extra) kept
 * outside the array chain. The extra element is tested against needle first;
 * on TEST_FOUND it is mapped as an entry object. Otherwise the array chain is
 * bisected with n-1 as its element count. */
1245 static int generic_array_bisect_plus_one(JournalFile *f,
1250 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1251 direction_t direction,
1259 assert(test_object);
1264 /* This bisects the array in object 'first', but first checks
1266 r = test_object(f, extra, needle);
1269 else if (r == TEST_FOUND) {
1272 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1286 } else if (r == TEST_RIGHT)
1289 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
// Bisection callback: maps the entry at offset p and compares its sequence
// number with needle (equal, or smaller than needle).
1297 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1304 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1308 if (le64toh(o->entry.seqnum) == needle)
1310 else if (le64toh(o->entry.seqnum) < needle)
// Seeks within the global entry array of the file, whose root offset and
// entry count come from the header, by bisecting it.
1316 int journal_file_move_to_entry_by_seqnum(
1319 direction_t direction,
1323 return generic_array_bisect(f,
1324 le64toh(f->header->entry_array_offset),
1325 le64toh(f->header->n_entries),
// Bisection callback: maps the entry at offset p and compares its realtime
// timestamp with needle (equal, or smaller than needle).
1332 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1339 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1343 if (le64toh(o->entry.realtime) == needle)
1345 else if (le64toh(o->entry.realtime) < needle)
// Seeks within the global entry array of the file, whose root offset and
// entry count come from the header, by bisecting it with
// test_object_realtime as comparison callback.
1351 int journal_file_move_to_entry_by_realtime(
1354 direction_t direction,
1358 return generic_array_bisect(f,
1359 le64toh(f->header->entry_array_offset),
1360 le64toh(f->header->n_entries),
1362 test_object_realtime,
// Bisection callback: maps the entry at offset p and compares its monotonic
// timestamp with needle (equal, or smaller than needle).
1367 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1374 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1378 if (le64toh(o->entry.monotonic) == needle)
1380 else if (le64toh(o->entry.monotonic) < needle)
// Seeks by monotonic timestamp within one boot. The field string for the boot
// id is assembled in t, its data object is looked up with
// journal_file_find_data_object(), and the entries linked to that data object
// are bisected with test_object_monotonic as comparison callback.
//
// NOTE(review): the prefix literal _BOOT_ID= is nine characters long, yet the
// buffer reserves eight for it and the id string is written at t + 8, which
// overwrites the equals sign. The lookup string then never matches a stored
// field. Looks like 9 was intended in both places; confirm.
1386 int journal_file_move_to_entry_by_monotonic(
1390 direction_t direction,
1394 char t[8+32+1] = "_BOOT_ID=";
1398 sd_id128_to_string(boot_id, t + 8);
1400 r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1406 return generic_array_bisect_plus_one(f,
1407 le64toh(o->data.entry_offset),
1408 le64toh(o->data.entry_array_offset),
1409 le64toh(o->data.n_entries),
1411 test_object_monotonic,
// Bisection callback that compares the entry offset p itself with needle; no
// object needs to be mapped for this.
1416 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1422 else if (p < needle)
// Steps to the neighbouring entry in the given direction. Without a current
// entry the walk starts at index 0 for DIRECTION_DOWN or at the last index
// otherwise. With a current entry o (which must be an OBJECT_ENTRY located at
// offset p) its index is determined by bisecting the global entry array and
// then adjusted according to the direction. The entry at the resulting index
// is fetched with generic_array_get().
1428 int journal_file_next_entry(
1430 Object *o, uint64_t p,
1431 direction_t direction,
1432 Object **ret, uint64_t *offset) {
1438 assert(p > 0 || !o);
1440 n = le64toh(f->header->n_entries);
1445 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1447 if (o->object.type != OBJECT_ENTRY)
1450 r = generic_array_bisect(f,
1451 le64toh(f->header->entry_array_offset),
1452 le64toh(f->header->n_entries),
1461 if (direction == DIRECTION_DOWN) {
1474 /* And jump to it */
1475 return generic_array_get(f,
1476 le64toh(f->header->entry_array_offset),
/* Moves a signed number of entries (skip) away from entry o at offset p. The
 * index of the current entry is determined by bisecting the global entry
 * array; a negative skip is subtracted from it (with a check against going
 * below the start), a positive one is added. The entry at the new index is
 * fetched with generic_array_get(). */
1481 int journal_file_skip_entry(
1483 Object *o, uint64_t p,
1485 Object **ret, uint64_t *offset) {
1494 if (o->object.type != OBJECT_ENTRY)
1497 r = generic_array_bisect(f,
1498 le64toh(f->header->entry_array_offset),
1499 le64toh(f->header->n_entries),
1508 /* Calculate new index */
1510 if ((uint64_t) -skip >= i)
1513 i = i - (uint64_t) -skip;
1515 i += (uint64_t) skip;
1517 n = le64toh(f->header->n_entries);
1524 return generic_array_get(f,
1525 le64toh(f->header->entry_array_offset),
/* Like journal_file_next_entry(), but restricted to the entries linked to the
 * data object at data_offset. That object is mapped first; its n_entries,
 * entry_offset and entry_array_offset fields describe the list that is
 * bisected and then indexed with generic_array_get_plus_one(). */
1530 int journal_file_next_entry_for_data(
1532 Object *o, uint64_t p,
1533 uint64_t data_offset,
1534 direction_t direction,
1535 Object **ret, uint64_t *offset) {
1542 assert(p > 0 || !o);
1544 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1548 n = le64toh(d->data.n_entries);
1553 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1555 if (o->object.type != OBJECT_ENTRY)
1558 r = generic_array_bisect_plus_one(f,
1559 le64toh(d->data.entry_offset),
1560 le64toh(d->data.entry_array_offset),
1561 le64toh(d->data.n_entries),
1571 if (direction == DIRECTION_DOWN) {
1585 return generic_array_get_plus_one(f,
1586 le64toh(d->data.entry_offset),
1587 le64toh(d->data.entry_array_offset),
/* Seeks within the entries linked to the data object at data_offset: the data
 * object is mapped and its entry list is bisected with
 * generic_array_bisect_plus_one(). */
1592 int journal_file_move_to_entry_by_seqnum_for_data(
1594 uint64_t data_offset,
1596 direction_t direction,
1597 Object **ret, uint64_t *offset) {
1602 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1606 return generic_array_bisect_plus_one(f,
1607 le64toh(d->data.entry_offset),
1608 le64toh(d->data.entry_array_offset),
1609 le64toh(d->data.n_entries),
/* Seeks by realtime timestamp within the entries linked to the data object at
 * data_offset: the data object is mapped and its entry list is bisected with
 * test_object_realtime as comparison callback. */
1616 int journal_file_move_to_entry_by_realtime_for_data(
1618 uint64_t data_offset,
1620 direction_t direction,
1621 Object **ret, uint64_t *offset) {
1626 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1630 return generic_array_bisect_plus_one(f,
1631 le64toh(d->data.entry_offset),
1632 le64toh(d->data.entry_array_offset),
1633 le64toh(d->data.n_entries),
1635 test_object_realtime,
/* Debugging aid: prints header fields (ids, arena size, object and entry
 * counts) to stdout and then walks all objects from the start of the arena,
 * printing the type of each one (plus sequence number and timestamps for
 * entries, and a flag line for compressed objects). The walk advances by the
 * 64-bit aligned object size and looks for the tail object offset; a failure
 * along the way is reported as file corruption. */
1640 void journal_file_dump(JournalFile *f) {
1641 char a[33], b[33], c[33];
1648 printf("File Path: %s\n"
1652 "Arena size: %llu\n"
1656 sd_id128_to_string(f->header->file_id, a),
1657 sd_id128_to_string(f->header->machine_id, b),
1658 sd_id128_to_string(f->header->boot_id, c),
1659 (unsigned long long) le64toh(f->header->arena_size),
1660 (unsigned long) le64toh(f->header->n_objects),
1661 (unsigned long) le64toh(f->header->n_entries));
1663 p = le64toh(f->header->arena_offset);
1665 r = journal_file_move_to_object(f, -1, p, &o);
1669 switch (o->object.type) {
1672 printf("Type: OBJECT_UNUSED\n");
1676 printf("Type: OBJECT_DATA\n");
1680 printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1681 (unsigned long long) le64toh(o->entry.seqnum),
1682 (unsigned long long) le64toh(o->entry.monotonic),
1683 (unsigned long long) le64toh(o->entry.realtime));
1686 case OBJECT_FIELD_HASH_TABLE:
1687 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1690 case OBJECT_DATA_HASH_TABLE:
1691 printf("Type: OBJECT_DATA_HASH_TABLE\n");
1694 case OBJECT_ENTRY_ARRAY:
1695 printf("Type: OBJECT_ENTRY_ARRAY\n");
1699 if (o->object.flags & OBJECT_COMPRESSED)
1700 printf("Flags: COMPRESSED\n");
1702 if (p == le64toh(f->header->tail_object_offset))
1705 p = p + ALIGN64(le64toh(o->object.size));
1710 log_error("File corrupt");
/* Opens or creates a journal file. Only O_RDONLY and O_RDWR access modes and
 * names ending in .journal are accepted. Metrics and the compress setting are
 * copied from template (presumably only when one is given; TODO confirm). An
 * empty file opened writable is treated as newly created and receives a fresh
 * header. A file shorter than the header is rejected. The header is mapped;
 * existing files are verified, and the header is refreshed (presumably for
 * writable files only). New files get their hash tables created, then both
 * hash tables are mapped. On failure the partially set up object is released
 * with journal_file_close(). */
1713 int journal_file_open(
1717 JournalFile *template,
1718 JournalFile **ret) {
1722 bool newly_created = false;
1726 if ((flags & O_ACCMODE) != O_RDONLY &&
1727 (flags & O_ACCMODE) != O_RDWR)
1730 if (!endswith(fname, ".journal"))
1733 f = new0(JournalFile, 1);
1740 f->writable = (flags & O_ACCMODE) != O_RDONLY;
1741 f->prot = prot_from_flags(flags);
1744 f->metrics = template->metrics;
1745 f->compress = template->compress;
1748 f->path = strdup(fname);
1754 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
1760 if (fstat(f->fd, &f->last_stat) < 0) {
1765 if (f->last_stat.st_size == 0 && f->writable) {
1766 newly_created = true;
1768 r = journal_file_init_header(f, template);
1772 if (fstat(f->fd, &f->last_stat) < 0) {
1778 if (f->last_stat.st_size < (off_t) sizeof(Header)) {
1783 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
1784 if (f->header == MAP_FAILED) {
1790 if (!newly_created) {
1791 r = journal_file_verify_header(f);
1797 r = journal_file_refresh_header(f);
1802 if (newly_created) {
1804 r = journal_file_setup_field_hash_table(f);
1808 r = journal_file_setup_data_hash_table(f);
1813 r = journal_file_map_field_hash_table(f);
1817 r = journal_file_map_data_hash_table(f);
1827 journal_file_close(f);
/* Archives the current journal file and starts a new one under the same name.
 * The file must be writable and named with the .journal suffix. The archive
 * name is the old path without the suffix, followed by the sequence number
 * id, the current sequence number and the tail realtime timestamp (both as 16
 * hex digits) and the suffix again. After the rename the old header is marked
 * STATE_ARCHIVED, a new file is opened at the old path with the old file as
 * template, and the old file is closed. */
1832 int journal_file_rotate(JournalFile **f) {
1835 JournalFile *old_file, *new_file = NULL;
1843 if (!old_file->writable)
1846 if (!endswith(old_file->path, ".journal"))
1849 l = strlen(old_file->path);
1851 p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
1855 memcpy(p, old_file->path, l - 8);
1857 sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
1858 snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
1859 "-%016llx-%016llx.journal",
1860 (unsigned long long) le64toh((*f)->header->seqnum),
1861 (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
1863 r = rename(old_file->path, p);
1869 old_file->header->state = STATE_ARCHIVED;
1871 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, old_file, &new_file);
1872 journal_file_close(old_file);
/* Opens a journal file and recovers from unusable files. Results other than
 * the four error codes listed below are passed through. Recovery is only
 * attempted for writable opens with O_CREAT: the file is renamed to a name
 * ending in .journal~ that embeds the current realtime clock, a warning is
 * logged, and the open is retried exactly once. */
1878 int journal_file_open_reliably(
1882 JournalFile *template,
1883 JournalFile **ret) {
1889 r = journal_file_open(fname, flags, mode, template, ret);
1890 if (r != -EBADMSG && /* corrupted */
1891 r != -ENODATA && /* truncated */
1892 r != -EHOSTDOWN && /* other machine */
1893 r != -EPROTONOSUPPORT) /* incompatible feature */
1896 if ((flags & O_ACCMODE) == O_RDONLY)
1899 if (!(flags & O_CREAT))
1902 /* The file is corrupted. Rotate it away and try it again (but only once) */
1905 if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
1907 (unsigned long long) now(CLOCK_REALTIME),
1911 r = rename(fname, p);
1916 log_warning("File %s corrupted, renaming and replacing.", fname);
1918 return journal_file_open(fname, flags, mode, template, ret);
/* Record describing one candidate file collected by journal_directory_vacuum()
 * and ordered by vacuum_compare(). */
1921 struct vacuum_info {
1926 sd_id128_t seqnum_id;
/* qsort() comparison for vacuum_info records. Two records that both carry a
 * sequence number from the same sequence number id are ordered by sequence
 * number. Otherwise the realtime timestamp decides, then the raw bytes of the
 * sequence number ids when both records have one, and finally the file
 * name. */
1932 static int vacuum_compare(const void *_a, const void *_b) {
1933 const struct vacuum_info *a, *b;
1938 if (a->have_seqnum && b->have_seqnum &&
1939 sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
1940 if (a->seqnum < b->seqnum)
1942 else if (a->seqnum > b->seqnum)
1948 if (a->realtime < b->realtime)
1950 else if (a->realtime > b->realtime)
1952 else if (a->have_seqnum && b->have_seqnum)
1953 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
1955 return strcmp(a->filename, b->filename);
/* Deletes old journal files from directory until the space used by the
 * candidates is at most max_use and the file system has at least min_free
 * bytes available.
 *
 * Candidates are regular files of two kinds, recognized by name: archived
 * files ending in .journal that embed a sequence number id, a sequence number
 * and a realtime timestamp, and corrupted files ending in .journal~ that embed
 * two 16 digit hex numbers, the first of which is taken as realtime
 * timestamp. Disk usage per file is taken from st_blocks in units of 512
 * bytes. The list is sorted with vacuum_compare() and files are unlinked in
 * that order; fstatvfs() is queried again before each deletion. A file that
 * has already disappeared (ENOENT) is ignored silently. */
1958 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
1961 struct vacuum_info *list = NULL;
1962 unsigned n_list = 0, n_allocated = 0, i;
1970 d = opendir(directory);
1976 struct dirent buf, *de;
1980 unsigned long long seqnum = 0, realtime;
1981 sd_id128_t seqnum_id;
1984 k = readdir_r(d, &buf, &de);
1993 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
1996 if (!S_ISREG(st.st_mode))
1999 q = strlen(de->d_name);
2001 if (endswith(de->d_name, ".journal")) {
2003 /* Vacuum archived files */
2005 if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
2008 if (de->d_name[q-8-16-1] != '-' ||
2009 de->d_name[q-8-16-1-16-1] != '-' ||
2010 de->d_name[q-8-16-1-16-1-32-1] != '@')
2013 p = strdup(de->d_name);
2019 de->d_name[q-8-16-1-16-1] = 0;
2020 if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
2025 if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
2032 } else if (endswith(de->d_name, ".journal~")) {
2033 unsigned long long tmp;
2035 /* Vacuum corrupted files */
2037 if (q < 1 + 16 + 1 + 16 + 8 + 1)
2040 if (de->d_name[q-1-8-16-1] != '-' ||
2041 de->d_name[q-1-8-16-1-16-1] != '@')
2044 p = strdup(de->d_name);
2050 if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
2055 have_seqnum = false;
2059 if (n_list >= n_allocated) {
2060 struct vacuum_info *j;
2062 n_allocated = MAX(n_allocated * 2U, 8U);
2063 j = realloc(list, n_allocated * sizeof(struct vacuum_info));
2073 list[n_list].filename = p;
2074 list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
2075 list[n_list].seqnum = seqnum;
2076 list[n_list].realtime = realtime;
2077 list[n_list].seqnum_id = seqnum_id;
2078 list[n_list].have_seqnum = have_seqnum;
2080 sum += list[n_list].usage;
2085 qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
2087 for(i = 0; i < n_list; i++) {
2090 if (fstatvfs(dirfd(d), &ss) < 0) {
2095 if (sum <= max_use &&
2096 (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
2099 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
2100 log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
2101 sum -= list[i].usage;
2102 } else if (errno != ENOENT)
2103 log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
2107 for (i = 0; i < n_list; i++)
2108 free(list[i].filename);
/* Copies entry o, located at offset p in file from, into file to. The
 * timestamps of the entry are checked against the tail entry of the target
 * file. For every item the referenced data object is mapped in the source
 * file, its stored hash is compared with the hash recorded in the item, the
 * payload is uncompressed into from->compress_buffer when needed, and the
 * payload is appended to the target with journal_file_append_data(). Because
 * these steps reuse o and may move the mapping window, the source entry is
 * mapped again at the end of each iteration. Finally the entry is written to
 * the target with journal_file_append_entry_internal(). One path returns
 * -EPROTONOSUPPORT, presumably when compression support is not built in
 * (TODO confirm). */
2118 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2120 uint64_t q, xor_hash = 0;
2133 ts.monotonic = le64toh(o->entry.monotonic);
2134 ts.realtime = le64toh(o->entry.realtime);
2136 if (to->tail_entry_monotonic_valid &&
2137 ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2140 if (ts.realtime < le64toh(to->header->tail_entry_realtime))
2143 n = journal_file_entry_n_items(o);
2144 items = alloca(sizeof(EntryItem) * n);
2146 for (i = 0; i < n; i++) {
2153 q = le64toh(o->entry.items[i].object_offset);
2154 le_hash = o->entry.items[i].hash;
2156 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2160 if (le_hash != o->data.hash)
2163 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2166 /* We hit the limit on 32bit machines */
2167 if ((uint64_t) t != l)
2170 if (o->object.flags & OBJECT_COMPRESSED) {
2174 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2177 data = from->compress_buffer;
2180 return -EPROTONOSUPPORT;
2183 data = o->data.payload;
2185 r = journal_file_append_data(to, data, l, &u, &h);
2189 xor_hash ^= le64toh(u->data.hash);
2190 items[i].object_offset = htole64(h);
2191 items[i].hash = u->data.hash;
2193 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2198 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
/* Fills in every metric in m that is still set to (uint64_t) -1 and
 * normalizes the rest, using the size of the file system behind fd as
 * reported by fstatvfs().
 *
 * max_use defaults to 10% of the file system size, clamped between
 * DEFAULT_MAX_USE_LOWER and DEFAULT_MAX_USE_UPPER; an explicit value is
 * page-aligned and raised to at least two minimum-size files. max_size
 * defaults to one eighth of max_use, capped at DEFAULT_MAX_SIZE_UPPER, and is
 * never below JOURNAL_FILE_SIZE_MIN; max_use is raised to at least twice
 * max_size. min_size defaults to JOURNAL_FILE_SIZE_MIN and max_size is raised
 * to at least min_size. keep_free defaults to 5% of the file system size,
 * capped at DEFAULT_KEEP_FREE_UPPER, or to DEFAULT_KEEP_FREE. The resulting
 * values are logged. */
2201 void journal_default_metrics(JournalMetrics *m, int fd) {
2202 uint64_t fs_size = 0;
2204 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2209 if (fstatvfs(fd, &ss) >= 0)
2210 fs_size = ss.f_frsize * ss.f_blocks;
2212 if (m->max_use == (uint64_t) -1) {
2215 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2217 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2218 m->max_use = DEFAULT_MAX_USE_UPPER;
2220 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2221 m->max_use = DEFAULT_MAX_USE_LOWER;
2223 m->max_use = DEFAULT_MAX_USE_LOWER;
2225 m->max_use = PAGE_ALIGN(m->max_use);
2227 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2228 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2231 if (m->max_size == (uint64_t) -1) {
2232 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2234 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2235 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2237 m->max_size = PAGE_ALIGN(m->max_size);
2239 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2240 m->max_size = JOURNAL_FILE_SIZE_MIN;
2242 if (m->max_size*2 > m->max_use)
2243 m->max_use = m->max_size*2;
2245 if (m->min_size == (uint64_t) -1)
2246 m->min_size = JOURNAL_FILE_SIZE_MIN;
2248 m->min_size = PAGE_ALIGN(m->min_size);
2250 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2251 m->min_size = JOURNAL_FILE_SIZE_MIN;
2253 if (m->min_size > m->max_size)
2254 m->max_size = m->min_size;
2257 if (m->keep_free == (uint64_t) -1) {
2260 m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2262 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2263 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2266 m->keep_free = DEFAULT_KEEP_FREE;
2269 log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2270 format_bytes(a, sizeof(a), m->max_use),
2271 format_bytes(b, sizeof(b), m->max_size),
2272 format_bytes(c, sizeof(c), m->min_size),
2273 format_bytes(d, sizeof(d), m->keep_free));