1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
30 #include "journal-def.h"
31 #include "journal-file.h"
35 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*16ULL)
36 #define DEFAULT_FIELD_HASH_TABLE_SIZE (2047ULL*16ULL)
38 #define DEFAULT_WINDOW_SIZE (8ULL*1024ULL*1024ULL)
40 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
42 /* This is the minimum journal file size */
43 #define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL) /* 64 KiB */
45 /* These are the lower and upper bounds if we deduce the max_use value
46 * from the file system size */
47 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
48 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
50 /* This is the upper bound if we deduce max_size from max_use */
51 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
53 /* This is the upper bound if we deduce the keep_free value from the file system size */
55 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
57 /* This is the keep_free value when we can't determine the system size */
59 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
61 static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };
63 #define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
/* Release everything a JournalFile holds: a writable file with a mapped
 * header is flagged STATE_OFFLINE, every window that currently has a
 * mapping is unmapped, the descriptor is closed and the compression
 * buffer is freed. */
65 void journal_file_close(JournalFile *f) {
70 if (f->header && f->writable)
71 f->header->state = STATE_OFFLINE;
74 for (t = 0; t < _WINDOW_MAX; t++)
75 if (f->windows[t].ptr)
76 munmap(f->windows[t].ptr, f->windows[t].size);
79 close_nointr_nofail(f->fd);
84 free(f->compress_buffer);
/* Build a fresh on-disk header and write it at file offset 0 with pwrite().
 * The header gets the file signature, an arena offset equal to the
 * 64-bit-aligned header size, and a random file ID. The sequence number ID
 * and counter are either copied from the template header or, otherwise, the
 * sequence number ID is set to the new file ID.
 * NOTE(review): the condition choosing between the two seqnum variants is
 * not visible here; presumably it depends on whether a template was
 * passed. */
90 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
98 memcpy(h.signature, signature, 8);
99 h.arena_offset = htole64(ALIGN64(sizeof(h)));
101 r = sd_id128_randomize(&h.file_id);
106 h.seqnum_id = template->header->seqnum_id;
107 h.seqnum = template->header->seqnum;
109 h.seqnum_id = h.file_id;
111 k = pwrite(f->fd, &h, sizeof(h), 0);
/* Update the mapped header for the current machine and boot: stores the
 * machine ID, fetches the current boot ID, marks the tail monotonic
 * timestamp as valid when the header already carries that same boot ID,
 * then stores the boot ID and switches the state to STATE_ONLINE. A memory
 * barrier is issued after the header writes. */
121 static int journal_file_refresh_header(JournalFile *f) {
127 r = sd_id128_get_machine(&f->header->machine_id);
131 r = sd_id128_get_boot(&boot_id);
135 if (sd_id128_equal(boot_id, f->header->boot_id))
136 f->tail_entry_monotonic_valid = true;
138 f->header->boot_id = boot_id;
140 f->header->state = STATE_ONLINE;
142 __sync_synchronize();
/* Sanity-check the mapped header of an existing file: the signature must
 * match, unsupported incompatible flags yield -EPROTONOSUPPORT, the file
 * must be at least arena_offset + arena_size bytes long, and the stored
 * machine ID is compared with the local one. The recorded state is then
 * inspected; an ONLINE or unknown state is only logged at debug level.
 * NOTE(review): two flag checks are present, one tolerating
 * HEADER_INCOMPATIBLE_COMPRESSED and one rejecting any flag; presumably a
 * preprocessor conditional that is not visible here selects between them.
 * The branch guarding the machine ID check is also not visible. */
147 static int journal_file_verify_header(JournalFile *f) {
150 if (memcmp(f->header, signature, 8))
154 if ((le64toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
155 return -EPROTONOSUPPORT;
157 if (f->header->incompatible_flags != 0)
158 return -EPROTONOSUPPORT;
161 if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->arena_offset) + le64toh(f->header->arena_size)))
166 sd_id128_t machine_id;
169 r = sd_id128_get_machine(&machine_id);
173 if (!sd_id128_equal(machine_id, f->header->machine_id))
176 state = f->header->state;
178 if (state == STATE_ONLINE)
179 log_debug("Journal file %s is already online. Assuming unclean closing. Ignoring.", f->path);
180 else if (state == STATE_ARCHIVED)
182 else if (state != STATE_OFFLINE)
183 log_debug("Journal file %s has unknown state %u. Ignoring.", f->path, state);
/* Make sure the file is large enough to hold `size` bytes at `offset`.
 * The target size is offset + size rounded up to a page and never smaller
 * than the arena offset; nothing more is needed when that does not exceed
 * the current arena end (arena_offset + arena_size). Growth is checked
 * against metrics.max_size and, above metrics.min_size, against the free
 * space left after reserving metrics.keep_free. The new range is then
 * allocated with posix_fallocate(), the cached stat data is refreshed and
 * arena_size in the header is updated.
 * NOTE(review): free space is computed as f_bfree * f_bsize. POSIX
 * expresses block counts in units of f_frsize, and f_bavail is the count
 * available to unprivileged processes; confirm this is intended. */
189 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
190 uint64_t old_size, new_size;
195 /* We assume that this file is not sparse, and we know that
196 * for sure, since we always call posix_fallocate()
200 le64toh(f->header->arena_offset) +
201 le64toh(f->header->arena_size);
203 new_size = PAGE_ALIGN(offset + size);
204 if (new_size < le64toh(f->header->arena_offset))
205 new_size = le64toh(f->header->arena_offset);
207 if (new_size <= old_size)
210 if (f->metrics.max_size > 0 &&
211 new_size > f->metrics.max_size)
214 if (new_size > f->metrics.min_size &&
215 f->metrics.keep_free > 0) {
218 if (fstatvfs(f->fd, &svfs) >= 0) {
221 available = svfs.f_bfree * svfs.f_bsize;
223 if (available >= f->metrics.keep_free)
224 available -= f->metrics.keep_free;
228 if (new_size - old_size > available)
233 /* Note that the glibc fallocate() fallback is very
234 inefficient, hence we try to minimize the allocation area
236 r = posix_fallocate(f->fd, old_size, new_size - old_size);
240 if (fstat(f->fd, &f->last_stat) < 0)
243 f->header->arena_size = htole64(new_size - le64toh(f->header->arena_offset));
/* Create a shared mapping that covers the requested byte range. The window
 * starts at `offset` rounded down to a page boundary and its length is
 * rounded up to a whole number of pages. Windows reaching beyond the
 * page-aligned file size are refused with -EADDRNOTAVAIL. On success *ret
 * points at the byte for `offset` inside the window. */
248 static int journal_file_map(
257 uint64_t woffset, wsize;
264 woffset = offset & ~((uint64_t) page_size() - 1ULL);
265 wsize = size + (offset - woffset);
266 wsize = PAGE_ALIGN(wsize);
268 /* Avoid SIGBUS on invalid accesses */
269 if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size))
270 return -EADDRNOTAVAIL;
272 window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
273 if (window == MAP_FAILED)
285 *ret = (uint8_t*) window + (offset - woffset);
/* Return in *ret a pointer to `size` bytes at file offset `offset`, using
 * the per-type window `wt`. A range beyond the cached file size triggers
 * one fstat() refresh before -EADDRNOTAVAIL is returned. If the existing
 * window already covers the range it is reused; otherwise it is unmapped
 * and a new one is created through journal_file_map(). Requests smaller
 * than DEFAULT_WINDOW_SIZE are widened to that size and clipped to the end
 * of the file. */
290 static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
299 assert(wt < _WINDOW_MAX);
301 if (offset + size > (uint64_t) f->last_stat.st_size) {
302 /* Hmm, out of range? Let's refresh the fstat() data
303 * first, before we trust that check. */
305 if (fstat(f->fd, &f->last_stat) < 0 ||
306 offset + size > (uint64_t) f->last_stat.st_size)
307 return -EADDRNOTAVAIL;
312 if (_likely_(w->ptr &&
313 w->offset <= offset &&
314 w->offset + w->size >= offset + size)) {
316 *ret = (uint8_t*) w->ptr + (offset - w->offset);
321 if (munmap(w->ptr, w->size) < 0)
325 w->size = w->offset = 0;
328 if (size < DEFAULT_WINDOW_SIZE) {
329 /* If the default window size is larger than what was
330 * asked for extend the mapping a bit in the hope to
331 * minimize needed remappings later on. We add half
332 * the window space before and half behind the
333 * requested mapping */
335 delta = (DEFAULT_WINDOW_SIZE - size) / 2;
341 size = DEFAULT_WINDOW_SIZE;
345 if (offset + size > (uint64_t) f->last_stat.st_size)
346 size = (uint64_t) f->last_stat.st_size - offset;
349 return -EADDRNOTAVAIL;
351 r = journal_file_map(f,
353 &w->ptr, &w->offset, &w->size,
359 *ret = (uint8_t*) p + delta;
/* Recompute the payload hash of an object and fetch the stored one. This
 * is done for uncompressed DATA objects and for FIELD objects; the payload
 * length is the object size minus the offset of the payload member.
 * NOTE(review): the comparison of h1 and h2 is not visible here;
 * presumably the function returns whether they match. */
363 static bool verify_hash(Object *o) {
368 if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
369 h1 = le64toh(o->data.hash);
370 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
371 } else if (o->object.type == OBJECT_FIELD) {
372 h1 = le64toh(o->field.hash);
373 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
/* Map the object at `offset`. Only the ObjectHeader is mapped first, using
 * the window for `type` or WINDOW_UNKNOWN when type is negative. The
 * stored size is checked against the header size, and the stored type
 * against `type` when one was requested. Objects larger than the bare
 * header are then mapped again in full, using the window of their actual
 * type. */
380 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
388 assert(type < _OBJECT_TYPE_MAX);
390 r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
395 s = le64toh(o->object.size);
397 if (s < sizeof(ObjectHeader))
400 if (type >= 0 && o->object.type != type)
403 if (s > sizeof(ObjectHeader)) {
404 r = journal_file_move_to(f, o->object.type, offset, s, &t);
/* Pick the next sequence number: one more than the value in the header,
 * reconciled with the optional external counter as described in the inline
 * comment. The result is written back to the header and also becomes
 * first_seqnum when that field is still zero. */
418 static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
423 r = le64toh(f->header->seqnum) + 1;
426 /* If an external seqnum counter was passed, we update
427 * both the local and the external one, and set it to
428 * the maximum of both */
436 f->header->seqnum = htole64(r);
438 if (f->header->first_seqnum == 0)
439 f->header->first_seqnum = htole64(r);
/* Append a new object of `type` and `size` bytes (at least one
 * ObjectHeader) to the file. The position is taken from
 * tail_object_offset: either the arena offset, or the position just past
 * the current tail object rounded up to 8 bytes. Space is allocated, the
 * range is mapped, type and size are written into the object header, and
 * the tail pointer and object count in the file header are updated.
 * NOTE(review): the branch deciding between arena start and
 * "after the tail object" is not visible here; presumably it tests
 * whether the tail offset is zero. */
444 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
451 assert(size >= sizeof(ObjectHeader));
455 p = le64toh(f->header->tail_object_offset);
457 p = le64toh(f->header->arena_offset);
459 r = journal_file_move_to_object(f, -1, p, &tail);
463 p += ALIGN64(le64toh(tail->object.size));
466 r = journal_file_allocate(f, p, size);
470 r = journal_file_move_to(f, type, p, size, &t);
477 o->object.type = type;
478 o->object.size = htole64(size);
480 f->header->tail_object_offset = htole64(p);
481 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
/* Create the data hash table: appends an OBJECT_DATA_HASH_TABLE object
 * with DEFAULT_DATA_HASH_TABLE_SIZE bytes of items, zeroes the items and
 * records the table in the file header. The recorded offset points at the
 * items array, not at the start of the object. */
489 static int journal_file_setup_data_hash_table(JournalFile *f) {
496 s = DEFAULT_DATA_HASH_TABLE_SIZE;
497 r = journal_file_append_object(f,
498 OBJECT_DATA_HASH_TABLE,
499 offsetof(Object, hash_table.items) + s,
504 memset(o->hash_table.items, 0, s);
506 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
507 f->header->data_hash_table_size = htole64(s);
/* Create the field hash table: appends an OBJECT_FIELD_HASH_TABLE object
 * with DEFAULT_FIELD_HASH_TABLE_SIZE bytes of items, zeroes the items and
 * records the table in the file header. The recorded offset points at the
 * items array, not at the start of the object. */
512 static int journal_file_setup_field_hash_table(JournalFile *f) {
519 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
520 r = journal_file_append_object(f,
521 OBJECT_FIELD_HASH_TABLE,
522 offsetof(Object, hash_table.items) + s,
527 memset(o->hash_table.items, 0, s);
529 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
530 f->header->field_hash_table_size = htole64(s);
/* Map the data hash table using the offset and size recorded in the file
 * header, through the WINDOW_DATA_HASH_TABLE window, and cache the
 * resulting pointer in f->data_hash_table. */
535 static int journal_file_map_data_hash_table(JournalFile *f) {
542 p = le64toh(f->header->data_hash_table_offset);
543 s = le64toh(f->header->data_hash_table_size);
545 r = journal_file_move_to(f,
546 WINDOW_DATA_HASH_TABLE,
552 f->data_hash_table = t;
/* Map the field hash table using the offset and size recorded in the file
 * header, through the WINDOW_FIELD_HASH_TABLE window, and cache the
 * resulting pointer in f->field_hash_table. */
556 static int journal_file_map_field_hash_table(JournalFile *f) {
563 p = le64toh(f->header->field_hash_table_offset);
564 s = le64toh(f->header->field_hash_table_size);
566 r = journal_file_move_to(f,
567 WINDOW_FIELD_HASH_TABLE,
573 f->field_hash_table = t;
/* Hook a freshly appended DATA object at `offset` into the data hash
 * table. The link and entry fields of the object are cleared, the bucket
 * is hash modulo the number of HashItem slots, and the object becomes
 * either the bucket head or the successor of an existing object. The
 * bucket tail is always set to the new object.
 * NOTE(review): the object whose next_hash_offset gets patched is the one
 * read from head_hash_offset, not tail_hash_offset. If that is what the
 * hidden lines also do, a third object landing in the same bucket would
 * overwrite the head's successor and orphan the second object; confirm
 * whether the tail offset was intended. */
577 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
584 assert(o->object.type == OBJECT_DATA);
586 /* This might alter the window we are looking at */
588 o->data.next_hash_offset = o->data.next_field_offset = 0;
589 o->data.entry_offset = o->data.entry_array_offset = 0;
590 o->data.n_entries = 0;
592 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
593 p = le64toh(f->data_hash_table[h].head_hash_offset);
595 /* Only entry in the hash table is easy */
596 f->data_hash_table[h].head_hash_offset = htole64(offset);
598 /* Move back to the previous data object, to patch in
601 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
605 o->data.next_hash_offset = htole64(offset);
608 f->data_hash_table[h].tail_hash_offset = htole64(offset);
/* Look up a DATA object whose payload equals data[0..size) given its
 * precomputed hash. The bucket chain is walked from head_hash_offset via
 * next_hash_offset. Objects with a different stored hash are skipped.
 * Compressed objects are decompressed into f->compress_buffer before
 * comparison; uncompressed ones must match both in object size and in
 * payload bytes.
 * NOTE(review): the -EPROTONOSUPPORT return presumably belongs to a build
 * without compression support; the conditional is not visible here. */
613 int journal_file_find_data_object_with_hash(
615 const void *data, uint64_t size, uint64_t hash,
616 Object **ret, uint64_t *offset) {
618 uint64_t p, osize, h;
622 assert(data || size == 0);
624 osize = offsetof(Object, data.payload) + size;
626 if (f->header->data_hash_table_size == 0)
629 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
630 p = le64toh(f->data_hash_table[h].head_hash_offset);
635 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
639 if (le64toh(o->data.hash) != hash)
642 if (o->object.flags & OBJECT_COMPRESSED) {
646 l = le64toh(o->object.size);
647 if (l <= offsetof(Object, data.payload))
650 l -= offsetof(Object, data.payload);
652 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
656 memcmp(f->compress_buffer, data, size) == 0) {
667 return -EPROTONOSUPPORT;
670 } else if (le64toh(o->object.size) == osize &&
671 memcmp(o->data.payload, data, size) == 0) {
683 p = le64toh(o->data.next_hash_offset);
/* Convenience wrapper: hashes the payload with hash64() and forwards to
 * journal_file_find_data_object_with_hash(). */
689 int journal_file_find_data_object(
691 const void *data, uint64_t size,
692 Object **ret, uint64_t *offset) {
697 assert(data || size == 0);
699 hash = hash64(data, size);
701 return journal_file_find_data_object_with_hash(f,
/* Store a payload as a DATA object. The payload is hashed and looked up
 * first, then a new OBJECT_DATA of header-plus-payload size is appended
 * and its hash stored. Payloads of at least COMPRESSION_SIZE_THRESHOLD
 * bytes may be compressed in place; on success the object size shrinks to
 * the compressed length, OBJECT_COMPRESSED is set on the object and
 * HEADER_INCOMPATIBLE_COMPRESSED on the file header. Otherwise the
 * payload is copied verbatim. The object is finally linked into the hash
 * table and mapped again, since linking may move the window.
 * NOTE(review): the handling of a successful lookup is not visible here;
 * presumably the existing object is returned instead of appending.
 * NOTE(review): incompatible_flags is accessed with 32-bit helpers here,
 * while journal_file_verify_header() reads it with le64toh(); confirm the
 * field width. */
706 static int journal_file_append_data(
708 const void *data, uint64_t size,
709 Object **ret, uint64_t *offset) {
715 bool compressed = false;
718 assert(data || size == 0);
720 hash = hash64(data, size);
722 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
736 osize = offsetof(Object, data.payload) + size;
737 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
741 o->data.hash = htole64(hash);
745 size >= COMPRESSION_SIZE_THRESHOLD) {
748 compressed = compress_blob(data, size, o->data.payload, &rsize);
751 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
752 o->object.flags |= OBJECT_COMPRESSED;
754 f->header->incompatible_flags = htole32(le32toh(f->header->incompatible_flags) | HEADER_INCOMPATIBLE_COMPRESSED);
756 log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
762 memcpy(o->data.payload, data, size);
764 r = journal_file_link_data(f, o, p, hash);
768 /* The linking might have altered the window, so let's
769 * refresh our pointer */
770 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
/* Number of EntryItem slots in an ENTRY object, derived from the object
 * size minus the offset of the items array. */
783 uint64_t journal_file_entry_n_items(Object *o) {
785 assert(o->object.type == OBJECT_ENTRY);
787 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
/* Number of 64-bit offset slots in an ENTRY_ARRAY object, derived from the
 * object size minus the offset of the items array. */
790 static uint64_t journal_file_entry_array_n_items(Object *o) {
792 assert(o->object.type == OBJECT_ENTRY_ARRAY);
794 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
/* Record entry offset `p` at logical index *idx of a chained list of
 * ENTRY_ARRAY objects. Existing arrays are visited in chain order; when
 * the index falls inside one, the slot is written and *idx is incremented.
 * Otherwise a new ENTRY_ARRAY object is appended, the offset is stored in
 * it, the new array is chained behind the last visited array (`ap`) and
 * *idx is incremented. *idx is kept in little-endian form.
 * NOTE(review): the loop structure, the sizing of the new array and the
 * handling of an empty chain are not visible here. */
797 static int link_entry_into_array(JournalFile *f,
802 uint64_t n = 0, ap = 0, q, i, a, hidx;
811 i = hidx = le64toh(*idx);
814 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
818 n = journal_file_entry_array_n_items(o);
820 o->entry_array.items[i] = htole64(p);
821 *idx = htole64(hidx + 1);
827 a = le64toh(o->entry_array.next_entry_array_offset);
838 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
839 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
844 o->entry_array.items[i] = htole64(p);
849 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
853 o->entry_array.next_entry_array_offset = htole64(q);
856 *idx = htole64(hidx + 1);
/* Variant of link_entry_into_array() for lists whose logical index is one
 * ahead of the array index: the array is addressed with *idx - 1 through a
 * temporary, and *idx itself is then incremented by one.
 * NOTE(review): presumably the first element of such a list lives in a
 * separate field and is handled by lines not visible here. */
861 static int link_entry_into_array_plus_one(JournalFile *f,
880 i = htole64(le64toh(*idx) - 1);
881 r = link_entry_into_array(f, first, &i, p);
886 *idx = htole64(le64toh(*idx) + 1);
/* Register an entry with the DATA object referenced by its i-th item: the
 * data object is mapped (which replaces the caller-visible `o`) and the
 * entry is added to that data object's own entry list via
 * link_entry_into_array_plus_one(). */
890 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
897 p = le64toh(o->entry.items[i].object_offset);
901 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
905 return link_entry_into_array_plus_one(f,
906 &o->data.entry_offset,
907 &o->data.entry_array_offset,
/* Make a fully written ENTRY object reachable. After a memory barrier the
 * entry is added to the global entry array in the file header. The header
 * timestamps are then updated: head_entry_realtime if still zero, and both
 * tail timestamps always. The tail monotonic timestamp is marked valid and
 * every item of the entry is linked to its data object. */
912 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
919 assert(o->object.type == OBJECT_ENTRY);
921 __sync_synchronize();
923 /* Link up the entry itself */
924 r = link_entry_into_array(f,
925 &f->header->entry_array_offset,
926 &f->header->n_entries,
931 /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
933 if (f->header->head_entry_realtime == 0)
934 f->header->head_entry_realtime = o->entry.realtime;
936 f->header->tail_entry_realtime = o->entry.realtime;
937 f->header->tail_entry_monotonic = o->entry.monotonic;
939 f->tail_entry_monotonic_valid = true;
941 /* Link up the items */
942 n = journal_file_entry_n_items(o);
943 for (i = 0; i < n; i++) {
944 r = journal_file_link_entry_item(f, o, offset, i);
/* Append an ENTRY object for items that already refer to stored data
 * objects. The object is sized for n_items EntryItems, then filled with a
 * fresh sequence number, the item array, both timestamps, the xor hash and
 * the boot ID from the file header, and finally linked through
 * journal_file_link_entry(). */
952 static int journal_file_append_entry_internal(
954 const dual_timestamp *ts,
956 const EntryItem items[], unsigned n_items,
958 Object **ret, uint64_t *offset) {
965 assert(items || n_items == 0);
968 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
970 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
974 o->entry.seqnum = htole64(journal_file_seqnum(f, seqnum));
975 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
976 o->entry.realtime = htole64(ts->realtime);
977 o->entry.monotonic = htole64(ts->monotonic);
978 o->entry.xor_hash = htole64(xor_hash);
979 o->entry.boot_id = f->header->boot_id;
981 r = journal_file_link_entry(f, o, np);
/* Notify inotify watchers that the file changed: after a memory barrier
 * the file is truncated to its cached current size, which leaves the
 * contents alone but raises IN_MODIFY. A failure is only logged.
 * NOTE(review): the log message reads "Failed to to truncate"; the
 * duplicated word looks like a typo in the runtime string. */
994 void journal_file_post_change(JournalFile *f) {
997 /* inotify() does not receive IN_MODIFY events from file
998 * accesses done via mmap(). After each access we hence
999 * trigger IN_MODIFY by truncating the journal file to its
1000 * current size which triggers IN_MODIFY. */
1002 __sync_synchronize();
1004 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1005 log_error("Failed to to truncate file to its own size: %m");
/* Append one log entry made of n_iovec fields. Each iovec is stored (or
 * found) as a DATA object; its offset and hash are collected into a
 * temporary item array and folded into the xor hash. The entry is then
 * written with journal_file_append_entry_internal() and watchers are
 * notified through journal_file_post_change(). A local timestamp is
 * obtained with dual_timestamp_get(), and a monotonic timestamp older
 * than the recorded tail is detected when the tail value is valid.
 * NOTE(review): presumably the local timestamp is only used when `ts` is
 * NULL; the condition is not visible here.
 * NOTE(review): the item array is taken from the stack with alloca() and
 * its size is governed by the caller-supplied n_iovec; consider a bound. */
1008 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1012 uint64_t xor_hash = 0;
1013 struct dual_timestamp _ts;
1016 assert(iovec || n_iovec == 0);
1022 dual_timestamp_get(&_ts);
1026 if (f->tail_entry_monotonic_valid &&
1027 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1030 items = alloca(sizeof(EntryItem) * n_iovec);
1032 for (i = 0; i < n_iovec; i++) {
1036 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1040 xor_hash ^= le64toh(o->data.hash);
1041 items[i].object_offset = htole64(p);
1042 items[i].hash = o->data.hash;
1045 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1047 journal_file_post_change(f);
/* Fetch the entry stored at a logical index of a chained entry array. The
 * chain of ENTRY_ARRAY objects is followed via next_entry_array_offset
 * until the array holding the index is reached; the offset found there is
 * then mapped as an ENTRY object. A zero array offset or zero entry offset
 * is treated as "nothing there". */
1052 static int generic_array_get(JournalFile *f,
1055 Object **ret, uint64_t *offset) {
1067 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1071 n = journal_file_entry_array_n_items(o);
1073 p = le64toh(o->entry_array.items[i]);
1078 a = le64toh(o->entry_array.next_entry_array_offset);
1081 if (a <= 0 || p <= 0)
1084 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
/* Like generic_array_get(), for lists with one extra entry in front of the
 * array: the entry at offset `extra` is mapped directly, and any other
 * index is looked up in the array at position i - 1.
 * NOTE(review): presumably the direct path is taken for index zero; the
 * condition is not visible here. */
1097 static int generic_array_get_plus_one(JournalFile *f,
1101 Object **ret, uint64_t *offset) {
1110 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1123 return generic_array_get(f, first, i-1, ret, offset);
/* Search a chained entry array for `needle` using the test_object
 * callback. For each ENTRY_ARRAY in the chain one element is tested first
 * to decide whether the needle can lie in this array; if so, the array is
 * bisected between `left` and `right`. A TEST_FOUND result is turned into
 * TEST_RIGHT when searching DIRECTION_DOWN and into TEST_LEFT otherwise.
 * When the search converges while going DIRECTION_UP, the element just
 * before the convergence point is selected (subtract_one). The selected
 * entry is mapped and its overall index, t + i adjusted for subtract_one,
 * is reported through `idx`.
 * NOTE(review): statement order matters here and many lines are not
 * visible; this summary covers only what the visible lines establish. */
1132 static int generic_array_bisect(JournalFile *f,
1136 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1137 direction_t direction,
1142 uint64_t a, p, t = 0, i = 0, last_p = 0;
1143 bool subtract_one = false;
1144 Object *o, *array = NULL;
1148 assert(test_object);
1152 uint64_t left, right, k, lp;
1154 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1158 k = journal_file_entry_array_n_items(array);
1164 lp = p = le64toh(array->entry_array.items[i]);
1168 r = test_object(f, p, needle);
1172 if (r == TEST_FOUND)
1173 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1175 if (r == TEST_RIGHT) {
1179 if (left == right) {
1180 if (direction == DIRECTION_UP)
1181 subtract_one = true;
1187 assert(left < right);
1189 i = (left + right) / 2;
1190 p = le64toh(array->entry_array.items[i]);
1194 r = test_object(f, p, needle);
1198 if (r == TEST_FOUND)
1199 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1201 if (r == TEST_RIGHT)
1215 a = le64toh(array->entry_array.next_entry_array_offset);
1221 if (subtract_one && t == 0 && i == 0)
1224 if (subtract_one && i == 0)
1226 else if (subtract_one)
1227 p = le64toh(array->entry_array.items[i-1]);
1229 p = le64toh(array->entry_array.items[i]);
1231 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1242 *idx = t + i - (subtract_one ? 1 : 0);
/* Bisect a list that has one extra entry in front of its entry array. The
 * extra entry is tested first; a TEST_FOUND result maps that entry
 * directly. Otherwise the search continues in the array via
 * generic_array_bisect() with n - 1 elements.
 * NOTE(review): how the index returned by the array search is adjusted for
 * the extra element is not visible here. */
1247 static int generic_array_bisect_plus_one(JournalFile *f,
1252 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1253 direction_t direction,
1261 assert(test_object);
1266 /* This bisects the array in object 'first', but first checks
1268 r = test_object(f, extra, needle);
1271 else if (r == TEST_FOUND) {
1274 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1288 } else if (r == TEST_RIGHT)
1291 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
/* Bisection callback: maps the ENTRY at `p` and compares its sequence
 * number with `needle` (equal, smaller, or otherwise). */
1299 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1306 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1310 if (le64toh(o->entry.seqnum) == needle)
1312 else if (le64toh(o->entry.seqnum) < needle)
/* Seek within the global entry array of the file (offset and count taken
 * from the header) by bisection, in the given direction.
 * NOTE(review): presumably the needle is a sequence number tested with
 * test_object_seqnum(); those arguments are not visible here. */
1318 int journal_file_move_to_entry_by_seqnum(
1321 direction_t direction,
1325 return generic_array_bisect(f,
1326 le64toh(f->header->entry_array_offset),
1327 le64toh(f->header->n_entries),
/* Bisection callback: maps the ENTRY at `p` and compares its realtime
 * timestamp with `needle` (equal, smaller, or otherwise). */
1334 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1341 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1345 if (le64toh(o->entry.realtime) == needle)
1347 else if (le64toh(o->entry.realtime) < needle)
/* Seek within the global entry array of the file (offset and count taken
 * from the header) by bisection with test_object_realtime(), in the given
 * direction. */
1353 int journal_file_move_to_entry_by_realtime(
1356 direction_t direction,
1360 return generic_array_bisect(f,
1361 le64toh(f->header->entry_array_offset),
1362 le64toh(f->header->n_entries),
1364 test_object_realtime,
/* Bisection callback: maps the ENTRY at `p` and compares its monotonic
 * timestamp with `needle` (equal, smaller, or otherwise). */
1369 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1376 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1380 if (le64toh(o->entry.monotonic) == needle)
1382 else if (le64toh(o->entry.monotonic) < needle)
/* Seek by monotonic timestamp within one boot. A match string for the
 * boot ID is built, the corresponding DATA object is looked up, and the
 * entry list of that data object is bisected with test_object_monotonic().
 * NOTE(review): "_BOOT_ID=" is nine characters long, yet the ID is
 * formatted at t + 8 into a buffer sized 8+32+1. That overwrites the '='
 * and yields "_BOOT_ID" followed directly by the hex ID, which would not
 * match a field stored as "_BOOT_ID=<id>". Likely intended: a 9+32+1
 * buffer and t + 9; confirm. */
1388 int journal_file_move_to_entry_by_monotonic(
1392 direction_t direction,
1396 char t[8+32+1] = "_BOOT_ID=";
1400 sd_id128_to_string(boot_id, t + 8);
1402 r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1408 return generic_array_bisect_plus_one(f,
1409 le64toh(o->data.entry_offset),
1410 le64toh(o->data.entry_array_offset),
1411 le64toh(o->data.n_entries),
1413 test_object_monotonic,
/* Bisection callback that compares the entry offset `p` itself with
 * `needle`; no object is mapped in the visible lines. */
1418 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1424 else if (p < needle)
/* Step to the neighbouring entry in the global entry array. Without a
 * current entry the first (DIRECTION_DOWN) or last index is chosen. With a
 * current entry `o` at offset `p` (which must be an ENTRY object) its
 * index is located by bisection and then moved one step in `direction`.
 * The resulting index is resolved with generic_array_get().
 * NOTE(review): the range checks on the new index are not visible here. */
1430 int journal_file_next_entry(
1432 Object *o, uint64_t p,
1433 direction_t direction,
1434 Object **ret, uint64_t *offset) {
1440 assert(p > 0 || !o);
1442 n = le64toh(f->header->n_entries);
1447 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1449 if (o->object.type != OBJECT_ENTRY)
1452 r = generic_array_bisect(f,
1453 le64toh(f->header->entry_array_offset),
1454 le64toh(f->header->n_entries),
1463 if (direction == DIRECTION_DOWN) {
1476 /* And jump to it */
1477 return generic_array_get(f,
1478 le64toh(f->header->entry_array_offset),
/* Move `skip` entries away from the entry `o` at offset `p` in the global
 * entry array. The current index is located by bisection, then adjusted:
 * a negative skip is subtracted (with a guard for skips that reach or
 * pass index zero) and a positive skip is added. The result is checked
 * against n_entries and resolved with generic_array_get().
 * NOTE(review): the clamping applied at either end is not visible here. */
1483 int journal_file_skip_entry(
1485 Object *o, uint64_t p,
1487 Object **ret, uint64_t *offset) {
1496 if (o->object.type != OBJECT_ENTRY)
1499 r = generic_array_bisect(f,
1500 le64toh(f->header->entry_array_offset),
1501 le64toh(f->header->n_entries),
1510 /* Calculate new index */
1512 if ((uint64_t) -skip >= i)
1515 i = i - (uint64_t) -skip;
1517 i += (uint64_t) skip;
1519 n = le64toh(f->header->n_entries);
1526 return generic_array_get(f,
1527 le64toh(f->header->entry_array_offset),
/* Step to the neighbouring entry among those that reference the DATA
 * object at data_offset. The data object is mapped to obtain its entry
 * list. Without a current entry the first (DIRECTION_DOWN) or last index
 * is chosen; otherwise the index of `o` is located by bisection and moved
 * one step in `direction`. The result is resolved with
 * generic_array_get_plus_one().
 * NOTE(review): `d` is read again after calls that may remap windows;
 * whether the pointer is refreshed in between is not visible here. */
1532 int journal_file_next_entry_for_data(
1534 Object *o, uint64_t p,
1535 uint64_t data_offset,
1536 direction_t direction,
1537 Object **ret, uint64_t *offset) {
1544 assert(p > 0 || !o);
1546 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1550 n = le64toh(d->data.n_entries);
1555 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1557 if (o->object.type != OBJECT_ENTRY)
1560 r = generic_array_bisect_plus_one(f,
1561 le64toh(d->data.entry_offset),
1562 le64toh(d->data.entry_array_offset),
1563 le64toh(d->data.n_entries),
1573 if (direction == DIRECTION_DOWN) {
1587 return generic_array_get_plus_one(f,
1588 le64toh(d->data.entry_offset),
1589 le64toh(d->data.entry_array_offset),
/* Seek among the entries referencing the DATA object at data_offset: the
 * data object is mapped and its entry list (inline entry, array offset and
 * count) is bisected in the given direction.
 * NOTE(review): presumably the needle is a sequence number tested with
 * test_object_seqnum(); those arguments are not visible here. */
1594 int journal_file_move_to_entry_by_seqnum_for_data(
1596 uint64_t data_offset,
1598 direction_t direction,
1599 Object **ret, uint64_t *offset) {
1604 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1608 return generic_array_bisect_plus_one(f,
1609 le64toh(d->data.entry_offset),
1610 le64toh(d->data.entry_array_offset),
1611 le64toh(d->data.n_entries),
/* Seek by realtime timestamp among the entries referencing the DATA
 * object at data_offset: the data object is mapped and its entry list is
 * bisected with test_object_realtime() in the given direction. */
1618 int journal_file_move_to_entry_by_realtime_for_data(
1620 uint64_t data_offset,
1622 direction_t direction,
1623 Object **ret, uint64_t *offset) {
1628 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1632 return generic_array_bisect_plus_one(f,
1633 le64toh(d->data.entry_offset),
1634 le64toh(d->data.entry_array_offset),
1635 le64toh(d->data.n_entries),
1637 test_object_realtime,
/* Debugging aid: prints header fields (IDs, arena size, object and entry
 * counts) and then walks all objects from the arena offset, printing the
 * type of each one plus seqnum and timestamps for entries and a flag line
 * for compressed objects. The walk stops at tail_object_offset; each step
 * advances by the 8-byte-aligned object size. "File corrupt" is logged on
 * the failure path.
 * NOTE(review): n_objects and n_entries are 64-bit values but are cast to
 * unsigned long for printing, which truncates where long is 32 bits. */
1642 void journal_file_dump(JournalFile *f) {
1643 char a[33], b[33], c[33];
1650 printf("File Path: %s\n"
1654 "Arena size: %llu\n"
1658 sd_id128_to_string(f->header->file_id, a),
1659 sd_id128_to_string(f->header->machine_id, b),
1660 sd_id128_to_string(f->header->boot_id, c),
1661 (unsigned long long) le64toh(f->header->arena_size),
1662 (unsigned long) le64toh(f->header->n_objects),
1663 (unsigned long) le64toh(f->header->n_entries));
1665 p = le64toh(f->header->arena_offset);
1667 r = journal_file_move_to_object(f, -1, p, &o);
1671 switch (o->object.type) {
1674 printf("Type: OBJECT_UNUSED\n");
1678 printf("Type: OBJECT_DATA\n");
1682 printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1683 (unsigned long long) le64toh(o->entry.seqnum),
1684 (unsigned long long) le64toh(o->entry.monotonic),
1685 (unsigned long long) le64toh(o->entry.realtime));
1688 case OBJECT_FIELD_HASH_TABLE:
1689 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1692 case OBJECT_DATA_HASH_TABLE:
1693 printf("Type: OBJECT_DATA_HASH_TABLE\n");
1696 case OBJECT_ENTRY_ARRAY:
1697 printf("Type: OBJECT_ENTRY_ARRAY\n");
1701 if (o->object.flags & OBJECT_COMPRESSED)
1702 printf("Flags: COMPRESSED\n");
1704 if (p == le64toh(f->header->tail_object_offset))
1707 p = p + ALIGN64(le64toh(o->object.size));
1712 log_error("File corrupt");
/* Open or create a journal file. Only O_RDONLY and O_RDWR access modes
 * are accepted and the name must end in ".journal". Metrics and the
 * compress setting are inherited from the template. An empty file opened
 * writable is treated as newly created: a header is written and the file
 * is stat'ed again. The file must then be at least one Header in size;
 * the header is mapped shared. Existing files are verified, the header is
 * refreshed, both hash tables are created for new files, and both are
 * mapped. journal_file_close() is called on the failure path.
 * NOTE(review): presumably the template is only dereferenced when it is
 * non-NULL and the header refresh applies only to writable files; those
 * conditions are not visible here. */
1715 int journal_file_open(
1719 JournalFile *template,
1720 JournalFile **ret) {
1724 bool newly_created = false;
1728 if ((flags & O_ACCMODE) != O_RDONLY &&
1729 (flags & O_ACCMODE) != O_RDWR)
1732 if (!endswith(fname, ".journal"))
1735 f = new0(JournalFile, 1);
1742 f->writable = (flags & O_ACCMODE) != O_RDONLY;
1743 f->prot = prot_from_flags(flags);
1746 f->metrics = template->metrics;
1747 f->compress = template->compress;
1750 f->path = strdup(fname);
1756 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
1762 if (fstat(f->fd, &f->last_stat) < 0) {
1767 if (f->last_stat.st_size == 0 && f->writable) {
1768 newly_created = true;
1770 r = journal_file_init_header(f, template);
1774 if (fstat(f->fd, &f->last_stat) < 0) {
1780 if (f->last_stat.st_size < (off_t) sizeof(Header)) {
1785 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
1786 if (f->header == MAP_FAILED) {
1792 if (!newly_created) {
1793 r = journal_file_verify_header(f);
1799 r = journal_file_refresh_header(f);
1804 if (newly_created) {
1806 r = journal_file_setup_field_hash_table(f);
1810 r = journal_file_setup_data_hash_table(f);
1815 r = journal_file_map_field_hash_table(f);
1819 r = journal_file_map_data_hash_table(f);
1829 journal_file_close(f);
/* Archive the current file and start a new one under the same path. Only
 * writable files named "*.journal" qualify. The archive name is the old
 * path without its 8-character suffix, followed by the 32-character
 * seqnum ID and "-<seqnum>-<tail realtime>.journal" with both numbers as
 * 16 hex digits. The old file is renamed, flagged STATE_ARCHIVED, a new
 * file is opened with the old one as template, and the old one is closed.
 * The buffer is sized for the path, one separator, the ID and the
 * formatted suffix.
 * NOTE(review): presumably the separator byte at p[l - 8] is written by a
 * line not visible here. */
1834 int journal_file_rotate(JournalFile **f) {
1837 JournalFile *old_file, *new_file = NULL;
1845 if (!old_file->writable)
1848 if (!endswith(old_file->path, ".journal"))
1851 l = strlen(old_file->path);
1853 p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
1857 memcpy(p, old_file->path, l - 8);
1859 sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
1860 snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
1861 "-%016llx-%016llx.journal",
1862 (unsigned long long) le64toh((*f)->header->seqnum),
1863 (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
1865 r = rename(old_file->path, p);
1871 old_file->header->state = STATE_ARCHIVED;
1873 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, old_file, &new_file);
1874 journal_file_close(old_file);
/* Open a journal file, replacing it once if it turns out to be unusable.
 * Only the results -EBADMSG, -ENODATA, -EHOSTDOWN and -EPROTONOSUPPORT
 * lead to a retry, and only for opens that are not read-only and carry
 * O_CREAT. The broken file is renamed to a name ending in
 * "@<16 hex>-<16 hex>.journal~" whose first number is the current
 * realtime clock, a warning is logged, and the open is attempted again. */
1880 int journal_file_open_reliably(
1884 JournalFile *template,
1885 JournalFile **ret) {
1891 r = journal_file_open(fname, flags, mode, template, ret);
1892 if (r != -EBADMSG && /* corrupted */
1893 r != -ENODATA && /* truncated */
1894 r != -EHOSTDOWN && /* other machine */
1895 r != -EPROTONOSUPPORT) /* incompatible feature */
1898 if ((flags & O_ACCMODE) == O_RDONLY)
1901 if (!(flags & O_CREAT))
1904 /* The file is corrupted. Rotate it away and try it again (but only once) */
1907 if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
1909 (unsigned long long) now(CLOCK_REALTIME),
1913 r = rename(fname, p);
1918 log_warning("File %s corrupted, renaming and replacing.", fname);
1920 return journal_file_open(fname, flags, mode, template, ret);
/* One candidate file collected by journal_directory_vacuum() and ordered
 * by vacuum_compare(). Besides seqnum_id, those functions use the members
 * filename, usage, seqnum, realtime and have_seqnum. */
1923 struct vacuum_info {
1928 sd_id128_t seqnum_id;
/* qsort() comparator for struct vacuum_info. Two files that both carry a
 * sequence number from the same seqnum ID are ordered by that number.
 * Otherwise the realtime stamp decides, then the raw seqnum ID bytes when
 * both have one, and finally the file name. */
1934 static int vacuum_compare(const void *_a, const void *_b) {
1935 const struct vacuum_info *a, *b;
1940 if (a->have_seqnum && b->have_seqnum &&
1941 sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
1942 if (a->seqnum < b->seqnum)
1944 else if (a->seqnum > b->seqnum)
1950 if (a->realtime < b->realtime)
1952 else if (a->realtime > b->realtime)
1954 else if (a->have_seqnum && b->have_seqnum)
1955 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
1957 return strcmp(a->filename, b->filename);
/* Delete old journal files from `directory` until disk usage is at most
 * max_use and the file system has at least min_free bytes available.
 * Regular files are scanned; two name shapes are accepted. Archived
 * files end in "@<32-char id>-<16 hex>-<16 hex>.journal" and yield a
 * seqnum ID, a seqnum and a realtime stamp. Corrupted files end in
 * "@<16 hex>-<16 hex>.journal~" and yield only a realtime stamp. Each
 * match is recorded with its usage of 512 bytes per st_blocks unit. The
 * list is sorted with vacuum_compare() and files are unlinked in that
 * order, rechecking fstatvfs() before each one, until both limits hold.
 * ENOENT on unlink is ignored; other failures are logged as warnings.
 * NOTE(review): readdir_r() is deprecated in current glibc in favour of
 * readdir(); consider migrating.
 * NOTE(review): d_name is modified in place while parsing the archived
 * form; the saved copy `p` is taken before that. */
1960 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
1963 struct vacuum_info *list = NULL;
1964 unsigned n_list = 0, n_allocated = 0, i;
1972 d = opendir(directory);
1978 struct dirent buf, *de;
1982 unsigned long long seqnum = 0, realtime;
1983 sd_id128_t seqnum_id;
1986 k = readdir_r(d, &buf, &de);
1995 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
1998 if (!S_ISREG(st.st_mode))
2001 q = strlen(de->d_name);
2003 if (endswith(de->d_name, ".journal")) {
2005 /* Vacuum archived files */
2007 if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
2010 if (de->d_name[q-8-16-1] != '-' ||
2011 de->d_name[q-8-16-1-16-1] != '-' ||
2012 de->d_name[q-8-16-1-16-1-32-1] != '@')
2015 p = strdup(de->d_name);
2021 de->d_name[q-8-16-1-16-1] = 0;
2022 if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
2027 if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
2034 } else if (endswith(de->d_name, ".journal~")) {
2035 unsigned long long tmp;
2037 /* Vacuum corrupted files */
2039 if (q < 1 + 16 + 1 + 16 + 8 + 1)
2042 if (de->d_name[q-1-8-16-1] != '-' ||
2043 de->d_name[q-1-8-16-1-16-1] != '@')
2046 p = strdup(de->d_name);
2052 if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
2057 have_seqnum = false;
2061 if (n_list >= n_allocated) {
2062 struct vacuum_info *j;
2064 n_allocated = MAX(n_allocated * 2U, 8U);
2065 j = realloc(list, n_allocated * sizeof(struct vacuum_info));
2075 list[n_list].filename = p;
2076 list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
2077 list[n_list].seqnum = seqnum;
2078 list[n_list].realtime = realtime;
2079 list[n_list].seqnum_id = seqnum_id;
2080 list[n_list].have_seqnum = have_seqnum;
2082 sum += list[n_list].usage;
2087 qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
2089 for(i = 0; i < n_list; i++) {
2092 if (fstatvfs(dirfd(d), &ss) < 0) {
2097 if (sum <= max_use &&
2098 (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
2101 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
2102 log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
2103 sum -= list[i].usage;
2104 } else if (errno != ENOENT)
2105 log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
2109 for (i = 0; i < n_list; i++)
2110 free(list[i].filename);
/* Copy the ENTRY object `o` (at offset `p` in `from`) into `to`. The
 * timestamps of the entry are checked against the tail timestamps of the
 * target. For every item the referenced DATA object is mapped in `from`,
 * its stored hash is checked against the hash kept in the item, the
 * payload is decompressed if needed, and the data is appended to `to`;
 * the resulting offset and hash form the new item and feed the xor hash.
 * Because mapping the data object overwrites `o`, the source entry is
 * mapped again at the end of each iteration. The new entry is written
 * with journal_file_append_entry_internal().
 * NOTE(review): the item array is allocated with alloca() and sized by
 * the entry's item count, which comes from the source file. */
2120 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2122 uint64_t q, xor_hash = 0;
2135 ts.monotonic = le64toh(o->entry.monotonic);
2136 ts.realtime = le64toh(o->entry.realtime);
2138 if (to->tail_entry_monotonic_valid &&
2139 ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2142 if (ts.realtime < le64toh(to->header->tail_entry_realtime))
2145 n = journal_file_entry_n_items(o);
2146 items = alloca(sizeof(EntryItem) * n);
2148 for (i = 0; i < n; i++) {
2155 q = le64toh(o->entry.items[i].object_offset);
2156 le_hash = o->entry.items[i].hash;
2158 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2162 if (le_hash != o->data.hash)
2165 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2168 /* We hit the limit on 32bit machines */
2169 if ((uint64_t) t != l)
2172 if (o->object.flags & OBJECT_COMPRESSED) {
2176 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2179 data = from->compress_buffer;
2182 return -EPROTONOSUPPORT;
2185 data = o->data.payload;
2187 r = journal_file_append_data(to, data, l, &u, &h);
2191 xor_hash ^= le64toh(u->data.hash);
2192 items[i].object_offset = htole64(h);
2193 items[i].hash = u->data.hash;
2195 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2200 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
/* Fill in unset (all-ones) members of *m and normalise the rest. The file
 * system size is f_frsize * f_blocks from fstatvfs(fd), or 0 on failure.
 *   max_use:   default is 10% of the file system, clamped between
 *              DEFAULT_MAX_USE_LOWER and DEFAULT_MAX_USE_UPPER, or the
 *              lower bound alone; an explicit value is page-aligned and
 *              raised to at least twice JOURNAL_FILE_SIZE_MIN.
 *   max_size:  default is max_use / 8 capped at DEFAULT_MAX_SIZE_UPPER;
 *              page-aligned, at least JOURNAL_FILE_SIZE_MIN, and max_use
 *              is raised so that it holds two files of max_size.
 *   min_size:  default is JOURNAL_FILE_SIZE_MIN; page-aligned, never
 *              below that minimum, and max_size is raised to it if
 *              smaller.
 *   keep_free: default is 5% of the file system capped at
 *              DEFAULT_KEEP_FREE_UPPER, or DEFAULT_KEEP_FREE.
 * The resulting values are logged.
 * NOTE(review): presumably the fallback values apply when the file system
 * size is unknown; the conditions are not visible here. */
2203 void journal_default_metrics(JournalMetrics *m, int fd) {
2204 uint64_t fs_size = 0;
2206 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2211 if (fstatvfs(fd, &ss) >= 0)
2212 fs_size = ss.f_frsize * ss.f_blocks;
2214 if (m->max_use == (uint64_t) -1) {
2217 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2219 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2220 m->max_use = DEFAULT_MAX_USE_UPPER;
2222 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2223 m->max_use = DEFAULT_MAX_USE_LOWER;
2225 m->max_use = DEFAULT_MAX_USE_LOWER;
2227 m->max_use = PAGE_ALIGN(m->max_use);
2229 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2230 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2233 if (m->max_size == (uint64_t) -1) {
2234 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2236 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2237 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2239 m->max_size = PAGE_ALIGN(m->max_size);
2241 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2242 m->max_size = JOURNAL_FILE_SIZE_MIN;
2244 if (m->max_size*2 > m->max_use)
2245 m->max_use = m->max_size*2;
2247 if (m->min_size == (uint64_t) -1)
2248 m->min_size = JOURNAL_FILE_SIZE_MIN;
2250 m->min_size = PAGE_ALIGN(m->min_size);
2252 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2253 m->min_size = JOURNAL_FILE_SIZE_MIN;
2255 if (m->min_size > m->max_size)
2256 m->max_size = m->min_size;
2259 if (m->keep_free == (uint64_t) -1) {
2262 m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2264 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2265 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2268 m->keep_free = DEFAULT_KEEP_FREE;
2271 log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2272 format_bytes(a, sizeof(a), m->max_use),
2273 format_bytes(b, sizeof(b), m->max_size),
2274 format_bytes(c, sizeof(c), m->min_size),
2275 format_bytes(d, sizeof(d), m->keep_free));