1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
30 #include "journal-def.h"
31 #include "journal-file.h"
35 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*16ULL)
36 #define DEFAULT_FIELD_HASH_TABLE_SIZE (2047ULL*16ULL)
38 #define DEFAULT_WINDOW_SIZE (8ULL*1024ULL*1024ULL)
40 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
42 /* This is the minimum journal file size */
43 #define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL) /* 64 KiB */
45 /* These are the lower and upper bounds if we deduce the max_use value
46 * from the file system size */
47 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
48 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
50 /* This is the upper bound if we deduce max_size from max_use */
51 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
53 /* This is the upper bound if we deduce the keep_free value from the
55 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
57 /* This is the keep_free value when we can't determine the system
59 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MiB */
61 static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };
63 #define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
65 void journal_file_close(JournalFile *f) {
72 f->header->state = STATE_OFFLINE;
74 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
77 for (t = 0; t < _WINDOW_MAX; t++)
78 if (f->windows[t].ptr)
79 munmap(f->windows[t].ptr, f->windows[t].size);
82 close_nointr_nofail(f->fd);
87 free(f->compress_buffer);
93 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
101 memcpy(h.signature, signature, 8);
102 h.header_size = htole64(ALIGN64(sizeof(h)));
104 r = sd_id128_randomize(&h.file_id);
109 h.seqnum_id = template->header->seqnum_id;
110 h.seqnum = template->header->seqnum;
112 h.seqnum_id = h.file_id;
114 k = pwrite(f->fd, &h, sizeof(h), 0);
124 static int journal_file_refresh_header(JournalFile *f) {
130 r = sd_id128_get_machine(&f->header->machine_id);
134 r = sd_id128_get_boot(&boot_id);
138 if (sd_id128_equal(boot_id, f->header->boot_id))
139 f->tail_entry_monotonic_valid = true;
141 f->header->boot_id = boot_id;
143 f->header->state = STATE_ONLINE;
145 __sync_synchronize();
150 static int journal_file_verify_header(JournalFile *f) {
153 if (memcmp(f->header, signature, 8))
157 if ((le64toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
158 return -EPROTONOSUPPORT;
160 if (f->header->incompatible_flags != 0)
161 return -EPROTONOSUPPORT;
164 if (f->header->header_size != htole64(ALIGN64(sizeof(*(f->header)))))
167 if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
172 sd_id128_t machine_id;
175 r = sd_id128_get_machine(&machine_id);
179 if (!sd_id128_equal(machine_id, f->header->machine_id))
182 state = f->header->state;
184 if (state == STATE_ONLINE)
185 log_debug("Journal file %s is already online. Assuming unclean closing. Ignoring.", f->path);
186 /* FIXME: immediately rotate */
187 else if (state == STATE_ARCHIVED)
189 else if (state != STATE_OFFLINE)
190 log_debug("Journal file %s has unknown state %u. Ignoring.", f->path, state);
196 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
197 uint64_t old_size, new_size;
202 /* We assume that this file is not sparse, and we know that
203 * for sure, since we always call posix_fallocate()
207 le64toh(f->header->header_size) +
208 le64toh(f->header->arena_size);
210 new_size = PAGE_ALIGN(offset + size);
211 if (new_size < le64toh(f->header->header_size))
212 new_size = le64toh(f->header->header_size);
214 if (new_size <= old_size)
217 if (f->metrics.max_size > 0 &&
218 new_size > f->metrics.max_size)
221 if (new_size > f->metrics.min_size &&
222 f->metrics.keep_free > 0) {
225 if (fstatvfs(f->fd, &svfs) >= 0) {
228 available = svfs.f_bfree * svfs.f_bsize;
230 if (available >= f->metrics.keep_free)
231 available -= f->metrics.keep_free;
235 if (new_size - old_size > available)
240 /* Note that the glibc fallocate() fallback is very
241 inefficient, hence we try to minimize the allocation area
243 r = posix_fallocate(f->fd, old_size, new_size - old_size);
247 if (fstat(f->fd, &f->last_stat) < 0)
250 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
255 static int journal_file_map(
264 uint64_t woffset, wsize;
271 woffset = offset & ~((uint64_t) page_size() - 1ULL);
272 wsize = size + (offset - woffset);
273 wsize = PAGE_ALIGN(wsize);
275 /* Avoid SIGBUS on invalid accesses */
276 if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size))
277 return -EADDRNOTAVAIL;
279 window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
280 if (window == MAP_FAILED)
292 *ret = (uint8_t*) window + (offset - woffset);
297 static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
306 assert(wt < _WINDOW_MAX);
308 if (offset + size > (uint64_t) f->last_stat.st_size) {
309 /* Hmm, out of range? Let's refresh the fstat() data
310 * first, before we trust that check. */
312 if (fstat(f->fd, &f->last_stat) < 0 ||
313 offset + size > (uint64_t) f->last_stat.st_size)
314 return -EADDRNOTAVAIL;
319 if (_likely_(w->ptr &&
320 w->offset <= offset &&
321 w->offset + w->size >= offset + size)) {
323 *ret = (uint8_t*) w->ptr + (offset - w->offset);
328 if (munmap(w->ptr, w->size) < 0)
332 w->size = w->offset = 0;
335 if (size < DEFAULT_WINDOW_SIZE) {
336 /* If the default window size is larger than what was
337 * asked for extend the mapping a bit in the hope to
338 * minimize needed remappings later on. We add half
339 * the window space before and half behind the
340 * requested mapping */
342 delta = (DEFAULT_WINDOW_SIZE - size) / 2;
348 size = DEFAULT_WINDOW_SIZE;
352 if (offset + size > (uint64_t) f->last_stat.st_size)
353 size = (uint64_t) f->last_stat.st_size - offset;
356 return -EADDRNOTAVAIL;
358 r = journal_file_map(f,
360 &w->ptr, &w->offset, &w->size,
366 *ret = (uint8_t*) p + delta;
370 static bool verify_hash(Object *o) {
375 if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
376 h1 = le64toh(o->data.hash);
377 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
378 } else if (o->object.type == OBJECT_FIELD) {
379 h1 = le64toh(o->field.hash);
380 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
387 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
395 assert(type < _OBJECT_TYPE_MAX);
397 r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
402 s = le64toh(o->object.size);
404 if (s < sizeof(ObjectHeader))
407 if (type >= 0 && o->object.type != type)
410 if (s > sizeof(ObjectHeader)) {
411 r = journal_file_move_to(f, o->object.type, offset, s, &t);
425 static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
430 r = le64toh(f->header->seqnum) + 1;
433 /* If an external seqnum counter was passed, we update
434 * both the local and the external one, and set it to
435 * the maximum of both */
443 f->header->seqnum = htole64(r);
445 if (f->header->first_seqnum == 0)
446 f->header->first_seqnum = htole64(r);
451 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
458 assert(size >= sizeof(ObjectHeader));
462 p = le64toh(f->header->tail_object_offset);
464 p = le64toh(f->header->header_size);
466 r = journal_file_move_to_object(f, -1, p, &tail);
470 p += ALIGN64(le64toh(tail->object.size));
473 r = journal_file_allocate(f, p, size);
477 r = journal_file_move_to(f, type, p, size, &t);
484 o->object.type = type;
485 o->object.size = htole64(size);
487 f->header->tail_object_offset = htole64(p);
488 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
496 static int journal_file_setup_data_hash_table(JournalFile *f) {
503 s = DEFAULT_DATA_HASH_TABLE_SIZE;
504 r = journal_file_append_object(f,
505 OBJECT_DATA_HASH_TABLE,
506 offsetof(Object, hash_table.items) + s,
511 memset(o->hash_table.items, 0, s);
513 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
514 f->header->data_hash_table_size = htole64(s);
519 static int journal_file_setup_field_hash_table(JournalFile *f) {
526 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
527 r = journal_file_append_object(f,
528 OBJECT_FIELD_HASH_TABLE,
529 offsetof(Object, hash_table.items) + s,
534 memset(o->hash_table.items, 0, s);
536 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
537 f->header->field_hash_table_size = htole64(s);
542 static int journal_file_map_data_hash_table(JournalFile *f) {
549 p = le64toh(f->header->data_hash_table_offset);
550 s = le64toh(f->header->data_hash_table_size);
552 r = journal_file_move_to(f,
553 WINDOW_DATA_HASH_TABLE,
559 f->data_hash_table = t;
563 static int journal_file_map_field_hash_table(JournalFile *f) {
570 p = le64toh(f->header->field_hash_table_offset);
571 s = le64toh(f->header->field_hash_table_size);
573 r = journal_file_move_to(f,
574 WINDOW_FIELD_HASH_TABLE,
580 f->field_hash_table = t;
584 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
591 assert(o->object.type == OBJECT_DATA);
593 /* This might alter the window we are looking at */
595 o->data.next_hash_offset = o->data.next_field_offset = 0;
596 o->data.entry_offset = o->data.entry_array_offset = 0;
597 o->data.n_entries = 0;
599 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
600 p = le64toh(f->data_hash_table[h].tail_hash_offset);
602 /* Only entry in the hash table is easy */
603 f->data_hash_table[h].head_hash_offset = htole64(offset);
605 /* Move back to the previous data object, to patch in
608 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
612 o->data.next_hash_offset = htole64(offset);
615 f->data_hash_table[h].tail_hash_offset = htole64(offset);
620 int journal_file_find_data_object_with_hash(
622 const void *data, uint64_t size, uint64_t hash,
623 Object **ret, uint64_t *offset) {
625 uint64_t p, osize, h;
629 assert(data || size == 0);
631 osize = offsetof(Object, data.payload) + size;
633 if (f->header->data_hash_table_size == 0)
636 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
637 p = le64toh(f->data_hash_table[h].head_hash_offset);
642 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
646 if (le64toh(o->data.hash) != hash)
649 if (o->object.flags & OBJECT_COMPRESSED) {
653 l = le64toh(o->object.size);
654 if (l <= offsetof(Object, data.payload))
657 l -= offsetof(Object, data.payload);
659 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
663 memcmp(f->compress_buffer, data, size) == 0) {
674 return -EPROTONOSUPPORT;
677 } else if (le64toh(o->object.size) == osize &&
678 memcmp(o->data.payload, data, size) == 0) {
690 p = le64toh(o->data.next_hash_offset);
696 int journal_file_find_data_object(
698 const void *data, uint64_t size,
699 Object **ret, uint64_t *offset) {
704 assert(data || size == 0);
706 hash = hash64(data, size);
708 return journal_file_find_data_object_with_hash(f,
713 static int journal_file_append_data(
715 const void *data, uint64_t size,
716 Object **ret, uint64_t *offset) {
722 bool compressed = false;
725 assert(data || size == 0);
727 hash = hash64(data, size);
729 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
743 osize = offsetof(Object, data.payload) + size;
744 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
748 o->data.hash = htole64(hash);
752 size >= COMPRESSION_SIZE_THRESHOLD) {
755 compressed = compress_blob(data, size, o->data.payload, &rsize);
758 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
759 o->object.flags |= OBJECT_COMPRESSED;
761 f->header->incompatible_flags = htole32(le32toh(f->header->incompatible_flags) | HEADER_INCOMPATIBLE_COMPRESSED);
763 log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
769 memcpy(o->data.payload, data, size);
771 r = journal_file_link_data(f, o, p, hash);
775 /* The linking might have altered the window, so let's
776 * refresh our pointer */
777 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
790 uint64_t journal_file_entry_n_items(Object *o) {
792 assert(o->object.type == OBJECT_ENTRY);
794 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
797 static uint64_t journal_file_entry_array_n_items(Object *o) {
799 assert(o->object.type == OBJECT_ENTRY_ARRAY);
801 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
804 static int link_entry_into_array(JournalFile *f,
809 uint64_t n = 0, ap = 0, q, i, a, hidx;
818 i = hidx = le64toh(*idx);
821 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
825 n = journal_file_entry_array_n_items(o);
827 o->entry_array.items[i] = htole64(p);
828 *idx = htole64(hidx + 1);
834 a = le64toh(o->entry_array.next_entry_array_offset);
845 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
846 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
851 o->entry_array.items[i] = htole64(p);
856 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
860 o->entry_array.next_entry_array_offset = htole64(q);
863 *idx = htole64(hidx + 1);
868 static int link_entry_into_array_plus_one(JournalFile *f,
887 i = htole64(le64toh(*idx) - 1);
888 r = link_entry_into_array(f, first, &i, p);
893 *idx = htole64(le64toh(*idx) + 1);
897 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
904 p = le64toh(o->entry.items[i].object_offset);
908 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
912 return link_entry_into_array_plus_one(f,
913 &o->data.entry_offset,
914 &o->data.entry_array_offset,
919 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
926 assert(o->object.type == OBJECT_ENTRY);
928 __sync_synchronize();
930 /* Link up the entry itself */
931 r = link_entry_into_array(f,
932 &f->header->entry_array_offset,
933 &f->header->n_entries,
938 /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
940 if (f->header->head_entry_realtime == 0)
941 f->header->head_entry_realtime = o->entry.realtime;
943 f->header->tail_entry_realtime = o->entry.realtime;
944 f->header->tail_entry_monotonic = o->entry.monotonic;
946 f->tail_entry_monotonic_valid = true;
948 /* Link up the items */
949 n = journal_file_entry_n_items(o);
950 for (i = 0; i < n; i++) {
951 r = journal_file_link_entry_item(f, o, offset, i);
959 static int journal_file_append_entry_internal(
961 const dual_timestamp *ts,
963 const EntryItem items[], unsigned n_items,
965 Object **ret, uint64_t *offset) {
972 assert(items || n_items == 0);
975 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
977 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
981 o->entry.seqnum = htole64(journal_file_seqnum(f, seqnum));
982 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
983 o->entry.realtime = htole64(ts->realtime);
984 o->entry.monotonic = htole64(ts->monotonic);
985 o->entry.xor_hash = htole64(xor_hash);
986 o->entry.boot_id = f->header->boot_id;
988 r = journal_file_link_entry(f, o, np);
1001 void journal_file_post_change(JournalFile *f) {
1004 /* inotify() does not receive IN_MODIFY events from file
1005 * accesses done via mmap(). After each access we hence
1006 * trigger IN_MODIFY ourselves by truncating the journal file to its
1007 * current size. */
1009 __sync_synchronize();
1011 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1012 log_error("Failed to to truncate file to its own size: %m");
1015 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1019 uint64_t xor_hash = 0;
1020 struct dual_timestamp _ts;
1023 assert(iovec || n_iovec == 0);
1029 dual_timestamp_get(&_ts);
1033 if (f->tail_entry_monotonic_valid &&
1034 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1037 items = alloca(sizeof(EntryItem) * n_iovec);
1039 for (i = 0; i < n_iovec; i++) {
1043 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1047 xor_hash ^= le64toh(o->data.hash);
1048 items[i].object_offset = htole64(p);
1049 items[i].hash = o->data.hash;
1052 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1054 journal_file_post_change(f);
1059 static int generic_array_get(JournalFile *f,
1062 Object **ret, uint64_t *offset) {
1074 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1078 n = journal_file_entry_array_n_items(o);
1080 p = le64toh(o->entry_array.items[i]);
1085 a = le64toh(o->entry_array.next_entry_array_offset);
1088 if (a <= 0 || p <= 0)
1091 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1104 static int generic_array_get_plus_one(JournalFile *f,
1108 Object **ret, uint64_t *offset) {
1117 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1130 return generic_array_get(f, first, i-1, ret, offset);
1139 static int generic_array_bisect(JournalFile *f,
1143 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1144 direction_t direction,
1149 uint64_t a, p, t = 0, i = 0, last_p = 0;
1150 bool subtract_one = false;
1151 Object *o, *array = NULL;
1155 assert(test_object);
1159 uint64_t left, right, k, lp;
1161 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1165 k = journal_file_entry_array_n_items(array);
1171 lp = p = le64toh(array->entry_array.items[i]);
1175 r = test_object(f, p, needle);
1179 if (r == TEST_FOUND)
1180 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1182 if (r == TEST_RIGHT) {
1186 if (left == right) {
1187 if (direction == DIRECTION_UP)
1188 subtract_one = true;
1194 assert(left < right);
1196 i = (left + right) / 2;
1197 p = le64toh(array->entry_array.items[i]);
1201 r = test_object(f, p, needle);
1205 if (r == TEST_FOUND)
1206 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1208 if (r == TEST_RIGHT)
1216 if (direction == DIRECTION_UP) {
1218 subtract_one = true;
1229 a = le64toh(array->entry_array.next_entry_array_offset);
1235 if (subtract_one && t == 0 && i == 0)
1238 if (subtract_one && i == 0)
1240 else if (subtract_one)
1241 p = le64toh(array->entry_array.items[i-1]);
1243 p = le64toh(array->entry_array.items[i]);
1245 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1256 *idx = t + i + (subtract_one ? -1 : 0);
1261 static int generic_array_bisect_plus_one(JournalFile *f,
1266 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1267 direction_t direction,
1273 bool step_back = false;
1277 assert(test_object);
1282 /* This bisects the array in object 'first', but first checks
1284 r = test_object(f, extra, needle);
1288 if (r == TEST_FOUND)
1289 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1291 /* if we are looking with DIRECTION_UP then we need to first
1292 see if in the actual array there is a matching entry, and
1293 return the last one of that. But if there isn't any we need
1294 to return this one. Hence remember this, and return it
1297 step_back = direction == DIRECTION_UP;
1299 if (r == TEST_RIGHT) {
1300 if (direction == DIRECTION_DOWN)
1306 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1308 if (r == 0 && step_back)
1317 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1333 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1339 else if (p < needle)
1345 int journal_file_move_to_entry_by_offset(
1348 direction_t direction,
1352 return generic_array_bisect(f,
1353 le64toh(f->header->entry_array_offset),
1354 le64toh(f->header->n_entries),
1362 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1369 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1373 if (le64toh(o->entry.seqnum) == needle)
1375 else if (le64toh(o->entry.seqnum) < needle)
1381 int journal_file_move_to_entry_by_seqnum(
1384 direction_t direction,
1388 return generic_array_bisect(f,
1389 le64toh(f->header->entry_array_offset),
1390 le64toh(f->header->n_entries),
1397 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1404 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1408 if (le64toh(o->entry.realtime) == needle)
1410 else if (le64toh(o->entry.realtime) < needle)
1416 int journal_file_move_to_entry_by_realtime(
1419 direction_t direction,
1423 return generic_array_bisect(f,
1424 le64toh(f->header->entry_array_offset),
1425 le64toh(f->header->n_entries),
1427 test_object_realtime,
1432 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1439 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1443 if (le64toh(o->entry.monotonic) == needle)
1445 else if (le64toh(o->entry.monotonic) < needle)
1451 int journal_file_move_to_entry_by_monotonic(
1455 direction_t direction,
1459 char t[9+32+1] = "_BOOT_ID=";
1465 sd_id128_to_string(boot_id, t + 9);
1466 r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1472 return generic_array_bisect_plus_one(f,
1473 le64toh(o->data.entry_offset),
1474 le64toh(o->data.entry_array_offset),
1475 le64toh(o->data.n_entries),
1477 test_object_monotonic,
1482 int journal_file_next_entry(
1484 Object *o, uint64_t p,
1485 direction_t direction,
1486 Object **ret, uint64_t *offset) {
1492 assert(p > 0 || !o);
1494 n = le64toh(f->header->n_entries);
1499 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1501 if (o->object.type != OBJECT_ENTRY)
1504 r = generic_array_bisect(f,
1505 le64toh(f->header->entry_array_offset),
1506 le64toh(f->header->n_entries),
1515 if (direction == DIRECTION_DOWN) {
1528 /* And jump to it */
1529 return generic_array_get(f,
1530 le64toh(f->header->entry_array_offset),
1535 int journal_file_skip_entry(
1537 Object *o, uint64_t p,
1539 Object **ret, uint64_t *offset) {
1548 if (o->object.type != OBJECT_ENTRY)
1551 r = generic_array_bisect(f,
1552 le64toh(f->header->entry_array_offset),
1553 le64toh(f->header->n_entries),
1562 /* Calculate new index */
1564 if ((uint64_t) -skip >= i)
1567 i = i - (uint64_t) -skip;
1569 i += (uint64_t) skip;
1571 n = le64toh(f->header->n_entries);
1578 return generic_array_get(f,
1579 le64toh(f->header->entry_array_offset),
1584 int journal_file_next_entry_for_data(
1586 Object *o, uint64_t p,
1587 uint64_t data_offset,
1588 direction_t direction,
1589 Object **ret, uint64_t *offset) {
1596 assert(p > 0 || !o);
1598 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1602 n = le64toh(d->data.n_entries);
1607 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1609 if (o->object.type != OBJECT_ENTRY)
1612 r = generic_array_bisect_plus_one(f,
1613 le64toh(d->data.entry_offset),
1614 le64toh(d->data.entry_array_offset),
1615 le64toh(d->data.n_entries),
1625 if (direction == DIRECTION_DOWN) {
1639 return generic_array_get_plus_one(f,
1640 le64toh(d->data.entry_offset),
1641 le64toh(d->data.entry_array_offset),
1646 int journal_file_move_to_entry_by_offset_for_data(
1648 uint64_t data_offset,
1650 direction_t direction,
1651 Object **ret, uint64_t *offset) {
1658 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1662 return generic_array_bisect_plus_one(f,
1663 le64toh(d->data.entry_offset),
1664 le64toh(d->data.entry_array_offset),
1665 le64toh(d->data.n_entries),
1672 int journal_file_move_to_entry_by_monotonic_for_data(
1674 uint64_t data_offset,
1677 direction_t direction,
1678 Object **ret, uint64_t *offset) {
1680 char t[9+32+1] = "_BOOT_ID=";
1687 /* First, seek by time */
1688 sd_id128_to_string(boot_id, t + 9);
1689 r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
1695 r = generic_array_bisect_plus_one(f,
1696 le64toh(o->data.entry_offset),
1697 le64toh(o->data.entry_array_offset),
1698 le64toh(o->data.n_entries),
1700 test_object_monotonic,
1706 /* And now, continue seeking until we find an entry that
1707 * exists in both bisection arrays */
1713 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1717 r = generic_array_bisect_plus_one(f,
1718 le64toh(d->data.entry_offset),
1719 le64toh(d->data.entry_array_offset),
1720 le64toh(d->data.n_entries),
1728 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
1732 r = generic_array_bisect_plus_one(f,
1733 le64toh(o->data.entry_offset),
1734 le64toh(o->data.entry_array_offset),
1735 le64toh(o->data.n_entries),
1759 int journal_file_move_to_entry_by_seqnum_for_data(
1761 uint64_t data_offset,
1763 direction_t direction,
1764 Object **ret, uint64_t *offset) {
1771 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1775 return generic_array_bisect_plus_one(f,
1776 le64toh(d->data.entry_offset),
1777 le64toh(d->data.entry_array_offset),
1778 le64toh(d->data.n_entries),
1785 int journal_file_move_to_entry_by_realtime_for_data(
1787 uint64_t data_offset,
1789 direction_t direction,
1790 Object **ret, uint64_t *offset) {
1797 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1801 return generic_array_bisect_plus_one(f,
1802 le64toh(d->data.entry_offset),
1803 le64toh(d->data.entry_array_offset),
1804 le64toh(d->data.n_entries),
1806 test_object_realtime,
1811 void journal_file_dump(JournalFile *f) {
1812 char a[33], b[33], c[33];
1819 printf("File Path: %s\n"
1823 "Arena size: %llu\n"
1827 sd_id128_to_string(f->header->file_id, a),
1828 sd_id128_to_string(f->header->machine_id, b),
1829 sd_id128_to_string(f->header->boot_id, c),
1830 (unsigned long long) le64toh(f->header->arena_size),
1831 (unsigned long) le64toh(f->header->n_objects),
1832 (unsigned long) le64toh(f->header->n_entries));
1834 p = le64toh(f->header->header_size);
1836 r = journal_file_move_to_object(f, -1, p, &o);
1840 switch (o->object.type) {
1843 printf("Type: OBJECT_UNUSED\n");
1847 printf("Type: OBJECT_DATA\n");
1851 printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1852 (unsigned long long) le64toh(o->entry.seqnum),
1853 (unsigned long long) le64toh(o->entry.monotonic),
1854 (unsigned long long) le64toh(o->entry.realtime));
1857 case OBJECT_FIELD_HASH_TABLE:
1858 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1861 case OBJECT_DATA_HASH_TABLE:
1862 printf("Type: OBJECT_DATA_HASH_TABLE\n");
1865 case OBJECT_ENTRY_ARRAY:
1866 printf("Type: OBJECT_ENTRY_ARRAY\n");
1869 case OBJECT_SIGNATURE:
1870 printf("Type: OBJECT_SIGNATURE\n");
1874 if (o->object.flags & OBJECT_COMPRESSED)
1875 printf("Flags: COMPRESSED\n");
1877 if (p == le64toh(f->header->tail_object_offset))
1880 p = p + ALIGN64(le64toh(o->object.size));
1885 log_error("File corrupt");
1888 int journal_file_open(
1892 JournalFile *template,
1893 JournalFile **ret) {
1897 bool newly_created = false;
1901 if ((flags & O_ACCMODE) != O_RDONLY &&
1902 (flags & O_ACCMODE) != O_RDWR)
1905 if (!endswith(fname, ".journal"))
1908 f = new0(JournalFile, 1);
1915 f->writable = (flags & O_ACCMODE) != O_RDONLY;
1916 f->prot = prot_from_flags(flags);
1919 f->metrics = template->metrics;
1920 f->compress = template->compress;
1923 f->path = strdup(fname);
1929 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
1935 if (fstat(f->fd, &f->last_stat) < 0) {
1940 if (f->last_stat.st_size == 0 && f->writable) {
1941 newly_created = true;
1943 r = journal_file_init_header(f, template);
1947 if (fstat(f->fd, &f->last_stat) < 0) {
1953 if (f->last_stat.st_size < (off_t) sizeof(Header)) {
1958 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
1959 if (f->header == MAP_FAILED) {
1965 if (!newly_created) {
1966 r = journal_file_verify_header(f);
1972 r = journal_file_refresh_header(f);
1977 if (newly_created) {
1979 r = journal_file_setup_field_hash_table(f);
1983 r = journal_file_setup_data_hash_table(f);
1988 r = journal_file_map_field_hash_table(f);
1992 r = journal_file_map_data_hash_table(f);
2002 journal_file_close(f);
2007 int journal_file_rotate(JournalFile **f) {
2010 JournalFile *old_file, *new_file = NULL;
2018 if (!old_file->writable)
2021 if (!endswith(old_file->path, ".journal"))
2024 l = strlen(old_file->path);
2026 p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
2030 memcpy(p, old_file->path, l - 8);
2032 sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2033 snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2034 "-%016llx-%016llx.journal",
2035 (unsigned long long) le64toh((*f)->header->seqnum),
2036 (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
2038 r = rename(old_file->path, p);
2044 old_file->header->state = STATE_ARCHIVED;
2046 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, old_file, &new_file);
2047 journal_file_close(old_file);
2053 int journal_file_open_reliably(
2057 JournalFile *template,
2058 JournalFile **ret) {
2064 r = journal_file_open(fname, flags, mode, template, ret);
2065 if (r != -EBADMSG && /* corrupted */
2066 r != -ENODATA && /* truncated */
2067 r != -EHOSTDOWN && /* other machine */
2068 r != -EPROTONOSUPPORT) /* incompatible feature */
2071 if ((flags & O_ACCMODE) == O_RDONLY)
2074 if (!(flags & O_CREAT))
2077 /* The file is corrupted. Rotate it away and try it again (but only once) */
2080 if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2082 (unsigned long long) now(CLOCK_REALTIME),
2086 r = rename(fname, p);
2091 log_warning("File %s corrupted, renaming and replacing.", fname);
2093 return journal_file_open(fname, flags, mode, template, ret);
2096 struct vacuum_info {
2101 sd_id128_t seqnum_id;
2107 static int vacuum_compare(const void *_a, const void *_b) {
2108 const struct vacuum_info *a, *b;
2113 if (a->have_seqnum && b->have_seqnum &&
2114 sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
2115 if (a->seqnum < b->seqnum)
2117 else if (a->seqnum > b->seqnum)
2123 if (a->realtime < b->realtime)
2125 else if (a->realtime > b->realtime)
2127 else if (a->have_seqnum && b->have_seqnum)
2128 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
2130 return strcmp(a->filename, b->filename);
/* Collects the archived (".journal") and corrupted (".journal~") files
 * found in directory, sorts them with vacuum_compare() and then walks the
 * sorted list calling unlinkat() on the entries. Before each deletion the
 * summed usage is checked against max_use and the available space of the
 * file system (f_bavail * f_bsize) against min_free.
 *
 * directory: path handed to opendir(); also used in the log messages
 * max_use:   limit for the summed usage of the collected files, in bytes
 * min_free:  limit for the available file system space, in bytes
 *
 * NOTE(review): the error paths, the statements that skip a directory
 * entry or leave the loops, the cleanup code and the return value are on
 * lines not visible in this excerpt; presumably the deletion loop stops
 * once both limits are satisfied. */
2133 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
2136 struct vacuum_info *list = NULL;
2137 unsigned n_list = 0, n_allocated = 0, i;
2145 d = opendir(directory);
2151 struct dirent buf, *de;
2155 unsigned long long seqnum = 0, realtime;
2156 sd_id128_t seqnum_id;
2159 k = readdir_r(d, &buf, &de);
/* Inspect the entry itself rather than the target of a symlink. */
2168 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
/* Presumably anything that is not a regular file is skipped. */
2171 if (!S_ISREG(st.st_mode))
2174 q = strlen(de->d_name);
2176 if (endswith(de->d_name, ".journal")) {
2178 /* Vacuum archived files */
/* Minimum length of the expected name tail:
 * '@' + 32 characters + '-' + 16 characters + '-' + 16 characters + ".journal" */
2180 if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
/* Verify the separators, counting backwards from the end of the name. */
2183 if (de->d_name[q-8-16-1] != '-' ||
2184 de->d_name[q-8-16-1-16-1] != '-' ||
2185 de->d_name[q-8-16-1-16-1-32-1] != '@')
/* Keep an unmodified copy of the name; de->d_name is cut apart below. */
2188 p = strdup(de->d_name);
/* Overwrite the '-' that follows the 32-character id with NUL so that the
 * id can be parsed as a string of its own. */
2194 de->d_name[q-8-16-1-16-1] = 0;
2195 if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
/* The two 16-digit hex fields after the id: sequence number, then realtime. */
2200 if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
2207 } else if (endswith(de->d_name, ".journal~")) {
2208 unsigned long long tmp;
2210 /* Vacuum corrupted files */
/* Minimum length of the expected name tail:
 * '@' + 16 characters + '-' + 16 characters + ".journal" + '~' */
2212 if (q < 1 + 16 + 1 + 16 + 8 + 1)
2215 if (de->d_name[q-1-8-16-1] != '-' ||
2216 de->d_name[q-1-8-16-1-16-1] != '@')
2219 p = strdup(de->d_name);
/* The first hex field is taken as the realtime value; the second one is
 * only parsed into tmp. */
2225 if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
/* These names carry no sequence number. */
2230 have_seqnum = false;
/* Grow the array by doubling, starting with 8 elements. The realloc()
 * result goes into the temporary j first, so list is not overwritten
 * directly by the call. */
2234 if (n_list >= n_allocated) {
2235 struct vacuum_info *j;
2237 n_allocated = MAX(n_allocated * 2U, 8U);
2238 j = realloc(list, n_allocated * sizeof(struct vacuum_info));
/* Record the file; the list entry takes over the strdup()ed name p. */
2248 list[n_list].filename = p;
/* Usage is computed as 512 bytes per st_blocks unit. */
2249 list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
2250 list[n_list].seqnum = seqnum;
2251 list[n_list].realtime = realtime;
2252 list[n_list].seqnum_id = seqnum_id;
2253 list[n_list].have_seqnum = have_seqnum;
/* Running total of the usage of all collected files. */
2255 sum += list[n_list].usage;
2260 qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
/* Delete in sorted order. */
2262 for(i = 0; i < n_list; i++) {
/* The file system statistics are queried anew on every iteration. */
2265 if (fstatvfs(dirfd(d), &ss) < 0) {
2270 if (sum <= max_use &&
2271 (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
/* A successful deletion reduces the running total; a failure is logged
 * unless the file was already gone (ENOENT). */
2274 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
2275 log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
2276 sum -= list[i].usage;
2277 } else if (errno != ENOENT)
2278 log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
/* Release the file name copies held by the list entries. */
2282 for (i = 0; i < n_list; i++)
2283 free(list[i].filename);
/* Copies the entry object o, located at offset p in the journal file
 * from, into the journal file to. Each data object referenced by the
 * entry is looked up in from, appended to to with
 * journal_file_append_data(), and the resulting offsets and hashes are
 * collected into a new item array that is finally handed to
 * journal_file_append_entry_internal() together with the entry's
 * timestamps and the XOR of the data hashes.
 *
 * seqnum, ret and offset are passed through unchanged to
 * journal_file_append_entry_internal().
 *
 * NOTE(review): the error returns after the individual checks and the
 * preprocessor conditionals around the decompression code are on lines
 * not visible in this excerpt. */
2293 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2295 uint64_t q, xor_hash = 0;
/* Timestamps are stored little-endian in the entry object. */
2308 ts.monotonic = le64toh(o->entry.monotonic);
2309 ts.realtime = le64toh(o->entry.realtime);
/* Compare the entry's timestamps against the tail timestamps recorded in
 * the destination header; presumably an older entry is rejected. */
2311 if (to->tail_entry_monotonic_valid &&
2312 ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2315 if (ts.realtime < le64toh(to->header->tail_entry_realtime))
/* The new item array lives on the stack (alloca), sized by the number of
 * items in the source entry. */
2318 n = journal_file_entry_n_items(o);
2319 items = alloca(sizeof(EntryItem) * n);
2321 for (i = 0; i < n; i++) {
/* Read offset and hash of item i before o is redirected to the data
 * object below. */
2328 q = le64toh(o->entry.items[i].object_offset);
2329 le_hash = o->entry.items[i].hash;
2331 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
/* The hash stored in the entry item must match the one in the data object. */
2335 if (le_hash != o->data.hash)
/* Payload length: object size minus the header that precedes the payload. */
2338 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2341 /* We hit the limit on 32bit machines */
2342 if ((uint64_t) t != l)
2345 if (o->object.flags & OBJECT_COMPRESSED) {
/* Decompress into the source file's scratch buffer and copy from there. */
2349 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2352 data = from->compress_buffer;
/* NOTE(review): presumably the branch taken when decompression support is
 * not compiled in. */
2355 return -EPROTONOSUPPORT;
2358 data = o->data.payload;
/* Append the payload to the destination; u and h receive the resulting
 * data object and its offset. */
2360 r = journal_file_append_data(to, data, l, &u, &h);
2364 xor_hash ^= le64toh(u->data.hash);
2365 items[i].object_offset = htole64(h);
2366 items[i].hash = u->data.hash;
/* Point o back at the source entry, since it was redirected to a data
 * object above and the loop reads o->entry.items[] (presumably this is
 * still inside the loop). */
2368 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2373 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
/* Fills in and sanitizes the size limits in m. A member equal to
 * (uint64_t) -1 is treated as unset and gets a default, derived from the
 * size of the file system that fd refers to where that size could be
 * determined. The resulting values are logged.
 *
 * m:  metrics to fix up in place (max_use, max_size, min_size, keep_free)
 * fd: file descriptor handed to fstatvfs() to query the file system size
 *
 * NOTE(review): the else keywords and closing braces are on lines not
 * visible in this excerpt, so the exact nesting of the clamping steps
 * below is presumed from their order. */
2376 void journal_default_metrics(JournalMetrics *m, int fd) {
2377 uint64_t fs_size = 0;
2379 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
/* fs_size stays 0 if the file system cannot be queried. */
2384 if (fstatvfs(fd, &ss) >= 0)
2385 fs_size = ss.f_frsize * ss.f_blocks;
/* max_use unset: take a tenth of the file system size, clamped to
 * DEFAULT_MAX_USE_LOWER..DEFAULT_MAX_USE_UPPER. */
2387 if (m->max_use == (uint64_t) -1) {
2390 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2392 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2393 m->max_use = DEFAULT_MAX_USE_UPPER;
2395 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2396 m->max_use = DEFAULT_MAX_USE_LOWER;
/* Presumably the fallback when the file system size is not known. */
2398 m->max_use = DEFAULT_MAX_USE_LOWER;
/* Presumably the path for an explicitly configured max_use: page-align it
 * and make sure it has room for two minimum-size files. */
2400 m->max_use = PAGE_ALIGN(m->max_use);
2402 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2403 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
/* max_size unset: an eighth of max_use, capped at DEFAULT_MAX_SIZE_UPPER. */
2406 if (m->max_size == (uint64_t) -1) {
2407 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2409 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2410 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2412 m->max_size = PAGE_ALIGN(m->max_size);
2414 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2415 m->max_size = JOURNAL_FILE_SIZE_MIN;
/* Raise max_use so that it covers at least two files of max_size. */
2417 if (m->max_size*2 > m->max_use)
2418 m->max_use = m->max_size*2;
/* min_size unset: use the minimum journal file size. */
2420 if (m->min_size == (uint64_t) -1)
2421 m->min_size = JOURNAL_FILE_SIZE_MIN;
2423 m->min_size = PAGE_ALIGN(m->min_size);
2425 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2426 m->min_size = JOURNAL_FILE_SIZE_MIN;
/* Raise max_size to min_size if min_size is the larger of the two. */
2428 if (m->min_size > m->max_size)
2429 m->max_size = m->min_size;
/* keep_free unset: a twentieth of the file system size, capped at
 * DEFAULT_KEEP_FREE_UPPER. */
2432 if (m->keep_free == (uint64_t) -1) {
2435 m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2437 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2438 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
/* Presumably the fallback when the file system size is not known. */
2441 m->keep_free = DEFAULT_KEEP_FREE;
/* Each value is formatted into its own buffer (a..d). */
2444 log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2445 format_bytes(a, sizeof(a), m->max_use),
2446 format_bytes(b, sizeof(b), m->max_size),
2447 format_bytes(c, sizeof(c), m->min_size),
2448 format_bytes(d, sizeof(d), m->keep_free));
/* Reports the realtime timestamps of the entries found at both ends of
 * journal file f: *from receives the realtime value of the entry returned
 * by journal_file_next_entry() for DIRECTION_DOWN, *to the one returned
 * for DIRECTION_UP, both searches starting without a reference entry.
 *
 * NOTE(review): the checks of r, any NULL checks on from/to and the
 * return value are on lines not visible in this excerpt. */
2451 int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
2459 r = journal_file_next_entry(f, NULL, 0, DIRECTION_DOWN, &o, NULL);
/* Timestamps are stored little-endian in the entry object. */
2463 *from = le64toh(o->entry.realtime);
2467 r = journal_file_next_entry(f, NULL, 0, DIRECTION_UP, &o, NULL);
2471 *to = le64toh(o->entry.realtime);
/* Reports the monotonic timestamps of two entries of journal file f that
 * reference the data object "_BOOT_ID=<boot_id>": *from is read from the
 * entry at the data object's entry_offset, *to from the entry that
 * generic_array_get_plus_one() returns for index n_entries-1.
 *
 * NOTE(review): the checks of r, the handling of a data object without
 * entries, any NULL checks on from/to and the return value are on lines
 * not visible in this excerpt. */
2477 int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
/* Room for the 9-character prefix, 32 further characters for the
 * formatted id and the terminating NUL. */
2478 char t[9+32+1] = "_BOOT_ID=";
/* Append the formatted boot id directly after the prefix. */
2486 sd_id128_to_string(boot_id, t + 9);
/* Look up the data object for this field; p receives its offset. */
2488 r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
2492 if (le64toh(o->data.n_entries) <= 0)
/* o is redirected from the data object to the entry at its entry_offset. */
2496 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2500 *from = le64toh(o->entry.monotonic);
/* Point o back at the data object, whose fields are read again below. */
2504 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2508 r = generic_array_get_plus_one(f,
2509 le64toh(o->data.entry_offset),
2510 le64toh(o->data.entry_array_offset),
2511 le64toh(o->data.n_entries)-1,
2516 *to = le64toh(o->entry.monotonic);