1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2011 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
26 #include <sys/statvfs.h>
30 #include "sd-journal.h"
31 #include "journal-def.h"
32 #include "journal-private.h"
/* Default arena limits that journal_file_init_header() stores in a new
 * header: 16 GiB maximum size, 256 KiB minimum size, and 1 MiB that
 * journal_file_allocate() subtracts from the free space it may use. */
37 #define DEFAULT_ARENA_MAX_SIZE (16ULL*1024ULL*1024ULL*1024ULL)
38 #define DEFAULT_ARENA_MIN_SIZE (256ULL*1024ULL)
39 #define DEFAULT_ARENA_KEEP_FREE (1ULL*1024ULL*1024ULL)
/* Payload size in bytes of the hash table object created for new files. */
41 #define DEFAULT_HASH_TABLE_SIZE (2047ULL*16ULL)
/* Payload size in bytes of the bisect table object created for new files:
 * 8 bytes for every 64 KiB of DEFAULT_ARENA_MAX_SIZE. */
42 #define DEFAULT_BISECT_TABLE_SIZE ((DEFAULT_ARENA_MAX_SIZE/(64ULL*1024ULL))*8ULL)
/* 128 MiB. journal_file_move_to() widens any smaller request using this value. */
44 #define DEFAULT_WINDOW_SIZE (128ULL*1024ULL*1024ULL)
/* The 8 bytes at the start of every journal file. The array holds exactly
 * these 8 characters and no terminating NUL, so it must only be used with
 * length-bounded functions such as memcpy()/memcmp(). */
50 static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };
/* Round x up to the next multiple of 8. */
52 #define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
/* Tear down a JournalFile. The visible calls close the file descriptor and
 * unmap the header page(s), the hash table window, the bisect table window
 * and the data window. The two table windows are unmapped only when set.
 * NOTE(review): the lines between these calls are not visible here, so any
 * further guards or frees are not documented. */
54 void journal_file_close(JournalFile *f) {
58 close_nointr_nofail(f->fd);
61 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
63 if (f->hash_table_window)
64 munmap(f->hash_table_window, f->hash_table_window_size);
66 if (f->bisect_table_window)
67 munmap(f->bisect_table_window, f->bisect_table_window_size);
70 munmap(f->window, f->window_size);
/* Write a brand-new file header. The visible code fills a local Header `h`:
 * it copies in the 8-byte signature, sets arena_offset to sizeof(h) rounded
 * up to a multiple of 8, stores the three DEFAULT_ARENA_* limits, and asks
 * sd_id128_randomize() for a file ID. `h` is then written at file offset 0
 * with pwrite(). The 64-bit fields are stored through htole64().
 * NOTE(review): how `h` is cleared beforehand and how the results of
 * sd_id128_randomize() and pwrite() are checked is not visible here. */
76 static int journal_file_init_header(JournalFile *f) {
84 memcpy(h.signature, signature, 8);
85 h.arena_offset = htole64(ALIGN64(sizeof(h)));
86 h.arena_max_size = htole64(DEFAULT_ARENA_MAX_SIZE);
87 h.arena_min_size = htole64(DEFAULT_ARENA_MIN_SIZE);
88 h.arena_keep_free = htole64(DEFAULT_ARENA_KEEP_FREE);
90 r = sd_id128_randomize(&h.file_id);
94 k = pwrite(f->fd, &h, sizeof(h), 0);
/* Update the mapped header for the current session: the machine ID and the
 * boot ID are fetched directly into the header, and the state field is set
 * to STATE_ONLINE (stored through htole32()).
 * NOTE(review): the handling of the two return values is not visible here. */
104 static int journal_file_refresh_header(JournalFile *f) {
109 r = sd_id128_get_machine(&f->header->machine_id);
113 r = sd_id128_get_boot(&f->header->boot_id);
117 f->header->state = htole32(STATE_ONLINE);
/* Validate the mapped header of an existing file. Visible checks:
 *  - the first 8 bytes are compared with `signature`;
 *  - any incompatible flag yields -EPROTONOSUPPORT;
 *  - the file size in last_stat is compared against
 *    arena_offset + arena_size from the header;
 *  - the local machine ID is compared with the one in the header;
 *  - the state field is read; STATE_ONLINE and unrecognized values are
 *    only reported with log_debug().
 * NOTE(review): the return values of the other failing checks, the
 * condition under which the machine ID check runs, and the body of the
 * STATE_ARCHIVED branch are not visible here. */
121 static int journal_file_verify_header(JournalFile *f) {
124 if (memcmp(f->header, signature, 8))
127 if (f->header->incompatible_flags != 0)
128 return -EPROTONOSUPPORT;
130 if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->arena_offset) + le64toh(f->header->arena_size)))
135 sd_id128_t machine_id;
138 r = sd_id128_get_machine(&machine_id);
142 if (!sd_id128_equal(machine_id, f->header->machine_id))
145 state = le32toh(f->header->state);
147 if (state == STATE_ONLINE)
148 log_debug("Journal file %s is already online. Assuming unclean closing. Ignoring.", f->path);
149 else if (state == STATE_ARCHIVED)
151 else if (state != STATE_OFFLINE)
152 log_debug("Journal file %s has unknown state %u. Ignoring.", f->path, state);
/* Make sure the file has allocated space for the range that starts at
 * `offset` and is `size` bytes long.
 *
 * Visible steps: an offset below the arena start is rejected; the wanted
 * file size is offset + size rounded up with PAGE_ALIGN(); it is compared
 * with the current extent (arena_offset + arena_size from the header) and
 * nothing needs doing when that already covers it. The resulting arena size
 * `asize` is then checked: above arena_min_size, the free space reported by
 * fstatvfs() (f_bfree * f_bsize, reduced by arena_keep_free) bounds the
 * growth, and arena_max_size is enforced. Finally posix_fallocate() reserves
 * the space, last_stat is refreshed with fstat(), and arena_size in the
 * header is updated.
 *
 * NOTE(review): arena_keep_free is read from the header without le64toh(),
 * unlike the other header fields used here. Confirm on big-endian hosts.
 * NOTE(review): POSIX specifies that posix_fallocate() returns 0 or a
 * positive error number, so the `< 0` test looks like it can never be true.
 * Confirm and compare against `!= 0` if so.
 * NOTE(review): offset + size is not checked for 64-bit wrap-around.
 * The return values of the individual failure paths are not visible here. */
158 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
160 uint64_t old_size, new_size;
164 if (offset < le64toh(f->header->arena_offset))
167 new_size = PAGE_ALIGN(offset + size);
169 /* We assume that this file is not sparse, and we know that
170 * for sure, since we always call posix_fallocate()
174 le64toh(f->header->arena_offset) +
175 le64toh(f->header->arena_size);
177 if (old_size >= new_size)
180 asize = new_size - le64toh(f->header->arena_offset);
182 if (asize > le64toh(f->header->arena_min_size)) {
185 if (fstatvfs(f->fd, &svfs) >= 0) {
188 available = svfs.f_bfree * svfs.f_bsize;
190 if (available >= f->header->arena_keep_free)
191 available -= f->header->arena_keep_free;
195 if (new_size - old_size > available)
200 if (asize > le64toh(f->header->arena_max_size))
203 if (posix_fallocate(f->fd, 0, new_size) < 0)
206 if (fstat(f->fd, &f->last_stat) < 0)
209 f->header->arena_size = htole64(asize);
// Map a byte range of the journal file into memory.
//
// The start of the mapping is the requested offset rounded down to a page
// boundary (woffset). The length is the requested size plus the bytes
// skipped by that rounding, rounded up with PAGE_ALIGN(). The range is
// mapped MAP_SHARED with the protection stored in f->prot. The pointer
// handed back through `ret` is advanced by (offset - woffset), so it refers
// to the requested offset and not to the start of the mapping.
//
// NOTE(review): the parameter list, the handling of MAP_FAILED and the
// other output assignments are not visible here.
214 static int journal_file_map(
223 uint64_t woffset, wsize;
230 woffset = offset & ~((uint64_t) page_size() - 1ULL);
231 wsize = size + (offset - woffset);
232 wsize = PAGE_ALIGN(wsize);
234 window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
235 if (window == MAP_FAILED)
247 *ret = (uint8_t*) window + (offset - woffset);
// Make `size` bytes of the file starting at `offset` accessible and return
// a pointer to them in *ret.
//
// Fast path: when the current window f->window already covers the range,
// the pointer is computed inside it. Otherwise the visible code unmaps the
// old window, clears window_size and window_offset, and creates a new
// window with journal_file_map(). A request smaller than
// DEFAULT_WINDOW_SIZE is widened first. `delta` is the number of bytes
// added in front of the request, so the result is the mapped pointer
// plus delta.
//
// A pointer obtained from an earlier call may be invalid after this call,
// because the previous window can be unmapped.
//
// NOTE(review): the adjustment of `offset` by `delta`, the guard around
// munmap() and the error returns are not visible here.
252 static int journal_file_move_to(JournalFile *f, uint64_t offset, uint64_t size, void **ret) {
260 if (_likely_(f->window &&
261 f->window_offset <= offset &&
262 f->window_offset+f->window_size >= offset + size)) {
264 *ret = (uint8_t*) f->window + (offset - f->window_offset);
269 if (munmap(f->window, f->window_size) < 0)
273 f->window_size = f->window_offset = 0;
276 if (size < DEFAULT_WINDOW_SIZE) {
277 /* If the default window size is larger than what was
278 * asked for extend the mapping a bit in the hope to
279 * minimize needed remappings later on. We add half
280 * the window space before and half behind the
281 * requested mapping */
283 delta = PAGE_ALIGN((DEFAULT_WINDOW_SIZE - size) / 2);
289 size += (DEFAULT_WINDOW_SIZE - delta);
293 r = journal_file_map(f,
295 &f->window, &f->window_offset, &f->window_size,
301 *ret = (uint8_t*) p + delta;
/* Check the stored hash of an object. For an OBJECT_DATA object the stored
 * hash (h1) is read and hash64() is recomputed over the payload (h2). The
 * payload length is the object size minus offsetof(Object, data.payload).
 * NOTE(review): the comparison of h1 and h2 and the result for other object
 * types are not visible here. */
305 static bool verify_hash(Object *o) {
310 t = le64toh(o->object.type);
311 if (t == OBJECT_DATA) {
314 s = le64toh(o->object.size);
316 h1 = le64toh(o->data.hash);
317 h2 = hash64(o->data.payload, s - offsetof(Object, data.payload));
/* Map the object that starts at file offset `offset`. Only the ObjectHeader
 * is mapped at first, to read the object's size `s`. A size smaller than
 * sizeof(ObjectHeader) is rejected. When the object is larger than its
 * header, the full `s` bytes are mapped with a second
 * journal_file_move_to() call.
 * NOTE(review): the error codes and the assignment to *ret are not visible
 * here. */
325 int journal_file_move_to_object(JournalFile *f, uint64_t offset, Object **ret) {
334 r = journal_file_move_to(f, offset, sizeof(ObjectHeader), &t);
339 s = le64toh(o->object.size);
341 if (s < sizeof(ObjectHeader))
344 if (s > sizeof(ObjectHeader)) {
345 r = journal_file_move_to(f, offset, s, &t);
/* Advance the sequence number counter in the header: the stored value is
 * converted from little-endian, incremented by one and written back.
 * Presumably the incremented value is returned. The return statement is
 * not visible here. */
359 static uint64_t journal_file_seqnum(JournalFile *f) {
364 r = le64toh(f->header->seqnum) + 1;
365 f->header->seqnum = htole64(r);
/* Append a new object of `size` bytes (at least sizeof(ObjectHeader)) to
 * the end of the arena.
 *
 * The position `p` is derived from tail_object_offset in the header. The
 * visible code either uses arena_offset or maps the tail object and steps
 * past it by its size rounded up to 8 bytes. Space is then ensured with
 * journal_file_allocate() and the range mapped with journal_file_move_to().
 * The new object's header is initialized with type OBJECT_UNUSED, a zeroed
 * reserved field and the given size. The file header is updated:
 * tail_object_offset becomes `p`, head_object_offset is set when it was 0,
 * and n_objects is incremented.
 *
 * NOTE(review): the branch condition that selects arena_offset, the error
 * returns and the assignments to *ret and *offset are not visible here. */
370 static int journal_file_append_object(JournalFile *f, uint64_t size, Object **ret, uint64_t *offset) {
377 assert(size >= sizeof(ObjectHeader));
381 p = le64toh(f->header->tail_object_offset);
384 p = le64toh(f->header->arena_offset);
386 r = journal_file_move_to_object(f, p, &tail);
390 p += ALIGN64(le64toh(tail->object.size));
393 r = journal_file_allocate(f, p, size);
397 r = journal_file_move_to(f, p, size, &t);
404 o->object.type = htole64(OBJECT_UNUSED);
405 zero(o->object.reserved);
406 o->object.size = htole64(size);
408 f->header->tail_object_offset = htole64(p);
409 if (f->header->head_object_offset == 0)
410 f->header->head_object_offset = htole64(p);
412 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
/* Create the hash table object in a new file. An object large enough for
 * DEFAULT_HASH_TABLE_SIZE bytes of table payload is appended, typed as
 * OBJECT_HASH_TABLE, and its table is zero-filled. The header then records
 * the file offset of the table payload (object offset plus
 * offsetof(Object, hash_table.table)) and the payload size in bytes.
 * NOTE(review): error handling after the append is not visible here. */
420 static int journal_file_setup_hash_table(JournalFile *f) {
427 s = DEFAULT_HASH_TABLE_SIZE;
428 r = journal_file_append_object(f, offsetof(Object, hash_table.table) + s, &o, &p);
432 o->object.type = htole64(OBJECT_HASH_TABLE);
433 memset(o->hash_table.table, 0, s);
435 f->header->hash_table_offset = htole64(p + offsetof(Object, hash_table.table));
436 f->header->hash_table_size = htole64(s);
/* Create the bisect table object in a new file. An object large enough for
 * DEFAULT_BISECT_TABLE_SIZE bytes of table payload is appended, typed as
 * OBJECT_BISECT_TABLE, and its table is zero-filled. The header then
 * records the file offset of the table payload (object offset plus
 * offsetof(Object, bisect_table.table)) and the payload size in bytes.
 * NOTE(review): error handling after the append is not visible here. */
441 static int journal_file_setup_bisect_table(JournalFile *f) {
448 s = DEFAULT_BISECT_TABLE_SIZE;
449 r = journal_file_append_object(f, offsetof(Object, bisect_table.table) + s, &o, &p);
453 o->object.type = htole64(OBJECT_BISECT_TABLE);
454 memset(o->bisect_table.table, 0, s);
456 f->header->bisect_table_offset = htole64(p + offsetof(Object, bisect_table.table));
457 f->header->bisect_table_size = htole64(s);
/* Map the hash table described by the header. Its payload offset and size
 * are read from hash_table_offset and hash_table_size, and
 * journal_file_map() stores the mapping in f->hash_table_window and its
 * length in f->hash_table_window_size. No window offset is requested
 * (NULL is passed).
 * NOTE(review): the remaining arguments, presumably including where the
 * table pointer itself is stored, are not visible here. */
462 static int journal_file_map_hash_table(JournalFile *f) {
469 p = le64toh(f->header->hash_table_offset);
470 s = le64toh(f->header->hash_table_size);
472 r = journal_file_map(f,
474 &f->hash_table_window, NULL, &f->hash_table_window_size,
/* Map the bisect table described by the header. Its payload offset and
 * size are read from bisect_table_offset and bisect_table_size, and
 * journal_file_map() stores the mapping in f->bisect_table_window and its
 * length in f->bisect_table_window_size. No window offset is requested
 * (NULL is passed).
 * NOTE(review): the remaining arguments, presumably including where the
 * table pointer itself is stored, are not visible here. */
483 static int journal_file_map_bisect_table(JournalFile *f) {
490 p = le64toh(f->header->bisect_table_offset);
491 s = le64toh(f->header->bisect_table_size);
493 r = journal_file_map(f,
495 &f->bisect_table_window, NULL, &f->bisect_table_window_size,
/* Link the OBJECT_DATA object `o`, located at file offset `offset`, to the
 * end of hash bucket `hash_index`.
 *
 * The object's entry list pointers and next_hash_offset are cleared and the
 * bucket's current tail offset `p` is read. In one branch the object becomes
 * the only element: prev_hash_offset is 0 and the bucket head is set.
 * In the other branch prev_hash_offset is set to `p`, the previous tail
 * object is mapped so that its next_hash_offset can be patched, and `o` is
 * mapped again at `offset` because the intermediate move may have replaced
 * the window. Finally the bucket tail is set to `offset`.
 *
 * NOTE(review): the patch `o->data.next_hash_offset = offset` stores the
 * value without htole64(), unlike every other offset written in this
 * function. Confirm behaviour on big-endian hosts.
 * The branch condition and the error returns are not visible here. */
505 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash_index) {
512 assert(o->object.type == htole64(OBJECT_DATA));
514 o->data.head_entry_offset = o->data.tail_entry_offset = 0;
515 o->data.next_hash_offset = 0;
517 p = le64toh(f->hash_table[hash_index].tail_hash_offset);
519 /* Only entry in the hash table is easy */
521 o->data.prev_hash_offset = 0;
522 f->hash_table[hash_index].head_hash_offset = htole64(offset);
524 o->data.prev_hash_offset = htole64(p);
526 /* Temporarily move back to the previous data object,
527 * to patch in pointer */
529 r = journal_file_move_to_object(f, p, &o);
533 o->data.next_hash_offset = offset;
535 r = journal_file_move_to_object(f, offset, &o);
540 f->hash_table[hash_index].tail_hash_offset = htole64(offset);
/* Store `size` bytes at `data` as an OBJECT_DATA object, looking first for
 * an existing object with the same content.
 *
 * The payload is hashed with hash64() and the bucket index is the hash
 * modulo the number of HashItem slots (hash_table_size / sizeof(HashItem)).
 * The bucket's chain is walked from head_hash_offset through
 * next_hash_offset. Every chained object must be of type OBJECT_DATA. An
 * object matches when its total size equals the size a new object would
 * have and its payload bytes are equal, and the stored hash of a match is
 * checked against the computed one. The code after the loop appends a new
 * object, fills in type, hash and payload, and links it into the bucket
 * with journal_file_link_data().
 *
 * NOTE(review): the assert permits data == NULL with size == 0, and that
 * pointer is then passed to memcmp()/memcpy(). C11 requires valid pointers
 * even for zero lengths. Confirm whether callers can hit this.
 * NOTE(review): a zero hash_table_size in the header would make the modulo
 * divide by zero. Confirm it is validated elsewhere.
 * The loop condition, the error codes and the values stored to *ret and
 * *offset are not visible here. */
545 static int journal_file_append_data(JournalFile *f, const void *data, uint64_t size, Object **ret, uint64_t *offset) {
546 uint64_t hash, h, p, np;
552 assert(data || size == 0);
554 osize = offsetof(Object, data.payload) + size;
556 hash = hash64(data, size);
557 h = hash % (le64toh(f->header->hash_table_size) / sizeof(HashItem));
558 p = le64toh(f->hash_table[h].head_hash_offset);
561 /* Look for this data object in the hash table */
563 r = journal_file_move_to_object(f, p, &o);
567 if (le64toh(o->object.type) != OBJECT_DATA)
570 if (le64toh(o->object.size) == osize &&
571 memcmp(o->data.payload, data, size) == 0) {
573 if (le64toh(o->data.hash) != hash)
585 p = le64toh(o->data.next_hash_offset);
588 r = journal_file_append_object(f, osize, &o, &np);
592 o->object.type = htole64(OBJECT_DATA);
593 o->data.hash = htole64(hash);
594 memcpy(o->data.payload, data, size);
596 r = journal_file_link_data(f, o, np, h);
/* Return the number of EntryItem slots in an OBJECT_ENTRY object. The count
 * is derived from the object's size: the bytes after
 * offsetof(Object, entry.items), divided by sizeof(EntryItem). The object
 * type is asserted, not checked at runtime. */
609 uint64_t journal_file_entry_n_items(Object *o) {
611 assert(o->object.type == htole64(OBJECT_ENTRY));
613 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
/* Link item `i` of the entry at file offset `offset` into the list of
 * entries that reference the same data object.
 *
 * `p` is the data object offset stored in the item. The item's
 * next_entry_offset is cleared, the data object is mapped (it must be
 * OBJECT_DATA) and its tail_entry_offset is replaced by `offset`, with the
 * previous tail kept in `q`. One branch also sets head_entry_offset to
 * `offset`. Otherwise the previous entry at `q` is mapped (it must be
 * OBJECT_ENTRY), its items are scanned for the one whose object_offset
 * equals `p`, and that item's next_entry_offset is pointed at `offset`.
 * The original entry is then mapped again and the item's prev_entry_offset
 * is set to `q`.
 *
 * NOTE(review): `next_entry_offset = offset` and `prev_entry_offset = q`
 * store host-order values, while the other offsets written here go through
 * htole64(). Confirm behaviour on big-endian hosts.
 * The branch conditions, the handling of a scan that finds no item and the
 * error codes are not visible here. */
616 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
623 p = le64toh(o->entry.items[i].object_offset);
627 o->entry.items[i].next_entry_offset = 0;
629 /* Move to the data object */
630 r = journal_file_move_to_object(f, p, &o);
634 if (o->object.type != htole64(OBJECT_DATA))
637 q = le64toh(o->data.tail_entry_offset);
638 o->data.tail_entry_offset = htole64(offset);
641 o->data.head_entry_offset = htole64(offset);
645 /* Move to previous entry */
646 r = journal_file_move_to_object(f, q, &o);
650 if (o->object.type != htole64(OBJECT_ENTRY))
653 n = journal_file_entry_n_items(o);
654 for (j = 0; j < n; j++)
655 if (le64toh(o->entry.items[j].object_offset) == p)
661 o->entry.items[j].next_entry_offset = offset;
664 /* Move back to original entry */
665 r = journal_file_move_to_object(f, offset, &o);
669 o->entry.items[i].prev_entry_offset = q;
/* Link the OBJECT_ENTRY object `o` at file offset `offset` into the file's
 * structures. There are three visible stages:
 *
 * 1. Global entry list: prev_entry_offset takes the header's current
 *    tail_entry_offset and next_entry_offset is cleared. One branch sets the
 *    header's head_entry_offset. The other maps the previous entry, points
 *    its next_entry_offset at `offset`, and maps `o` again. The header's
 *    tail_entry_offset then becomes `offset`.
 * 2. Items: journal_file_link_entry_item() is called for every item.
 * 3. Bisect table: n is the number of 8-byte slots, k is arena_max_size / n,
 *    and `a` is last_bisect_offset divided by k, rounded up. The slot
 *    bisect_table[a] is set to `offset` and last_bisect_offset is advanced
 *    to the end of this object.
 *
 * NOTE(review): n and k come from header fields and are used as divisors.
 * A zero value would divide by zero. Confirm they are validated elsewhere.
 * The branch conditions, the guard around the bisect table store and the
 * error returns are not visible here. */
673 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
674 uint64_t p, i, n, k, a, b;
680 assert(o->object.type == htole64(OBJECT_ENTRY));
682 /* Link up the entry itself */
683 p = le64toh(f->header->tail_entry_offset);
685 o->entry.prev_entry_offset = f->header->tail_entry_offset;
686 o->entry.next_entry_offset = 0;
689 f->header->head_entry_offset = htole64(offset);
691 /* Temporarily move back to the previous entry, to
692 * patch in pointer */
694 r = journal_file_move_to_object(f, p, &o);
698 o->entry.next_entry_offset = htole64(offset);
700 r = journal_file_move_to_object(f, offset, &o);
705 f->header->tail_entry_offset = htole64(offset);
707 /* Link up the items */
708 n = journal_file_entry_n_items(o);
709 for (i = 0; i < n; i++) {
710 r = journal_file_link_entry_item(f, o, offset, i);
715 /* Link up the entry in the bisect table */
716 n = le64toh(f->header->bisect_table_size) / sizeof(uint64_t);
717 k = le64toh(f->header->arena_max_size) / n;
719 a = (le64toh(f->header->last_bisect_offset) + k - 1) / k;
723 f->bisect_table[a] = htole64(offset);
725 f->header->last_bisect_offset = htole64(offset + le64toh(o->object.size));
/* Append an OBJECT_ENTRY object that holds `n_items` prepared items.
 *
 * The object size is offsetof(Object, entry.items) plus the item array.
 * After the append, the object gets its type, a fresh sequence number from
 * journal_file_seqnum(), a copy of `items`, the realtime and monotonic
 * timestamps from `ts` (0 for both when `ts` is NULL) and `xor_hash`.
 * It is then linked with journal_file_link_entry().
 *
 * NOTE(review): `items` may be NULL when n_items is 0 (see the assert) and
 * is still passed to memcpy(). C11 requires a valid pointer even for a zero
 * length. Confirm whether callers can hit this.
 * Some parameters, the error returns and the values stored to *ret and
 * *offset are not visible here. */
730 static int journal_file_append_entry_internal(
732 const dual_timestamp *ts,
734 const EntryItem items[], unsigned n_items,
735 Object **ret, uint64_t *offset) {
742 assert(items || n_items == 0);
744 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
746 r = journal_file_append_object(f, osize, &o, &np);
750 o->object.type = htole64(OBJECT_ENTRY);
751 o->entry.seqnum = htole64(journal_file_seqnum(f));
752 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
753 o->entry.realtime = ts ? htole64(ts->realtime) : 0;
754 o->entry.monotonic = ts ? htole64(ts->monotonic) : 0;
755 o->entry.xor_hash = htole64(xor_hash);
757 r = journal_file_link_entry(f, o, np);
/* Append one journal entry made of `n_iovec` fields.
 *
 * An EntryItem array with one slot per field is allocated. Each iovec is
 * stored with journal_file_append_data(). The resulting object's offset is
 * recorded in the matching item, and the object's stored hash is XORed into
 * `xor_hash`. The entry itself is then written with
 * journal_file_append_entry_internal(), which receives `ret` and `offset`.
 *
 * NOTE(review): the allocation failure check, the error handling inside the
 * loop and the release of `items` are not visible here. */
770 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, Object **ret, uint64_t *offset) {
774 uint64_t xor_hash = 0;
777 assert(iovec || n_iovec == 0);
779 items = new(EntryItem, n_iovec);
783 for (i = 0; i < n_iovec; i++) {
787 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
791 xor_hash ^= le64toh(o->data.hash);
792 items[i].object_offset = htole64(p);
795 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, ret, offset);
/* Locate the entry with sequence number `seqnum`.
 *
 * Stage 1 is a binary search over the bisect table. n is the number of
 * 8-byte slots, k is arena_max_size / n, and the upper bound is
 * last_bisect_offset / k + 1. Each probe reads bisect_table[k] (`k` is
 * reused as the midpoint index inside the loop), maps the object there,
 * requires it to be OBJECT_ENTRY and compares its seqnum with the target.
 * Stage 2 starts at bisect_table[lower-1] and follows next_entry_offset
 * from entry to entry until the target is found or the list ends.
 *
 * NOTE(review): o->entry.seqnum is compared with `seqnum` without
 * le64toh(), although journal_file_append_entry_internal() stores it with
 * htole64(). Confirm behaviour on big-endian hosts.
 * NOTE(review): the construct `} if (seqnum < o->entry.seqnum)` is probably
 * meant to be `} else if (...)`. Its effect cannot be judged without the
 * hidden branch bodies.
 * NOTE(review): n and k are divisors taken from header fields. A zero value
 * would divide by zero.
 * The initial value of `lower`, the handling of empty slots, the bound
 * updates and all return values are not visible here. */
803 int journal_file_move_to_entry(JournalFile *f, uint64_t seqnum, Object **ret, uint64_t *offset) {
805 uint64_t lower, upper, p, n, k;
810 n = le64toh(f->header->bisect_table_size) / sizeof(uint64_t);
811 k = le64toh(f->header->arena_max_size) / n;
814 upper = le64toh(f->header->last_bisect_offset)/k+1;
816 while (lower < upper) {
817 k = (upper + lower) / 2;
818 p = le64toh(f->bisect_table[k]);
825 r = journal_file_move_to_object(f, p, &o);
829 if (o->object.type != htole64(OBJECT_ENTRY))
832 if (o->entry.seqnum == seqnum) {
840 } else if (seqnum < o->entry.seqnum)
842 else if (seqnum > o->entry.seqnum)
846 assert(lower == upper);
851 /* The object we are looking for is between
852 * bisect_table[lower-1] and bisect_table[lower] */
854 p = le64toh(f->bisect_table[lower-1]);
857 r = journal_file_move_to_object(f, p, &o);
861 if (o->entry.seqnum == seqnum) {
870 } if (seqnum < o->entry.seqnum)
873 if (o->entry.next_entry_offset == 0)
876 p = le64toh(o->entry.next_entry_offset);
/* Step forward in the global entry list. The next offset `np` is either
 * the header's head_entry_offset or, when starting from an existing entry
 * `o` (which must be OBJECT_ENTRY), that entry's next_entry_offset. The
 * object at `np` is then mapped and must also be OBJECT_ENTRY.
 * NOTE(review): the condition that selects between the two starting
 * points (presumably whether `o` is NULL), the end-of-list handling and
 * the return values are not visible here. */
882 int journal_file_next_entry(JournalFile *f, Object *o, Object **ret, uint64_t *offset) {
889 np = le64toh(f->header->head_entry_offset);
891 if (le64toh(o->object.type) != OBJECT_ENTRY)
894 np = le64toh(o->entry.next_entry_offset);
900 r = journal_file_move_to_object(f, np, &o);
904 if (le64toh(o->object.type) != OBJECT_ENTRY)
/* Step backward in the global entry list. The offset `np` is either the
 * header's tail_entry_offset or, when starting from an existing entry `o`
 * (which must be OBJECT_ENTRY), that entry's prev_entry_offset. The object
 * at `np` is then mapped and must also be OBJECT_ENTRY.
 * NOTE(review): the condition that selects between the two starting
 * points (presumably whether `o` is NULL), the start-of-list handling and
 * the return values are not visible here. */
916 int journal_file_prev_entry(JournalFile *f, Object *o, Object **ret, uint64_t *offset) {
923 np = le64toh(f->header->tail_entry_offset);
925 if (le64toh(o->object.type) != OBJECT_ENTRY)
928 np = le64toh(o->entry.prev_entry_offset);
934 r = journal_file_move_to_object(f, np, &o);
938 if (le64toh(o->object.type) != OBJECT_ENTRY)
/* Find the first entry that references the data `data`/`size`.
 *
 * The data is hashed and its bucket's chain is walked from
 * head_hash_offset through next_hash_offset. Every chained object must be
 * OBJECT_DATA. An object matches when its total size equals
 * offsetof(Object, data.payload) + size and its payload bytes are equal.
 * The stored hash of a match is checked against the computed one. For a
 * match, a zero head_entry_offset is tested first. Otherwise the object at
 * head_entry_offset is mapped and must be OBJECT_ENTRY.
 *
 * NOTE(review): data == NULL with size == 0 is allowed by the assert and
 * reaches memcmp(). C11 requires valid pointers even for zero lengths.
 * The loop condition and all return values are not visible here. */
950 int journal_file_find_first_entry(JournalFile *f, const void *data, uint64_t size, Object **ret, uint64_t *offset) {
951 uint64_t p, osize, hash, h;
955 assert(data || size == 0);
957 osize = offsetof(Object, data.payload) + size;
959 hash = hash64(data, size);
960 h = hash % (le64toh(f->header->hash_table_size) / sizeof(HashItem));
961 p = le64toh(f->hash_table[h].head_hash_offset);
966 r = journal_file_move_to_object(f, p, &o);
970 if (le64toh(o->object.type) != OBJECT_DATA)
973 if (le64toh(o->object.size) == osize &&
974 memcmp(o->data.payload, data, size) == 0) {
976 if (le64toh(o->data.hash) != hash)
979 if (o->data.head_entry_offset == 0)
982 p = le64toh(o->data.head_entry_offset);
983 r = journal_file_move_to_object(f, p, &o);
987 if (le64toh(o->object.type) != OBJECT_ENTRY)
999 p = le64toh(o->data.next_hash_offset);
/* Find the last entry that references the data `data`/`size`.
 *
 * This mirrors journal_file_find_first_entry() but walks the bucket's
 * chain backwards: it starts at tail_hash_offset and follows
 * prev_hash_offset. Every chained object must be OBJECT_DATA. An object
 * matches when its total size equals offsetof(Object, data.payload) + size
 * and its payload bytes are equal. The stored hash of a match is checked
 * against the computed one. For a match, a zero tail_entry_offset is
 * tested first. Otherwise the object at tail_entry_offset is mapped and
 * must be OBJECT_ENTRY.
 *
 * NOTE(review): data == NULL with size == 0 is allowed by the assert and
 * reaches memcmp(). C11 requires valid pointers even for zero lengths.
 * The loop condition and all return values are not visible here. */
1005 int journal_file_find_last_entry(JournalFile *f, const void *data, uint64_t size, Object **ret, uint64_t *offset) {
1006 uint64_t p, osize, hash, h;
1010 assert(data || size == 0);
1012 osize = offsetof(Object, data.payload) + size;
1014 hash = hash64(data, size);
1015 h = hash % (le64toh(f->header->hash_table_size) / sizeof(HashItem));
1016 p = le64toh(f->hash_table[h].tail_hash_offset);
1021 r = journal_file_move_to_object(f, p, &o);
1025 if (le64toh(o->object.type) != OBJECT_DATA)
1028 if (le64toh(o->object.size) == osize &&
1029 memcmp(o->data.payload, data, size) == 0) {
1031 if (le64toh(o->data.hash) != hash)
1034 if (o->data.tail_entry_offset == 0)
1037 p = le64toh(o->data.tail_entry_offset);
1038 r = journal_file_move_to_object(f, p, &o);
1042 if (le64toh(o->object.type) != OBJECT_ENTRY)
1054 p = le64toh(o->data.prev_hash_offset);
/* Print a human-readable summary of the file to stdout. The file ID,
 * machine ID, boot ID and arena size from the header come first. The
 * objects are then walked from head_object_offset, with one line printed
 * per object according to its type. The walk stops after the object at
 * tail_object_offset. Otherwise it advances by the object's size rounded up
 * to 8 bytes. "File corrupt" is logged on the failure path.
 *
 * The three 33-byte buffers receive the formatted 128-bit IDs.
 *
 * NOTE(review): the switch uses o->object.type directly, while the rest of
 * the file converts it with le64toh() first. On a big-endian host the case
 * labels would not match. Confirm.
 * Part of the format string, some case labels and the loop construct are
 * not visible here. */
1060 void journal_file_dump(JournalFile *f) {
1061 char a[33], b[33], c[33];
1068 printf("File ID: %s\n"
1071 "Arena size: %llu\n",
1072 sd_id128_to_string(f->header->file_id, a),
1073 sd_id128_to_string(f->header->machine_id, b),
1074 sd_id128_to_string(f->header->boot_id, c),
1075 (unsigned long long) le64toh(f->header->arena_size));
1077 p = le64toh(f->header->head_object_offset);
1079 r = journal_file_move_to_object(f, p, &o);
1083 switch (o->object.type) {
1086 printf("Type: OBJECT_UNUSED\n");
1090 printf("Type: OBJECT_DATA\n");
1094 printf("Type: OBJECT_ENTRY %llu\n", (unsigned long long) le64toh(o->entry.seqnum));
1097 case OBJECT_HASH_TABLE:
1098 printf("Type: OBJECT_HASH_TABLE\n");
1101 case OBJECT_BISECT_TABLE:
1102 printf("Type: OBJECT_BISECT_TABLE\n");
1106 if (p == le64toh(f->header->tail_object_offset))
1109 p = p + ALIGN64(le64toh(o->object.size));
1114 log_error("File corrupt");
/* Open (and, for an empty writable file, initialize) a journal file and
 * return it through `ret`.
 *
 * Visible sequence:
 *  - only O_RDONLY and O_RDWR access modes are accepted;
 *  - a zeroed JournalFile is allocated, and `writable` and `prot` are
 *    derived from `flags`;
 *  - the file is opened with O_CLOEXEC added, the path is duplicated and
 *    the file is stat'ed into last_stat;
 *  - a zero-length writable file is treated as newly created: a header is
 *    written with journal_file_init_header() and the file is stat'ed again;
 *  - a file smaller than sizeof(Header) is rejected;
 *  - the header is mapped MAP_SHARED;
 *  - existing files are checked with journal_file_verify_header();
 *  - journal_file_refresh_header() is called;
 *  - newly created files get their hash and bisect tables;
 *  - both tables are mapped;
 *  - the failure path releases everything with journal_file_close().
 *
 * NOTE(review): some parameters, the condition guarding the header
 * refresh, and the error codes of the individual steps are not visible
 * here. */
1117 int journal_file_open(
1121 JournalFile **ret) {
1125 bool newly_created = false;
1129 if ((flags & O_ACCMODE) != O_RDONLY &&
1130 (flags & O_ACCMODE) != O_RDWR)
1133 f = new0(JournalFile, 1);
1137 f->writable = (flags & O_ACCMODE) != O_RDONLY;
1138 f->prot = prot_from_flags(flags);
1140 f->fd = open(fname, flags|O_CLOEXEC, mode);
1146 f->path = strdup(fname);
1152 if (fstat(f->fd, &f->last_stat) < 0) {
1157 if (f->last_stat.st_size == 0 && f->writable) {
1158 newly_created = true;
1160 r = journal_file_init_header(f);
1164 if (fstat(f->fd, &f->last_stat) < 0) {
1170 if (f->last_stat.st_size < (off_t) sizeof(Header)) {
1175 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
1176 if (f->header == MAP_FAILED) {
1182 if (!newly_created) {
1183 r = journal_file_verify_header(f);
1189 r = journal_file_refresh_header(f);
1194 if (newly_created) {
1196 r = journal_file_setup_hash_table(f);
1200 r = journal_file_setup_bisect_table(f);
1205 r = journal_file_map_hash_table(f);
1209 r = journal_file_map_bisect_table(f);
1219 journal_file_close(f);
/* Create an sd_journal object and load into it every journal file found
 * in the search directories.
 *
 * `search_paths` is a list of NUL-separated directory names
 * (/run/log/journal and /var/log/journal) iterated with NULSTR_FOREACH.
 * Directory entries are read with readdir_r(), and only entries accepted by
 * dirent_is_file_with_suffix(de, ".journal") are used. Each such file is
 * opened read-only with journal_file_open() and stored in j->files, keyed
 * by its path. A file whose insertion fails is closed again. The failure
 * path tears the whole object down with sd_journal_close().
 *
 * The test `errno != ENOENT && r == 0` is visible but not its surrounding
 * code. Presumably it records the first directory-open error other than a
 * missing directory.
 *
 * NOTE(review): readdir_r() is deprecated in current glibc. Consider
 * readdir() when this code is next touched.
 * The directory open and close calls, the release of `fn` and the return
 * values are not visible here. */
1224 int sd_journal_open(sd_journal **ret) {
1229 const char search_paths[] =
1230 "/run/log/journal\0"
1231 "/var/log/journal\0";
1235 j = new0(sd_journal, 1);
1239 j->files = hashmap_new(string_hash_func, string_compare_func);
1243 NULSTR_FOREACH(p, search_paths) {
1248 if (errno != ENOENT && r == 0)
1255 struct dirent buf, *de;
1259 k = readdir_r(d, &buf, &de);
1270 if (!dirent_is_file_with_suffix(de, ".journal"))
1273 fn = join(p, "/", de->d_name, NULL);
1280 k = journal_file_open(fn, O_RDONLY, 0, &f);
1288 k = hashmap_put(j->files, f->path, f);
1290 journal_file_close(f);
1304 sd_journal_close(j);
/* Destroy an sd_journal object. Every JournalFile is removed from j->files
 * and closed with journal_file_close(), then the hashmap itself is freed.
 * NOTE(review): any NULL guard on `j` or j->files and the release of `j`
 * itself are not visible here. */
1309 void sd_journal_close(sd_journal *j) {
1315 while ((f = hashmap_steal_first(j->files)))
1316 journal_file_close(f);
1318 hashmap_free(j->files);