X-Git-Url: https://www.chiark.greenend.org.uk/ucgi/~ianmdlvl/git?p=elogind.git;a=blobdiff_plain;f=src%2Fjournal%2Fjournal-file.c;h=7f5d7c27b21d22823f4d4fa937cfb4cd20365cb1;hp=4a006d3bf3510b282b87678bf0cfa93e3bc5aeed;hb=71100051c5d351daac20610f3a4b8c14901088d8;hpb=440ee3665e252dc004e356da0f5b51ad26ea2cbe diff --git a/src/journal/journal-file.c b/src/journal/journal-file.c index 4a006d3bf..7f5d7c27b 100644 --- a/src/journal/journal-file.c +++ b/src/journal/journal-file.c @@ -37,7 +37,26 @@ #define DEFAULT_WINDOW_SIZE (128ULL*1024ULL*1024ULL) -#define COMPRESSION_SIZE_THRESHOLD (64ULL) +#define COMPRESSION_SIZE_THRESHOLD (512ULL) + +/* This is the minimum journal file size */ +#define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL) + +/* These are the lower and upper bounds if we deduce the max_use value + * from the file system size */ +#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */ +#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */ + +/* This is the upper bound if we deduce max_size from max_use */ +#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */ + +/* This is the upper bound if we deduce the keep_free value from the + * file system size */ +#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */ + +/* This is the keep_free value when we can't determine the system + * size */ +#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */ static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' }; @@ -119,6 +138,9 @@ static int journal_file_refresh_header(JournalFile *f) { f->header->boot_id = boot_id; f->header->state = STATE_ONLINE; + + __sync_synchronize(); + return 0; } @@ -216,7 +238,7 @@ static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) if (fstat(f->fd, &f->last_stat) < 0) return -errno; - f->header->arena_size = new_size - htole64(f->header->arena_offset); + f->header->arena_size = htole64(new_size - le64toh(f->header->arena_offset)); return 0; } @@ -241,6 +263,10 @@ static int journal_file_map( wsize = size + (offset - woffset); wsize = PAGE_ALIGN(wsize); + /* Avoid SIGBUS on invalid accesses */ + if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size)) + return -EADDRNOTAVAIL; + window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset); if (window == MAP_FAILED) return -errno; @@ -260,7 +286,7 @@ static int journal_file_map( } static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) { - void *p; + void *p = NULL; uint64_t delta; int r; Window *w; @@ -270,6 +296,15 @@ static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_ assert(wt >= 0); assert(wt < _WINDOW_MAX); + if (offset + size > (uint64_t) f->last_stat.st_size) { + /* Hmm, out of range? Let's refresh the fstat() data + * first, before we trust that check. */ + + if (fstat(f->fd, &f->last_stat) < 0 || + offset + size > (uint64_t) f->last_stat.st_size) + return -EADDRNOTAVAIL; + } + w = f->windows + wt; if (_likely_(w->ptr && @@ -295,16 +330,22 @@ static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_ * the window space before and half behind the * requested mapping */ - delta = PAGE_ALIGN((DEFAULT_WINDOW_SIZE - size) / 2); + delta = (DEFAULT_WINDOW_SIZE - size) / 2; - if (offset < delta) + if (delta > offset) delta = offset; offset -= delta; - size += (DEFAULT_WINDOW_SIZE - delta); + size = DEFAULT_WINDOW_SIZE; } else delta = 0; + if (offset + size > (uint64_t) f->last_stat.st_size) + size = (uint64_t) f->last_stat.st_size - offset; + + if (size <= 0) + return -EADDRNOTAVAIL; + r = journal_file_map(f, offset, size, &w->ptr, &w->offset, &w->size, @@ -540,6 +581,8 @@ static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, ui assert(offset > 0); assert(o->object.type == OBJECT_DATA); + /* This might alter the window we are looking at */ + o->data.next_hash_offset = o->data.next_field_offset = 0; o->data.entry_offset = o->data.entry_array_offset = 0; o->data.n_entries = 0; @@ -550,18 +593,14 @@ static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, ui /* Only entry in the hash table is easy */ f->data_hash_table[h].head_hash_offset = htole64(offset); } else { - /* Temporarily move back to the previous data object, - * to patch in pointer */ + /* Move back to the previous data object, to patch in + * pointer */ r = journal_file_move_to_object(f, OBJECT_DATA, p, &o); if (r < 0) return r; o->data.next_hash_offset = htole64(offset); - - r = journal_file_move_to_object(f, OBJECT_DATA, offset, &o); - if (r < 0) - return r; } f->data_hash_table[h].tail_hash_offset = htole64(offset); @@ -573,6 +612,7 @@ int journal_file_find_data_object_with_hash( JournalFile *f, const void *data, uint64_t size, uint64_t hash, Object **ret, uint64_t *offset) { + uint64_t p, osize, h; int r; @@ -595,7 +635,7 @@ int journal_file_find_data_object_with_hash( return r; if (le64toh(o->data.hash) != hash) - return -EBADMSG; + goto next; if (o->object.flags & OBJECT_COMPRESSED) { #ifdef HAVE_XZ @@ -637,6 +677,7 @@ int journal_file_find_data_object_with_hash( return 1; } + next: p = le64toh(o->data.next_hash_offset); } @@ -660,7 +701,11 @@ int journal_file_find_data_object( ret, offset); } -static int journal_file_append_data(JournalFile *f, const void *data, uint64_t size, Object **ret, uint64_t *offset) { +static int journal_file_append_data( + JournalFile *f, + const void *data, uint64_t size, + Object **ret, uint64_t *offset) { + uint64_t hash, p; uint64_t osize; Object *o; @@ -718,6 +763,12 @@ static int journal_file_append_data(JournalFile *f, const void *data, uint64_t s if (r < 0) return r; + /* The linking might have altered the window, so let's + * refresh our pointer */ + r = journal_file_move_to_object(f, OBJECT_DATA, p, &o); + if (r < 0) + return r; + if (ret) *ret = o; @@ -729,14 +780,14 @@ static int journal_file_append_data(JournalFile *f, const void *data, uint64_t s uint64_t journal_file_entry_n_items(Object *o) { assert(o); - assert(o->object.type == htole64(OBJECT_ENTRY)); + assert(o->object.type == OBJECT_ENTRY); return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem); } static uint64_t journal_file_entry_array_n_items(Object *o) { assert(o); - assert(o->object.type == htole64(OBJECT_ENTRY_ARRAY)); + assert(o->object.type == OBJECT_ENTRY_ARRAY); return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t); } @@ -791,7 +842,7 @@ static int link_entry_into_array(JournalFile *f, o->entry_array.items[i] = htole64(p); if (ap == 0) - *first = q; + *first = htole64(q); else { r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o); if (r < 0) @@ -824,7 +875,7 @@ static int link_entry_into_array_plus_one(JournalFile *f, else { uint64_t i; - i = le64toh(*idx) - 1; + i = htole64(le64toh(*idx) - 1); r = link_entry_into_array(f, first, &i, p); if (r < 0) return r; @@ -865,6 +916,8 @@ static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) { assert(offset > 0); assert(o->object.type == OBJECT_ENTRY); + __sync_synchronize(); + /* Link up the entry itself */ r = link_entry_into_array(f, &f->header->entry_array_offset, @@ -873,7 +926,7 @@ static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) { if (r < 0) return r; - log_error("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); + /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */ if (f->header->head_entry_realtime == 0) f->header->head_entry_realtime = o->entry.realtime; @@ -936,7 +989,7 @@ static int journal_file_append_entry_internal( return 0; } -static void journal_file_post_change(JournalFile *f) { +void journal_file_post_change(JournalFile *f) { assert(f); /* inotify() does not receive IN_MODIFY events from file @@ -972,12 +1025,7 @@ int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const st ts->monotonic < le64toh(f->header->tail_entry_monotonic)) return -EINVAL; - if (ts->realtime < le64toh(f->header->tail_entry_realtime)) - return -EINVAL; - - items = new(EntryItem, n_iovec); - if (!items) - return -ENOMEM; + items = alloca(sizeof(EntryItem) * n_iovec); for (i = 0; i < n_iovec; i++) { uint64_t p; @@ -985,7 +1033,7 @@ int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const st r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p); if (r < 0) - goto finish; + return r; xor_hash ^= le64toh(o->data.hash); items[i].object_offset = htole64(p); @@ -996,9 +1044,6 @@ int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const st journal_file_post_change(f); -finish: - free(items); - return r; } @@ -1008,7 +1053,7 @@ static int generic_array_get(JournalFile *f, Object **ret, uint64_t *offset) { Object *o; - uint64_t p, a; + uint64_t p = 0, a; int r; assert(f); @@ -1682,6 +1727,9 @@ int journal_file_open( (flags & O_ACCMODE) != O_RDWR) return -EINVAL; + if (!endswith(fname, ".journal")) + return -EINVAL; + f = new0(JournalFile, 1); if (!f) return -ENOMEM; @@ -1692,10 +1740,6 @@ int journal_file_open( f->writable = (flags & O_ACCMODE) != O_RDONLY; f->prot = prot_from_flags(flags); - f->metrics.max_size = DEFAULT_MAX_SIZE; - f->metrics.min_size = DEFAULT_MIN_SIZE; - f->metrics.keep_free = DEFAULT_KEEP_FREE; - f->path = strdup(fname); if (!f->path) { r = -ENOMEM; @@ -1799,7 +1843,7 @@ int journal_file_rotate(JournalFile **f) { l = strlen(old_file->path); - p = new(char, l + 1 + 16 + 1 + 32 + 1 + 16 + 1); + p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1); if (!p) return -ENOMEM; @@ -1826,6 +1870,46 @@ int journal_file_rotate(JournalFile **f) { return r; } +int journal_file_open_reliably( + const char *fname, + int flags, + mode_t mode, + JournalFile *template, + JournalFile **ret) { + + int r; + size_t l; + char *p; + + r = journal_file_open(fname, flags, mode, template, ret); + if (r != -EBADMSG) + return r; + + if ((flags & O_ACCMODE) == O_RDONLY) + return r; + + if (!(flags & O_CREAT)) + return r; + + /* The file is corrupted. Rotate it away and try it again (but only once) */ + + l = strlen(fname); + if (asprintf(&p, "%.*s@%016llx-%016llx.journal~", + (int) (l-8), fname, + (unsigned long long) now(CLOCK_REALTIME), + random_ull()) < 0) + return -ENOMEM; + + r = rename(fname, p); + free(p); + if (r < 0) + return -errno; + + log_warning("File %s corrupted, renaming and replacing.", fname); + + return journal_file_open(fname, flags, mode, template, ret); +} + struct vacuum_info { off_t usage; char *filename; @@ -1833,6 +1917,8 @@ struct vacuum_info { uint64_t realtime; sd_id128_t seqnum_id; uint64_t seqnum; + + bool have_seqnum; }; static int vacuum_compare(const void *_a, const void *_b) { @@ -1841,7 +1927,8 @@ static int vacuum_compare(const void *_a, const void *_b) { a = _a; b = _b; - if (sd_id128_equal(a->seqnum_id, b->seqnum_id)) { + if (a->have_seqnum && b->have_seqnum && + sd_id128_equal(a->seqnum_id, b->seqnum_id)) { if (a->seqnum < b->seqnum) return -1; else if (a->seqnum > b->seqnum) @@ -1854,8 +1941,10 @@ static int vacuum_compare(const void *_a, const void *_b) { return -1; else if (a->realtime > b->realtime) return 1; - else + else if (a->have_seqnum && b->have_seqnum) return memcmp(&a->seqnum_id, &b->seqnum_id, 16); + else + return strcmp(a->filename, b->filename); } int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) { @@ -1868,7 +1957,7 @@ int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t m assert(directory); if (max_use <= 0) - max_use = DEFAULT_MAX_USE; + return 0; d = opendir(directory); if (!d) @@ -1882,6 +1971,7 @@ int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t m char *p; unsigned long long seqnum, realtime; sd_id128_t seqnum_id; + bool have_seqnum; k = readdir_r(d, &buf, &de); if (k != 0) { @@ -1892,41 +1982,71 @@ int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t m if (!de) break; - if (!dirent_is_file_with_suffix(de, ".journal")) + if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0) + continue; + + if (!S_ISREG(st.st_mode)) continue; q = strlen(de->d_name); - if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8) - continue; + if (endswith(de->d_name, ".journal")) { - if (de->d_name[q-8-16-1] != '-' || - de->d_name[q-8-16-1-16-1] != '-' || - de->d_name[q-8-16-1-16-1-32-1] != '@') - continue; + /* Vacuum archived files */ - if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0) - continue; + if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8) + continue; - if (!S_ISREG(st.st_mode)) - continue; + if (de->d_name[q-8-16-1] != '-' || + de->d_name[q-8-16-1-16-1] != '-' || + de->d_name[q-8-16-1-16-1-32-1] != '@') + continue; - p = strdup(de->d_name); - if (!p) { - r = -ENOMEM; - goto finish; - } + p = strdup(de->d_name); + if (!p) { + r = -ENOMEM; + goto finish; + } - de->d_name[q-8-16-1-16-1] = 0; - if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) { - free(p); - continue; - } + de->d_name[q-8-16-1-16-1] = 0; + if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) { + free(p); + continue; + } + + if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) { + free(p); + continue; + } + + have_seqnum = true; + + } else if (endswith(de->d_name, ".journal~")) { + unsigned long long tmp; + + /* Vacuum corrupted files */ + + if (q < 1 + 16 + 1 + 16 + 8 + 1) + continue; + + if (de->d_name[q-1-8-16-1] != '-' || + de->d_name[q-1-8-16-1-16-1] != '@') + continue; + + p = strdup(de->d_name); + if (!p) { + r = -ENOMEM; + goto finish; + } + + if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) { + free(p); + continue; + } - if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) { - free(p); + have_seqnum = false; + } else continue; - } if (n_list >= n_allocated) { struct vacuum_info *j; @@ -1947,6 +2067,7 @@ int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t m list[n_list].seqnum = seqnum; list[n_list].realtime = realtime; list[n_list].seqnum_id = seqnum_id; + list[n_list].have_seqnum = have_seqnum; sum += list[n_list].usage; @@ -1985,3 +2106,160 @@ finish: return r; } + +int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) { + uint64_t i, n; + uint64_t q, xor_hash = 0; + int r; + EntryItem *items; + dual_timestamp ts; + + assert(from); + assert(to); + assert(o); + assert(p); + + if (!to->writable) + return -EPERM; + + ts.monotonic = le64toh(o->entry.monotonic); + ts.realtime = le64toh(o->entry.realtime); + + if (to->tail_entry_monotonic_valid && + ts.monotonic < le64toh(to->header->tail_entry_monotonic)) + return -EINVAL; + + if (ts.realtime < le64toh(to->header->tail_entry_realtime)) + return -EINVAL; + + n = journal_file_entry_n_items(o); + items = alloca(sizeof(EntryItem) * n); + + for (i = 0; i < n; i++) { + uint64_t le_hash, l, h; + size_t t; + void *data; + Object *u; + + q = le64toh(o->entry.items[i].object_offset); + le_hash = o->entry.items[i].hash; + + r = journal_file_move_to_object(from, OBJECT_DATA, q, &o); + if (r < 0) + return r; + + if (le_hash != o->data.hash) + return -EBADMSG; + + l = le64toh(o->object.size) - offsetof(Object, data.payload); + t = (size_t) l; + + /* We hit the limit on 32bit machines */ + if ((uint64_t) t != l) + return -E2BIG; + + if (o->object.flags & OBJECT_COMPRESSED) { +#ifdef HAVE_XZ + uint64_t rsize; + + if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize)) + return -EBADMSG; + + data = from->compress_buffer; + l = rsize; +#else + return -EPROTONOSUPPORT; +#endif + } else + data = o->data.payload; + + r = journal_file_append_data(to, data, l, &u, &h); + if (r < 0) + return r; + + xor_hash ^= le64toh(u->data.hash); + items[i].object_offset = htole64(h); + items[i].hash = u->data.hash; + + r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o); + if (r < 0) + return r; + } + + return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset); +} + +void journal_default_metrics(JournalMetrics *m, int fd) { + uint64_t fs_size = 0; + struct statvfs ss; + char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX]; + + assert(m); + assert(fd >= 0); + + if (fstatvfs(fd, &ss) >= 0) + fs_size = ss.f_frsize * ss.f_blocks; + + if (m->max_use == (uint64_t) -1) { + + if (fs_size > 0) { + m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */ + + if (m->max_use > DEFAULT_MAX_USE_UPPER) + m->max_use = DEFAULT_MAX_USE_UPPER; + + if (m->max_use < DEFAULT_MAX_USE_LOWER) + m->max_use = DEFAULT_MAX_USE_LOWER; + } else + m->max_use = DEFAULT_MAX_USE_LOWER; + } else { + m->max_use = PAGE_ALIGN(m->max_use); + + if (m->max_use < JOURNAL_FILE_SIZE_MIN*2) + m->max_use = JOURNAL_FILE_SIZE_MIN*2; + } + + if (m->max_size == (uint64_t) -1) { + m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */ + + if (m->max_size > DEFAULT_MAX_SIZE_UPPER) + m->max_size = DEFAULT_MAX_SIZE_UPPER; + } else + m->max_size = PAGE_ALIGN(m->max_size); + + if (m->max_size < JOURNAL_FILE_SIZE_MIN) + m->max_size = JOURNAL_FILE_SIZE_MIN; + + if (m->max_size*2 > m->max_use) + m->max_use = m->max_size*2; + + if (m->min_size == (uint64_t) -1) + m->min_size = JOURNAL_FILE_SIZE_MIN; + else { + m->min_size = PAGE_ALIGN(m->min_size); + + if (m->min_size < JOURNAL_FILE_SIZE_MIN) + m->min_size = JOURNAL_FILE_SIZE_MIN; + + if (m->min_size > m->max_size) + m->max_size = m->min_size; + } + + if (m->keep_free == (uint64_t) -1) { + + if (fs_size > 0) { + m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */ + + if (m->keep_free > DEFAULT_KEEP_FREE_UPPER) + m->keep_free = DEFAULT_KEEP_FREE_UPPER; + + } else + m->keep_free = DEFAULT_KEEP_FREE; + } + + log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s", + format_bytes(a, sizeof(a), m->max_use), + format_bytes(b, sizeof(b), m->max_size), + format_bytes(c, sizeof(c), m->min_size), + format_bytes(d, sizeof(d), m->keep_free)); +}