#include <sys/statvfs.h>
#include <fcntl.h>
#include <stddef.h>
+#include <linux/fs.h>
+#include "btrfs-util.h"
#include "journal-def.h"
#include "journal-file.h"
#include "journal-authenticate.h"
#include "lookup3.h"
#include "compress.h"
-#include "fsprg.h"
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
/* How much to increase the journal file size at once each time we allocate something new. */
#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
+/* Reread fstat() of the file for detecting deletions at least this often */
+#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
+
/* The mmap context to use for the header we pick as one above the last defined typed */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX
if (f->mmap && f->fd >= 0)
mmap_cache_close_fd(f->mmap, f->fd);
+ if (f->fd >= 0 && f->defrag_on_close) {
+
+ /* Be friendly to btrfs: turn COW back on again now,
+ * and defragment the file. We won't write to the file
+ * ever again, hence remove all fragmentation, and
+ * reenable all the good bits COW usually provides
+ * (such as data checksumming). */
+
+ (void) chattr_fd(f->fd, false, FS_NOCOW_FL);
+ (void) btrfs_defrag_fd(f->fd);
+ }
+
safe_close(f->fd);
free(f->path);
} else if (state == STATE_ARCHIVED)
return -ESHUTDOWN;
else if (state != STATE_OFFLINE) {
- log_debug("Journal file %s has unknown state %u.", f->path, state);
+ log_debug("Journal file %s has unknown state %i.", f->path, state);
return -EBUSY;
}
}
return 0;
}
+static int journal_file_fstat(JournalFile *f) {
+ assert(f);
+ assert(f->fd >= 0);
+
+ if (fstat(f->fd, &f->last_stat) < 0)
+ return -errno;
+
+ f->last_stat_usec = now(CLOCK_MONOTONIC);
+
+ /* Refuse appending to files that are already deleted */
+ if (f->last_stat.st_nlink <= 0)
+ return -EIDRM;
+
+ return 0;
+}
+
static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
uint64_t old_size, new_size;
int r;
if (new_size < le64toh(f->header->header_size))
new_size = le64toh(f->header->header_size);
- if (new_size <= old_size)
- return 0;
+ if (new_size <= old_size) {
+
+ /* We already pre-allocated enough space, but before
+ * we write to it, let's check with fstat() if the
+ * file got deleted, in order make sure we don't throw
+ * away the data immediately. Don't check fstat() for
+ * all writes though, but only once ever 10s. */
+
+ if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
+ return 0;
+
+ return journal_file_fstat(f);
+ }
+
+ /* Allocate more space. */
if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
return -E2BIG;
if (r != 0)
return -r;
- if (fstat(f->fd, &f->last_stat) < 0)
- return -errno;
-
f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
- return 0;
+ return journal_file_fstat(f);
}
static unsigned type_to_context(ObjectType type) {
}
static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
+ int r;
+
assert(f);
assert(ret);
/* Hmm, out of range? Let's refresh the fstat() data
* first, before we trust that check. */
- if (fstat(f->fd, &f->last_stat) < 0 ||
- offset + size > (uint64_t) f->last_stat.st_size)
+ r = journal_file_fstat(f);
+ if (r < 0)
+ return r;
+
+ if (offset + size > (uint64_t) f->last_stat.st_size)
return -EADDRNOTAVAIL;
}
if (o->object.flags & OBJECT_COMPRESSION_MASK) {
#if defined(HAVE_XZ) || defined(HAVE_LZ4)
uint64_t l;
- size_t rsize;
+ size_t rsize = 0;
l = le64toh(o->object.size);
if (l <= offsetof(Object, data.payload))
#if defined(HAVE_XZ) || defined(HAVE_LZ4)
if (f->compress_xz &&
size >= COMPRESSION_SIZE_THRESHOLD) {
- size_t rsize;
+ size_t rsize = 0;
compression = compress_blob(data, size, o->data.payload, &rsize);
return TEST_RIGHT;
}
-static inline int find_data_object_by_boot_id(
+static int find_data_object_by_boot_id(
JournalFile *f,
sd_id128_t boot_id,
Object **o,
uint64_t *b) {
+
char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
sd_id128_to_string(boot_id, t + 9);
f->current_xor_hash = 0;
}
-void journal_file_save_location(JournalFile *f, direction_t direction, Object *o, uint64_t offset) {
- f->last_direction = direction;
+void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
f->location_type = LOCATION_SEEK;
f->current_offset = offset;
f->current_seqnum = le64toh(o->entry.seqnum);
break;
default:
- printf("Type: unknown (%u)\n", o->object.type);
+ printf("Type: unknown (%i)\n", o->object.type);
break;
}
goto fail;
}
- if (fstat(f->fd, &f->last_stat) < 0) {
- r = -errno;
+ r = journal_file_fstat(f);
+ if (r < 0)
goto fail;
- }
if (f->last_stat.st_size == 0 && f->writable) {
+
+ /* Before we write anything, turn off COW logic. Given
+ * our write pattern that is quite unfriendly to COW
+ * file systems this should greatly improve
+ * performance on COW file systems, such as btrfs, at
+ * the expense of data integrity features (which
+ * shouldn't be too bad, given that we do our own
+ * checksumming). */
+ r = chattr_fd(f->fd, true, FS_NOCOW_FL);
+ if (r < 0 && r != -ENOTTY)
+ log_warning_errno(r, "Failed to set file attributes: %m");
+
/* Let's attach the creation time to the journal file,
* so that the vacuuming code knows the age of this
* file even if the file might end up corrupted one
* attributes are not supported we'll just skip this,
* and rely solely on mtime/atime/ctime of the file. */
- fd_setcrtime(f->fd, now(CLOCK_REALTIME));
+ fd_setcrtime(f->fd, 0);
#ifdef HAVE_GCRYPT
/* Try to load the FSPRG state, and if we can't, then
if (r < 0)
goto fail;
- if (fstat(f->fd, &f->last_stat) < 0) {
- r = -errno;
+ r = journal_file_fstat(f);
+ if (r < 0)
goto fail;
- }
newly_created = true;
}
}
r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
- if (r < 0) {
- r = -errno;
+ if (r < 0)
goto fail;
- }
f->header = h;
if (r < 0)
return -ENOMEM;
+ /* Try to rename the file to the archived version. If the file
+ * already was deleted, we'll get ENOENT, let's ignore that
+ * case. */
r = rename(old_file->path, p);
- if (r < 0)
+ if (r < 0 && errno != ENOENT)
return -errno;
old_file->header->state = STATE_ARCHIVED;
+ /* Currently, btrfs is not very good with out write patterns
+ * and fragments heavily. Let's defrag our journal files when
+ * we archive them */
+ old_file->defrag_on_close = true;
+
r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
journal_file_close(old_file);
r = journal_file_open(fname, flags, mode, compress, seal,
metrics, mmap_cache, template, ret);
- if (r != -EBADMSG && /* corrupted */
- r != -ENODATA && /* truncated */
- r != -EHOSTDOWN && /* other machine */
- r != -EPROTONOSUPPORT && /* incompatible feature */
- r != -EBUSY && /* unclean shutdown */
- r != -ESHUTDOWN && /* already archived */
- r != -EIO /* IO error, including SIGBUS on mmap */)
+ if (!IN_SET(r,
+ -EBADMSG, /* corrupted */
+ -ENODATA, /* truncated */
+ -EHOSTDOWN, /* other machine */
+ -EPROTONOSUPPORT, /* incompatible feature */
+ -EBUSY, /* unclean shutdown */
+ -ESHUTDOWN, /* already archived */
+ -EIO, /* IO error, including SIGBUS on mmap */
+ -EIDRM /* File has been deleted */))
return r;
if ((flags & O_ACCMODE) == O_RDONLY)
/* The file is corrupted. Rotate it away and try it again (but only once) */
l = strlen(fname);
- if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
+ if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
(int) l - 8, fname,
- (unsigned long long) now(CLOCK_REALTIME),
+ now(CLOCK_REALTIME),
random_u64()) < 0)
return -ENOMEM;
if (r < 0)
return -errno;
+ /* btrfs doesn't cope well with our write pattern and
+ * fragments heavily. Let's defrag all files we rotate */
+
+ (void) chattr_path(p, false, FS_NOCOW_FL);
+ (void) btrfs_defrag(p);
+
log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
return journal_file_open(fname, flags, mode, compress, seal,
if (o->object.flags & OBJECT_COMPRESSION_MASK) {
#if defined(HAVE_XZ) || defined(HAVE_LZ4)
- size_t rsize;
+ size_t rsize = 0;
r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);