--- /dev/null
+*.o
+*.coquet
+a.out
--- /dev/null
+{
+ "cSpell.ignoreWords": [
+ "dbname"
+ ],
+ "cSpell.words": [
+ "HEAPERR",
+ "LMODE",
+ "NURS"
+ ],
+ "files.associations": {
+ "coquet.h": "c"
+ }
+}
\ No newline at end of file
--- /dev/null
+TARGET = coquet
+LIBS =
+CC = gcc
+CFLAGS = -g -Wall --std=c99
+FILES = main.c posix.c util.c coquet.c
+
+.PHONY: default all clean
+
+default: $(TARGET)
+all: default
+
+OBJECTS = $(patsubst %.c, %.o, $(FILES))
+HEADERS = $(wildcard *.h)
+
+%.o: %.c $(HEADERS)
+ $(CC) $(CFLAGS) -c $< -o $@
+
+.PRECIOUS: $(TARGET) $(OBJECTS)
+
+$(TARGET): $(OBJECTS)
+ $(CC) $(OBJECTS) -Wall $(LIBS) -o $@
+
+clean:
+ -rm -f *.o
+ -rm -f $(TARGET)
\ No newline at end of file
--- /dev/null
+#ifndef CONSTANTS_H
+#define CONSTANTS_H
+
+#include <stdint.h>
+
+/* Selectors for the on-disk files belonging to one database; passed as
+ * which_file to the VFS open/close/read/write calls.
+ */
+#define COQUET_FILE_MAIN 0
+#define COQUET_FILE_OLD 1
+#define COQUET_FILE_TMP 2
+
+/* Return codes shared by the library and the VFS layer. COQUET_RET_OK
+ * is success. COQUET_UNUSED_ERROR is the slot coquet_error_string
+ * substitutes for codes outside 0..COQUET_LAST_ERROR.
+ */
+#define COQUET_RET_OK 0
+#define COQUET_RET_VFSERR 1
+#define COQUET_RET_HEAPERR 2
+#define COQUET_RET_LOCKED 3
+#define COQUET_UNUSED_ERROR 4
+#define COQUET_LAST_ERROR 4
+
+/* Creation modes. NOTE(review): not referenced by any code visible here;
+ * presumably select open-existing / create / either -- confirm.
+ */
+#define COQUET_CMODE_OPEN 0
+#define COQUET_CMODE_CREATE 1
+#define COQUET_CMODE_EITHER 2
+
+/* Lock identifiers, passed as which_lock to the VFS lock call.
+ * COQUET_LOCK_LAST is the highest valid identifier.
+ */
+#define COQUET_LOCK_WRITE 0
+#define COQUET_LOCK_ALIVE 1
+#define COQUET_LOCK_NURS_A 2
+#define COQUET_LOCK_NURS_B 3
+#define COQUET_LOCK_LAST 3
+
+/* Lock modes, passed as lock_mode to the VFS lock call: exclusive,
+ * shared, and unlock respectively.
+ */
+#define COQUET_LMODE_EXCL 0
+#define COQUET_LMODE_SHARE 1
+#define COQUET_LMODE_UN 2
+
+/* Boolean flag type used in the VFS interface; zero is false. */
+typedef int bool_t;
+
+#endif
--- /dev/null
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <string.h>
+#include "constants.h"
+#include "coquet.h"
+#include "util.h"
+
+extern vfs_t vfs_posix;
+
+/* VFS null is used as a placeholder prior to initialisation, so that
+ * subsequent tidies can be clean. init_vfs installs it when make()
+ * fails, and finish_vfs treats a NULL .make as "nothing to tear down".
+ * Every entry is NULL, so none of them may be called.
+ */
+vfs_t vfs_null = {
+ .make = NULL,
+ .start = NULL,
+ .get_error_text = NULL,
+ .lock = NULL,
+ .open = NULL,
+ .close = NULL,
+ .write = NULL,
+ .read = NULL,
+ .finish = NULL
+};
+
+/* Install the POSIX VFS on cq, allocate its private payload and start
+ * it on basename.
+ *
+ * Returns COQUET_RET_HEAPERR (after swapping in vfs_null) when the
+ * payload cannot be allocated; otherwise returns whatever the VFS
+ * start() call reports, COQUET_RET_OK on success.
+ */
+static int init_vfs(coquet_t *cq, char *basename) {
+    void *payload;
+
+    cq->vfs_funcs = vfs_posix;
+
+    /* Payload allocation: NULL means the heap is exhausted. */
+    payload = (cq->vfs_funcs.make)();
+    cq->vfs_data = payload;
+    if(!payload) {
+        cq->vfs_funcs = vfs_null;
+        return COQUET_RET_HEAPERR;
+    }
+
+    /* start() already speaks COQUET_RET_*, so pass its verdict through. */
+    return (cq->vfs_funcs.start)(cq->vfs_data,basename);
+}
+
+/* Initialise the whole library on the database named by basename.
+ * cq must point to a valid memory area for a coquet_t, but its contents
+ * need not be initialised. Returns COQUET_RET_OK on success or the
+ * COQUET_RET_* code of the first stage that failed.
+ */
+int coquet_init(coquet_t *cq, char * basename) {
+    /* The VFS is currently the only stage to bring up. */
+    return init_vfs(cq,basename);
+}
+
+/* Tear down the VFS layer and release its payload. A NULL make entry
+ * marks a VFS that is absent or already finished, in which case this
+ * is a no-op. Returns COQUET_RET_OK, or the code reported by the VFS
+ * finish() call if it fails (the VFS is then left marked as live).
+ */
+static int finish_vfs(coquet_t *cq) {
+    int status;
+
+    if(cq->vfs_funcs.make == NULL) {
+        return COQUET_RET_OK;
+    }
+
+    status = (cq->vfs_funcs.finish)(cq->vfs_data);
+    if(status == COQUET_RET_OK) {
+        /* Remember that teardown has happened so a repeat call is harmless. */
+        cq->vfs_funcs.make = NULL;
+    }
+    return status;
+}
+
+/* Tear down the whole library. Idempotent to allow crash recovery: once
+ * the VFS has been finished, further calls return COQUET_RET_OK without
+ * doing anything. Returns the failing stage's COQUET_RET_* code otherwise.
+ */
+int coquet_finish(coquet_t *cq) {
+    /* The VFS is currently the only stage to take down. */
+    return finish_vfs(cq);
+}
+
+/* Fixed message for each COQUET_RET_* code, indexed by the code itself.
+ * coquet_error_string copies entries out of this table.
+ */
+char * error_strings[COQUET_UNUSED_ERROR+1] = {
+ /* COQUET_RET_OK */ "Ok",
+ /* COQUET_RET_VFSERR */ "Unknown VFS Error",
+ /* COQUET_RET_HEAPERR */ "Heap allocation failed",
+ /* COQUET_RET_LOCKED */ "Locked",
+ /* COQUET_UNUSED_ERROR */ "No such error code"
+};
+
+/* Return error string for most recent error. Returned string is owned by
+ * caller and must be freed. May return NULL if heap is exhausted.
+ */
+char * coquet_error_string(coquet_t *cq, int error) {
+ if(error > COQUET_LAST_ERROR || error < 0) {
+ error = COQUET_UNUSED_ERROR;
+ }
+ if(error == COQUET_RET_VFSERR) {
+ return (cq->vfs_funcs.get_error_text)(cq->vfs_data);
+ } else {
+ return strdup(error_strings[error]);
+ }
+}
--- /dev/null
+#ifndef COQUET_H
+#define COQUET_H
+
+#include "constants.h"
+#include "vfs.h"
+
+/* Library handle. Filled in by coquet_init and released by
+ * coquet_finish.
+ */
+typedef struct coquet {
+
+ /* VFS */
+ vfs_t vfs_funcs; /* function table of the active VFS implementation */
+ void *vfs_data; /* opaque payload from vfs_funcs.make, passed to every VFS call */
+
+} coquet_t;
+
+/* Initialise whole library. Returns COQUET_RET_OK on success, or another
+ * COQUET_RET_* code (COQUET_RET_HEAPERR on malloc failure). cq must
+ * point to valid memory area for coquet_t, but does not require any
+ * content.
+ */
+int coquet_init(coquet_t *cq, char * basename);
+
+/* Tear down whole library. Idempotent to allow crash recovery. */
+int coquet_finish(coquet_t *cq);
+
+/* Return error string for the given error code. Returned string is owned
+ * by caller and must be freed. May return NULL if heap is exhausted.
+ */
+char * coquet_error_string(coquet_t *cq, int error);
+
+#endif
--- /dev/null
+Introduction
+============
+
+Coquet is a from-scratch design of an in-process database which takes design elements from both traditional SQL databases and append-only technologies.
+
+It can be described as an "append-mainly" database. From append-only designs it ensures that:
+
+* any copy of the file, arbitrarily truncated is a valid coquet file from some point in the past;
+
+* reads don't interfere with writes in any way, always seeing a consistent snapshot of the data.
+
+However, it implements a standard collection of table and index trees, like SQL, and takes considerably more pains than is typical to rein in the growth from append-only implementations.
+
+Unlike SQL, however, coquet directly exposes the primitive operations and requires the developer to plan queries. In this developer's experience, query planners rapidly degenerate into a game of "telephone", where the developer has a certain plan in mind and needs to second-guess the query planner's internals, so that it plans as the developer intends. coquet cuts out these imperfect transformations, and lets the developer specify the plan. That a developer must explicitly write the query method moves the query plan into code, for maintainability, and exposes weaknesses at the time of writing.
+
+coquet is also designed to support arbitrary user-created queries with little danger, including APIs to implement fair-share and timeout algorithms to prevent queries taking too many resources.
+
+coquet *does* make changes to the database file other than at the end, making it *not* append-only. However, it is guaranteed that these changes are such that at any moment any other process can read or copy the database file, even without locking it first, and receive a valid database state at some moment in time.
+
+The non-append actions relate to the nursery-area, a journal to improve write efficiency in various ways.
+
+The nursery-area is at a fixed location near the start of the file. It is a write-ahead log, designed in such a way that reading its data, even *during* updates to the log, gives you the valid log state at some point in the near past, at least at the most recent commit which has been followed by a filesystem sync, perhaps also including unsynced commits -- and therefore always a valid database at that point (or later).
+
+The nursery-area could have been a separate file. However, it was deemed more important that a single file contain all persisted state for ease of administration.
+
+As a write-ahead log, the nursery does place a performance obligation on readers. Again, given our pessimistic approach to systems administration, the default implementation does not use exotic shared memory to speed this up, but takes the performance hit.
+
+Filesystem Guarantees
+=====================
+
+coquet can work with various levels of filesystem guarantees, which trade off against performance. Even in its most liberal mode it requires very little from the filesystem:
+
+1. that writes to different pages do not interfere;
+
+2. that writes of an entire valid block, once read by one process as having this new value never revert to their old value, even before a sync.
+
+It acknowledges that:
+
+1. writes can be torn;
+2. while torn, writes may contain garbage.
+
+It *does* require that once a block is persisted, in the sense that a read picks up a new value, it never again picks up the old value (assuming the coincidence of the old value to garbage is remote).
+
+[XXX shared memory]
+[XXX nfs etc]
+
+Files on Disk
+=============
+
+Each coquet database can cause the creation of up to three files.
+
+dbname.coquet -- the current database
+
+dbname.coquet.old -- this was once the current database, but it has since been compacted (into the current dbname.coquet) and some queries are ongoing which still use this old version. As soon as those queries complete, this file is deleted. If those queries should crash, it is tidied up by the next query to the database (of any type).
+
+dbname.coquet.tmp -- this is a new, compacted version of the database in preparation. It is not live, and only used during ongoing compaction. When compaction is complete, an atomic change will rename it dbname.coquet.
+
+Therefore, for almost all cases (including backups and replication purposes), only the dbname.coquet file is relevant, and dbname.coquet.old and dbname.coquet.tmp can be ignored.
+
+Note that very long queries place compaction into a degraded mode as a *.old file already exists. In degraded mode, compaction repeatedly prepares *.tmp files and discards them, so that when the queries finally terminate, the switchover is a current compaction. In degraded mode, if no readers have locked the live *.coquet file, the switchover is performed anyway.
+
+Locking
+=======
+
+Three guarantees are achieved by locking:
+
+1. No two writes can be live simultaneously.
+2. An old database file cannot be deleted while operations are ongoing.
+3. A nursery half is not deleted on flushing while being read by another process.
+
+The *write lock* is exclusively acquired by a writer before writing.
+
+The *file lock* is acquired shared by writers and readers before executing. The rename step acquires this lock exclusively before renaming. If it cannot be acquired, the rename step is left for a future operation rather than waited upon.
+
+One of the pair of *nursery locks* is acquired exclusively by writers while deleting that (non-live) nursery half for reuse (see later). This is almost never contended.
+
+Deadlocks are avoided by having a defined order of lock acquisition.
+
+File Structure
+==============
+
+The file begins with a superblock, describing various invariant settings for the database (such as block size). This is protected by a hash value. If the hash doesn't match the page contents (with the hash field zeroed) it is as if it doesn't exist. After the data in the superblock is the lock area. This is part of the page which operating-systems lacking discretionary locks can use for range-based mandatory locking; it is never read or written to after creation (the superblock read is short, for just this reason).
+
+Next is the nursery area. This is a fixed sized region for persisting small writes before incorporating them into the tail of the file. It aids performance by allowing multiple, temporally-adjacent writes to the same page to be eliminated before committing. The nursery is divided into two halves. At any moment in time exactly one nursery area is active and the other unused. When one half of the nursery area is full it is compacted into the other and the live half switched. When compaction does not yield sufficient gains, its contents are appended to the main database.
+
+After the nursery area is the main file area, written in standard append-only form, ie with a terminal root page and only backward page links in what the LISP-world calls a "linear" data-structure (though it's actually a directed acyclic graph).
+
+Nursery Operation
+=================
+
+The purpose of the nursery area is to prevent a large number of writes to hot pages making a database too large, since once in the main database file, those multiple writes will all persist until compaction. The operation of writing from the nursery to the main database, and so emptying it is known as flushing.
+
+Should a commit be too big for the nursery area, the nursery is first flushed, and the write performed directly against the main part of the database.
+
+The nursery is written in two halves. At any moment, one of these is live. When the live half becomes full, a compaction is performed on that half, the results being written to the other half. Then the halves are switched. This process is known as switching. If switching doesn't sufficiently compress the data, the nursery is flushed.
+
+Pages enter the nursery batched into commits, known as a book. The pages of a book are written in the order they will end up in the database (if they reach it), ie with only backward links, *except* the final page, which is always the root, and is placed at the physical *head* of the book. Most of the code treats it as if it had been placed at the end. It is physically at the head in order that the code to read the nursery can check for validity of books, their boundaries, and so on.
+
+The contents of a book are protected by two hashes in the root node. The first covers the contents of the root node itself. If the hash doesn't match, then the book is not valid. The second covers all the data in the book. Similarly, no match means the book is not valid. These hashes are used to ensure that a book is definitively present-or-not (and not merely half-written) in the event of crashes, without a slow filesystem sync.
+
+The root page in the first book of each nursery half also has a value indicating which half is live. This integer is increased at each switch. During switching, the new half is first written with this book having a zero value. The final go-live operation of a switch is to update this page with the new value. If the write should fail prior to this point -- or during it such that nonsense is left behind (detected by the hash) -- it is as if the operation did not take place.
+
+Pages are stored in the nursery in a compressed format. During many writes to similar areas, a few pages "leading to the root" of the B-trees will be updated over and again, changing the value of a single pointer. To avoid such updates rapidly filling the nursery area, the compressed format indicates the source page and a single delta to a region of that page.
+
+Nursery pages must be stored in memory by all processes. Often it is this which effectively limits the size of the nursery. On read, a process reads the entire committed nursery and builds its in-memory equivalents. Although nursery pages are often in the file cache, the slowness of this process can also effectively limit nursery size. A read with knowledge of recent, earlier nursery state can use that knowledge, inspection of the current integers in the head of each half, and a read of the first previously-invalid block to determine whether it is up to date and if not, often apply a delta to that knowledge rather than a full reload.
+
+One of the pair of *nursery locks* is momentarily acquired exclusively by writers before writing into a new nursery half. This ensures that no very slow readers are still using this half. If the lock cannot be acquired, the write process abandons the switch and flushes direct to the database. This should be extremely rare, as it would require a nursery read to extend over two nursery switches, but is necessary for correctness. If no reads are occurring at this point certainly no more will occur, as during acquisition the other half is live and synced, and so new processes will always read from the live half.
+
+B-Tree Structure
+================
+
+Data is persisted in a series of B-trees. Each B-tree has a fixed key and value schema. B-trees are assembled into "tables" which include a data tree and potentially multiple index trees. These trees are organised by the table B-tree.
+
+Among the types of values for a tree is a pointer type which points to a root page of another B-tree (this type is not available to end-users). The root tree is a single, distinguished B-tree. It acts as a namespace to table trees, and root for the file. Therefore, the path from the root to the data is:
+
+1. through the root B-tree to the table;
+2. through the table B-tree to the data or index;
+3. through the data or index to the data.
+
+The schema of the root and table B-trees is fixed. The schema of data and index B-trees is stored as a field in their table B-tree.
+
--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include "coquet.h"
+#include "vfs.h"
+
+/* Abort the program if error_code is not COQUET_RET_OK.
+ *
+ * On error: prints the library's message for error_code to stderr (or
+ * "Heap exhausted" if the message itself could not be allocated), tears
+ * the library down with coquet_finish, and exits with status 1. Returns
+ * normally only when error_code is COQUET_RET_OK.
+ */
+void bail(coquet_t * cq, int error_code) {
+    char *msg;
+
+    if(error_code == COQUET_RET_OK) {
+        return;
+    }
+    msg = coquet_error_string(cq,error_code);
+    if(msg) {
+        fprintf(stderr,"%s\n",msg);
+        free(msg);
+    } else {
+        /* Message allocation failed; still terminate the line. */
+        fprintf(stderr,"Heap exhausted\n");
+    }
+    /* Release library resources on every error path before exiting. */
+    coquet_finish(cq);
+    exit(1);
+}
+
+/* Smoke test for the VFS layer: initialise on base name "test", open the
+ * main file (creating it if needed), write 5 bytes of "hello" at offset
+ * 20, read 10 bytes back from offset 18, take the ALIVE lock
+ * exclusively, dump the bytes read, then unlock, close and shut down.
+ * Any failing step ends the program through bail().
+ */
+int main() {
+    coquet_t db;
+    char bytes[10];
+
+    bail(&db,coquet_init(&db,"test"));
+    bail(&db,(db.vfs_funcs.open)(db.vfs_data,COQUET_FILE_MAIN,1));
+    bail(&db,(db.vfs_funcs.write)(db.vfs_data,COQUET_FILE_MAIN,"hello",20,5));
+    bail(&db,(db.vfs_funcs.read)(db.vfs_data,COQUET_FILE_MAIN,bytes,18,10));
+    bail(&db,(db.vfs_funcs.lock)(db.vfs_data,COQUET_LOCK_ALIVE,
+                                 COQUET_LMODE_EXCL,1));
+
+    /* One line per byte: index, then the byte's numeric value. */
+    for(int k=0;k<10;k++) {
+        printf("%d %d\n",k,bytes[k]);
+    }
+
+    bail(&db,(db.vfs_funcs.lock)(db.vfs_data,COQUET_LOCK_ALIVE,
+                                 COQUET_LMODE_UN,1));
+    bail(&db,(db.vfs_funcs.close)(db.vfs_data,COQUET_FILE_MAIN));
+    bail(&db,coquet_finish(&db));
+    return 0;
+}
--- /dev/null
+#define _GNU_SOURCE
+#include <stdio.h> /* During testing */
+#include <stdlib.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+#include <libgen.h>
+#include <sys/file.h>
+#include "vfs.h"
+#include "coquet.h"
+#include "util.h"
+
+/* Private state of the POSIX VFS; allocated by posix_make and freed by
+ * posix_finish.
+ */
+struct posix_data {
+ char * filename; /* heap copy of the base name given to posix_start */
+ char * dirname_buf, * dirname; /* dirname_buf: heap copy handed to dirname(); dirname: its result */
+ int seen_error; /* non-zero once set_error has been called */
+ char * error_text; /* heap text of latest error, NULL if none or allocation failed */
+ int main_fd, tmp_fd, old_fd; /* descriptor per COQUET_FILE_*, -1 when not open */
+};
+
+/* Set message to be returned by posix_get_error_text. Passed string
+ * remains owned by caller. If use_errno is true, the text of the errno
+ * value current on entry is appended after ": ".
+ *
+ * errno is captured first because releasing the previous message with
+ * free() is not guaranteed to leave it untouched on every libc. If the
+ * new message cannot be allocated, error_text is left NULL, which
+ * posix_get_error_text reports as a heap error.
+ */
+static void set_error(struct posix_data * pd, char *error,
+                      bool_t use_errno) {
+    int saved_errno = errno;
+
+    pd->seen_error = 1;
+    free(pd->error_text); /* free(NULL) is a no-op */
+    if(use_errno) {
+        pd->error_text = cq_message("%s: %s",error,strerror(saved_errno));
+    } else {
+        pd->error_text = strdup(error);
+    }
+}
+
+static void * posix_make() {
+ struct posix_data * pd;
+
+ pd = malloc(sizeof(struct posix_data));
+ pd->seen_error = 0;
+ pd->error_text = NULL;
+ pd->filename = pd->dirname_buf = NULL;
+ pd->main_fd = pd->tmp_fd = pd->old_fd = -1;
+ return pd;
+}
+
+/* Record the base name for later calls and work out its parent
+ * directory (needed by sync_dir). filename remains owned by the caller;
+ * private copies are taken. Returns COQUET_RET_HEAPERR if either copy
+ * cannot be allocated, COQUET_RET_OK otherwise. Copies already taken
+ * are released by posix_finish.
+ */
+static int posix_start(void * vfs_data, char *filename) {
+    struct posix_data * pd = (struct posix_data *)vfs_data;
+
+    pd->filename = strdup(filename);
+    if(pd->filename == NULL) {
+        return COQUET_RET_HEAPERR;
+    }
+    /* dirname() may modify its argument, so give it a scratch copy. */
+    pd->dirname_buf = strdup(pd->filename);
+    if(pd->dirname_buf == NULL) {
+        return COQUET_RET_HEAPERR;
+    }
+    pd->dirname = dirname(pd->dirname_buf);
+    return COQUET_RET_OK;
+}
+
+/* Release the payload created by posix_make together with the strings
+ * it owns. Always returns COQUET_RET_OK.
+ */
+static int posix_finish(void *vfs_data) {
+    struct posix_data * state = (struct posix_data *)vfs_data;
+
+    /* free(NULL) is a no-op, so unset members need no guard. */
+    free(state->error_text);
+    free(state->filename);
+    free(state->dirname_buf);
+    free(state);
+    return COQUET_RET_OK;
+}
+
+static char * posix_get_error_text(void * vfs_data) {
+ struct posix_data * pd = (struct posix_data *)vfs_data;
+
+ if(!pd->seen_error) {
+ return strdup("No VFS error occurred");
+ }
+ if(pd->error_text == NULL) {
+ return NULL; /* heap error */
+ }
+ return strdup(pd->error_text);
+}
+
+/* Build the on-disk path for one of the COQUET_FILE_* selectors by
+ * appending the matching suffix to the base name. The result is heap
+ * allocated and owned by the caller. Returns NULL for an unknown
+ * selector or if allocation fails.
+ */
+static char * filename(struct posix_data *pd, int which_file) {
+    const char *suffix;
+
+    if(which_file == COQUET_FILE_MAIN) {
+        suffix = ".coquet";
+    } else if(which_file == COQUET_FILE_OLD) {
+        suffix = ".coquet.old";
+    } else if(which_file == COQUET_FILE_TMP) {
+        suffix = ".coquet.tmp";
+    } else {
+        return NULL;
+    }
+    return cq_message("%s%s",pd->filename,suffix);
+}
+
+/* Map a COQUET_FILE_* selector to the address of the descriptor slot
+ * that tracks it, or NULL if the selector is not recognised.
+ */
+static int * file_fd(struct posix_data *pd, int which_file) {
+    if(which_file == COQUET_FILE_MAIN) {
+        return &pd->main_fd;
+    }
+    if(which_file == COQUET_FILE_OLD) {
+        return &pd->old_fd;
+    }
+    if(which_file == COQUET_FILE_TMP) {
+        return &pd->tmp_fd;
+    }
+    return NULL;
+}
+
+/* fsync the directory containing the database files. On failure records
+ * an error message and returns COQUET_RET_VFSERR; returns COQUET_RET_OK
+ * otherwise. The directory descriptor is closed on every path, including
+ * when fsync fails.
+ */
+static int sync_dir(struct posix_data * pd) {
+    int r,fd;
+
+    if(pd->dirname == NULL) {
+        set_error(pd,"NULL dirname",0);
+        return COQUET_RET_VFSERR;
+    }
+
+    fd = open(pd->dirname,O_RDONLY);
+    if(fd==-1) {
+        set_error(pd,"Failed to open parent directory",1);
+        return COQUET_RET_VFSERR;
+    }
+
+    r = fsync(fd);
+    if(r==-1) {
+        /* Record the fsync error before close() can disturb errno. */
+        set_error(pd,"Failed to fsync parent directory",1);
+        close(fd);
+        return COQUET_RET_VFSERR;
+    }
+
+    r = close(fd);
+    if(r==-1) {
+        set_error(pd,"Failed to close parent directory",1);
+        return COQUET_RET_VFSERR;
+    }
+
+    return COQUET_RET_OK;
+}
+
+/* Open one of the database files read/write, creating it (mode 0666
+ * before umask) when allow_create is true, then sync the parent
+ * directory.
+ *
+ * Returns COQUET_RET_VFSERR with an error message recorded if the VFS
+ * has no base name, the selector is invalid, the file is already open,
+ * open() fails, or the directory sync fails. Returns COQUET_RET_HEAPERR
+ * if the path cannot be built. The descriptor is stored only on
+ * success; if the directory sync fails it is closed again and the more
+ * specific message recorded by sync_dir is kept.
+ */
+static int posix_open(void * vfs_data, int which_file,
+                      bool_t allow_create) {
+    struct posix_data * pd = (struct posix_data *)vfs_data;
+    char *path;
+    int r, fd, flags, *fd_field;
+
+    if(pd->filename == NULL) {
+        set_error(pd,"NULL filename",0);
+        return COQUET_RET_VFSERR;
+    }
+
+    flags = O_RDWR;
+    if(allow_create) {
+        flags |= O_CREAT;
+    }
+
+    fd_field = file_fd(pd,which_file);
+    if(fd_field == NULL) {
+        set_error(pd,"Invalid file enum value",0);
+        return COQUET_RET_VFSERR;
+    }
+    if(*fd_field != -1) {
+        set_error(pd,"File already open",0);
+        return COQUET_RET_VFSERR;
+    }
+
+    path = filename(pd,which_file);
+    if(path == NULL) {
+        return COQUET_RET_HEAPERR;
+    }
+
+    fd = open(path,flags,0666);
+    if(fd==-1) {
+        /* Capture errno before free() has a chance to alter it. */
+        set_error(pd,"open failed",1);
+        free(path);
+        return COQUET_RET_VFSERR;
+    }
+    free(path);
+
+    r = sync_dir(pd);
+    if(r != COQUET_RET_OK) {
+        /* sync_dir has already recorded why; do not leak the new fd. */
+        close(fd);
+        return r;
+    }
+
+    *fd_field = fd;
+    return COQUET_RET_OK;
+}
+
+/* Close the given file and mark its slot as closed. Returns
+ * COQUET_RET_VFSERR with a message recorded if the selector is invalid,
+ * the file is not open, or close() reports an error.
+ *
+ * The slot is reset to -1 whatever close() returns: the descriptor must
+ * not be reused after a close attempt, and leaving the old number in
+ * place would make a later posix_open fail with "File already open".
+ */
+static int posix_close(void * vfs_data, int which_file) {
+    struct posix_data * pd = (struct posix_data *)vfs_data;
+    int *fd_field, r;
+
+    fd_field = file_fd(pd,which_file);
+    if(fd_field == NULL || *fd_field == -1) {
+        set_error(pd,"File not open",0);
+        return COQUET_RET_VFSERR;
+    }
+    r = close(*fd_field);
+    *fd_field = -1; /* assignment does not touch errno */
+    if(r==-1) {
+        set_error(pd,"close failed",1);
+        return COQUET_RET_VFSERR;
+    }
+    return COQUET_RET_OK;
+}
+
+/* Write length bytes from data to the given file starting at offset.
+ * Short writes are continued until everything is written, and a write
+ * interrupted by a signal (EINTR) is retried. Returns COQUET_RET_VFSERR
+ * with a message recorded if the file is not open, the seek fails, or
+ * write() fails for any other reason.
+ */
+static int posix_write(void * vfs_data, int which_file, char * data,
+                       off_t offset, uint64_t length) {
+    struct posix_data * pd = (struct posix_data *)vfs_data;
+    int *fd;
+    ssize_t written;
+    off_t r_off;
+
+    fd = file_fd(pd,which_file);
+    if(fd == NULL || *fd == -1) {
+        set_error(pd,"File not open",0);
+        return COQUET_RET_VFSERR;
+    }
+
+    r_off = lseek(*fd,offset,SEEK_SET);
+    if(r_off == -1) {
+        set_error(pd,"seek failed",1);
+        return COQUET_RET_VFSERR;
+    }
+
+    while(length>0) {
+        written = write(*fd,data,length);
+        if(written==-1) {
+            if(errno == EINTR) {
+                continue; /* nothing was transferred; try again */
+            }
+            set_error(pd,"write failed",1);
+            return COQUET_RET_VFSERR;
+        }
+        data += written;
+        length -= (uint64_t)written;
+    }
+
+    return COQUET_RET_OK;
+}
+
+/* Read length bytes from the given file starting at offset into data.
+ * Short reads are continued, and a read interrupted by a signal (EINTR)
+ * is retried. If end-of-file is reached first, the remainder of the
+ * buffer is filled with zeroes, so reads beyond the end of the file
+ * yield all zeroes. Returns COQUET_RET_VFSERR with a message recorded
+ * if the file is not open, the seek fails, or read() fails for any
+ * other reason.
+ */
+static int posix_read(void * vfs_data, int which_file, char * data,
+                      uint64_t offset, uint64_t length) {
+    struct posix_data * pd = (struct posix_data *)vfs_data;
+    int *fd;
+    ssize_t got;
+    off_t r_off;
+
+    fd = file_fd(pd,which_file);
+    if(fd == NULL || *fd == -1) {
+        set_error(pd,"File not open",0);
+        return COQUET_RET_VFSERR;
+    }
+
+    r_off = lseek(*fd,(off_t)offset,SEEK_SET);
+    if(r_off == -1) {
+        set_error(pd,"seek failed",1);
+        return COQUET_RET_VFSERR;
+    }
+
+    while(length>0) {
+        got = read(*fd,data,length);
+        if(got==-1) {
+            if(errno == EINTR) {
+                continue; /* nothing was transferred; try again */
+            }
+            set_error(pd,"read failed",1);
+            return COQUET_RET_VFSERR;
+        }
+        if(got==0) {
+            /* EOF: pad what is left of the caller's buffer. */
+            memset(data,0,length);
+            break;
+        }
+        data += got;
+        length -= (uint64_t)got;
+    }
+
+    return COQUET_RET_OK;
+}
+
+/* Ensure the main file is at least size bytes long, extending it with
+ * zeroes if it is shorter. main_fd must be open.
+ *
+ * The current length is checked first: if the file is already long
+ * enough nothing is written, so the existing byte at size-1 is never
+ * overwritten. Returns COQUET_RET_VFSERR with a message recorded if a
+ * seek or the write fails, COQUET_RET_OK otherwise.
+ */
+static int min_size(struct posix_data *pd, off_t size) {
+    int r;
+    off_t r_off;
+    char buf[1] = {0};
+
+    if(size <= 0) {
+        return COQUET_RET_OK;
+    }
+
+    /* Seeking to the end reports the current file length. */
+    r_off = lseek(pd->main_fd,0,SEEK_END);
+    if(r_off == -1) {
+        set_error(pd,"seek failed",1);
+        return COQUET_RET_VFSERR;
+    }
+    if(r_off >= size) {
+        return COQUET_RET_OK;
+    }
+
+    /* Writing one byte at size-1 extends the file; the gap reads as zero. */
+    r_off = lseek(pd->main_fd,size-1,SEEK_SET);
+    if(r_off == -1) {
+        set_error(pd,"seek failed",1);
+        return COQUET_RET_VFSERR;
+    }
+
+    r = write(pd->main_fd,buf,1);
+    if(r == -1) {
+        set_error(pd,"write failed",1);
+        return COQUET_RET_VFSERR;
+    }
+
+    return COQUET_RET_OK;
+}
+
+/* Byte spacing between the lock ranges at the start of the main file. */
+#define LOCK_BLOCK 512
+
+/* Lock or unlock one of the COQUET_LOCK_* locks using open file
+ * description locks on the main file. Lock number k covers 16 bytes
+ * starting at k*LOCK_BLOCK.
+ *
+ * lock_mode is one of COQUET_LMODE_EXCL, COQUET_LMODE_SHARE or
+ * COQUET_LMODE_UN. If wait is true the call blocks until the lock is
+ * granted; otherwise a conflicting lock held elsewhere yields
+ * COQUET_RET_LOCKED. All other failures (main file not open, bad lock
+ * number or mode, I/O or fcntl error) record a message and return
+ * COQUET_RET_VFSERR. Arguments are validated before the file is touched.
+ */
+static int posix_lock(void * vfs_data, int which_lock, int lock_mode,
+                      bool_t wait) {
+    struct posix_data * pd = (struct posix_data *)vfs_data;
+    int r;
+    struct flock flk;
+
+    if(pd->main_fd==-1) {
+        set_error(pd,"main file closed during locking",0);
+        return COQUET_RET_VFSERR;
+    }
+    if(which_lock < 0 || which_lock > COQUET_LOCK_LAST) {
+        set_error(pd,"bad lock number",0);
+        return COQUET_RET_VFSERR;
+    }
+
+    /* Zero everything, including l_pid, which OFD locks require to be 0. */
+    memset(&flk,0,sizeof flk);
+    switch(lock_mode) {
+        case COQUET_LMODE_EXCL: flk.l_type = F_WRLCK; break;
+        case COQUET_LMODE_SHARE: flk.l_type = F_RDLCK; break;
+        case COQUET_LMODE_UN: flk.l_type = F_UNLCK; break;
+        default:
+            set_error(pd,"bad lock mode",0);
+            return COQUET_RET_VFSERR;
+    }
+    flk.l_whence = SEEK_SET;
+    flk.l_start = (off_t)which_lock * LOCK_BLOCK;
+    flk.l_len = 16;
+
+    /* Lock region needs to exist. We can append zeroes.*/
+    r = min_size(pd,(COQUET_LOCK_LAST+1)*LOCK_BLOCK);
+    if(r != COQUET_RET_OK) {
+        return r;
+    }
+
+    r = fcntl(pd->main_fd,wait ? F_OFD_SETLKW : F_OFD_SETLK,&flk);
+    if(r == -1) {
+        /* Non-blocking request lost to a conflicting lock: not an error. */
+        if(!wait && (errno == EAGAIN || errno == EACCES)) {
+            return COQUET_RET_LOCKED;
+        }
+        set_error(pd,"flock failed",1);
+        return COQUET_RET_VFSERR;
+    }
+
+    return COQUET_RET_OK;
+}
+
+/* Function table exposing the POSIX implementation above; every vfs_t
+ * entry is bound to the static function of the matching name.
+ */
+vfs_t vfs_posix = {
+ .make = posix_make,
+ .start = posix_start,
+ .get_error_text = posix_get_error_text,
+ .lock = posix_lock,
+ .open = posix_open,
+ .close = posix_close,
+ .write = posix_write,
+ .read = posix_read,
+ .finish = posix_finish
+};
--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include "coquet.h"
+
+/* printf-style formatting into a freshly allocated string (pattern from
+ * the snprintf man-page): one pass measures the output, a second pass
+ * writes it. The caller owns and frees the result. Returns NULL if
+ * formatting fails or memory cannot be allocated.
+ */
+char * cq_message(const char *fmt, ...) {
+    va_list args;
+    char *out;
+    size_t cap;
+    int len;
+
+    /* Pass 1: a NULL, zero-sized buffer makes vsnprintf only measure. */
+    va_start(args, fmt);
+    len = vsnprintf(NULL, 0, fmt, args);
+    va_end(args);
+    if (len < 0) {
+        return NULL;
+    }
+
+    cap = (size_t) len + 1; /* room for the terminating '\0' */
+    out = malloc(cap);
+    if (out == NULL) {
+        return NULL;
+    }
+
+    /* Pass 2: format for real into the exactly-sized buffer. */
+    va_start(args, fmt);
+    len = vsnprintf(out, cap, fmt, args);
+    va_end(args);
+    if (len < 0) {
+        free(out);
+        out = NULL;
+    }
+
+    return out;
+}
--- /dev/null
+#ifndef COQUET_UTIL_H
+#define COQUET_UTIL_H
+
+/* printf-style formatting into a newly allocated string. Caller must
+ * free the result. Returns NULL on formatting or allocation failure.
+ */
+char * cq_message(const char *fmt, ...);
+
+#endif
\ No newline at end of file
--- /dev/null
+#ifndef COQUET_VFS_H
+#define COQUET_VFS_H
+
+#include <unistd.h>
+#include <sys/types.h>
+#include "constants.h"
+
+/* Table of operations a VFS implementation provides. Every call after
+ * make receives the opaque vfs_data payload that make returned.
+ *
+ * Where a function returns an int it must be COQUET_RET_OK,
+ * COQUET_RET_VFSERR or COQUET_RET_HEAPERR unless docs here indicate
+ * otherwise.
+*/
+typedef struct vfs {
+
+ /* Returns the vfs_data payload, which will then be owned by the
+ * caller until passed to finish on shutdown. start is then called.
+ * This is split in two to allow errors to be reported in the regular
+ * way by start. May return NULL if heap allocation failed.
+ */
+ void * (*make)();
+
+ /* Initialise VFS layer. Use basename as filepath in all calls.
+ * basename = a/b/foo => a/b/foo.coquet, a/b/foo.coquet.tmp, etc.
+ * basename is still owned by caller.
+ */
+ int (*start)(void * vfs_data, char * basename);
+
+ /* Get text of most recent error for display, etc. Caller must free.
+ * SHOULD never be called if no error, but MUST return some
+ * placeholder text in that case. If no memory available, NULL may
+ * be returned.
+ */
+ char * (*get_error_text)(void * vfs_data);
+
+ /* Lock or unlock the given lock according to lock mode. which_lock
+ * is drawn from COQUET_LOCK_*. lock_mode is drawn from
+ * COQUET_LMODE_*. Requests to lock will always be from unlocked state,
+ * and vice-versa: direct transitions between shared and exclusive
+ * will never be requested. The VFS may use the lock area for this
+ * purpose. COQUET_FILE_MAIN will be open when called. If wait is true,
+ * the function waits for the lock; otherwise it returns
+ * COQUET_RET_LOCKED if the lock cannot be taken immediately.
+ */
+ int (*lock)(void * vfs_data, int which_lock, int lock_mode,
+ bool_t wait);
+
+ /* Open the given file. which_file is drawn from COQUET_FILE_*.
+ * If allow_create is true, the file is created if not present.
+ * The file is guaranteed not to be open when this function is called.
+ */
+ int (*open)(void * vfs_data, int which_file, bool_t allow_create);
+
+ /* Close the given file. which_file is drawn from COQUET_FILE_*. The
+ * file is guaranteed to be open when this function is called. If file
+ * is COQUET_FILE_MAIN, all locks must be dropped.
+ */
+ int (*close)(void * vfs_data, int which_file);
+
+ /* Write given data of given length at the given offset to the
+ * indicated file. which_file must be drawn from COQUET_FILE_*. If the
+ * offset is beyond the end of the file, the file should be extended
+ * to accommodate the write.
+ */
+ int (*write)(void * vfs_data, int which_file, char * data,
+ off_t offset, uint64_t length);
+
+ /* Read data of given length at the given offset from the indicated
+ * file. which_file must be drawn from COQUET_FILE_*. If the
+ * offset is beyond the end of the file, all zeroes must be returned.
+ * File will be open before call.
+ */
+ int (*read)(void * vfs_data, int which_file, char * data,
+ uint64_t offset, uint64_t length);
+
+ /* Finish VFS layer. Release OS resources (files etc), and free the
+ * passed vfs_data.
+ */
+ int (*finish)(void *vfs_data);
+
+} vfs_t;
+
+#endif