* ident covers only env vars specified with -E
* ident covers only arguments interpreter and (if present) script
*/
+
/*
- * Process structure:
- * client (C wrapper) connects to server
- * (including reading ack byte)
- * if fails or garbage
- * === acquire lock ===
- * makes new listening socket
- * makes first-instance socketpair
- * forks setup (script, sock fds indicated in env)
- * fd0, fd1, fd2: from-outer
- * other fd: call(client-end)(fake)
- * reaps setup (and reports error)
- * (implicitly releases lock)
- *
- * setup (pre-exec) fd0: null,
- * fd[12]: fd2-from-outer
- * env fds: listener, call(server-end)(fake)
- * close fd: lockfile
- * possibly clean env, argv
- *
- * setup (script) runs initialisation parts of the script
- * at prefork establishment point:
- * setup (pm) [1] opens syslog
- * forks for server
- * [2] exits
- *
- * server (pm) [1] [fd0: null],
- * [fd[12]: fd2-from-outer]
- * right away, forks init monitor
- * [2] closes outer caller fds and call(fake)
- * [server (pm)] fd[012]: null
- * other fds: listener, syslog
- * runs in loop accepting and forking,
- * reaping and limiting children (incl init monitor)
- * reports failures of monitors to syslog
- *
- * [client (C wrapper)] if client connect succeeds:
- * now fd: call(client-end)
- * sends message with: cmdline, env
- * sends fds
- *
- * [server (script)] accepts, forks subseq monitor
- *
- * monitor [1] [fd0: null]
- * (init [fd[12]: init: fd2-from-outer; subseq: null]
- * or errors: init: fd2; subseq: syslog
- * subseq) other fds: syslog, call(server-end)
- * sends ack byte
- * receives args, env, fds
- * forks executor
- *
- * executor sorts out fds:
- * fd0, fd1, fd2: from-outer
- * close fds: call(server-end)
- * retained fds: syslog
- *
- * sets cmdline, env
- * runs main part of script
- * exits normally
- *
- * [monitor] [fd[012]: null]
- * [fd[12]: init: fd2-from-outer; subseq: null]
- * [errors: init: fd2; subseq: syslog]
- * reaps executor
- * reports status via socket
- *
- * [client (C wrapper)] [fd0, fd1, fd2: from-outer]
- * [other fd: call(client-end)]
- * receives status, exits appropriately
- * (if was bad signal, reports to stderr, exits 127)
- */
+***************************************************************************
+\f
+ State during service execution, process parentage and key fds
+
+ CALLER
+ ||
+ ||
+ || listen watch-err/in
+ || call (accept) \ ,------2
+ || ,-----------------------------. SERVER -----0 WATCHER(C)
+ CLIENT 2--=fdpassed>=---------. \ || && | &&
+ (C) 1--=fdpassed>=---------. \ \ || inotify
+ 0--=fdpassed>=---------. \ \ \ || sockpath
+ \ \ \ \ ||
+ | | |\ | ||
+ | | | \ | ||
+ | \ | \ \ ||
+ \ \ \ \ MONITOR &
+ \ \ \ `12 || |
+ \ \ \ || |
+ \ \ \ || |execterm
+ \ \ \ || |
+ \ \ \ || |
+ \ \ 2 || |
+ \ 1 EXECUTOR
+ 0
+ ---- pipes, sockets
+ 012 descriptors
+ -==- fds shared
+ || process parentage
+ && session leader (daemon)
+ & process group leader
+
+***************************************************************************
+\f
+ Control flow and causality
+
+ CALLER
+ |
+ |fork/exec
+ |
+ CLIENT
+ |
+ attempt to connect, and read greeting
+ |failure? \success?
+ | \
+ tidy up stale /run entries *1 (continue from send_fds, below)
+ acquire lock
+ |
+ retry attempt to connect, and read greeting
+ |failure? \success?
+ | \
+ create listening socket release lock
+ | \
+ fork/daemonise *1
+ | `------------------.
+ | WATCHER(C) &&
+ |
+ make "fake" initial call socketpair (C)
+ | prefork-interp
+ fork/exec #########################################################
+ | `-------------. application
+ | # SCRIPT (setup)
+ | # |
+ | # script initialisation
+ | # | application
+ | # ########|#############################################
+ | # | prefork-interp
+ | # identify fds from environment (Perl)
+ | # open syslog
+ | # |
+ | # daemonise
+ | ,.....<....../ |
+ waitpid # fork for initial service
+ | # |child? |parent?
+ | # | |
+ | # | SCRIPT [server] &&
+ | # | |
+ | # | ** accept / event loop **
+ | # | accepted? \ \ \
+ | # | / \ watch\ \idle
+ | # | fork child \stderr\ \timeout?
+ | # | _________/ | | |
+ | # |/ |read? | |
+ | # SCRIPT [monitor] | eof?| |
+ | # setpgrp & | | |
+ | # | log msg | |
+ read ,....<.....send greeting | | |
+ greeting # | ___________________
+ | # |
+ release # |
+ lock *1 # |
+ | / # |
+ send fds.....>.... |
+ | # \receive fds
+ | # |
+ | # fork for executor (Perl)
+ | # |parent? \child? prefork-interp
+ | # | ######\############################
+ | # | # SCRIPT (executor) application
+ | # | # execute service
+ | # wait for read # |
+ | # (select) # terminates
+ | # | | # |
+ | # | # kernel closes execterm
+ | # | ,......<....../|
+ | # execterm? # |
+ | # | # zombie
+ | # | | ,......<...../
+ | # waitpid # _______________
+ | # | #
+ | ,....<....,..send status #
+ read status # ________________ #
+ _____________ #
+
+
+ ********** Or, if client is killed **********
+
+ | # | # execute service
+ terminates # wait for read # |
+ | # (select) # |
+ kernel # | | # |
+ closes call # | # |
+ \..>......_ | # |
+ _____________ # \|call? # |
+ # | # |
+ # kill whole pgrp... # killed
+ # | # zombie
+ # | | ,......<....../
+ # waitpid # _______________
+ # | #
+ # send exit status #
+ # _____SIGPIPE______ #
+
+ | - \ / process control flow
+ ... < > causes mediated by fds or other IPC etc.
+ && session leader (daemon)
+ & process group leader
+ # language/implementation boundary
+ *1 line continued elsewhere
+ event? condition
+ ______ process termination (after reaping, if shown)
+
+***************************************************************************
+\f
+ Sequence of events and fd plumbing.
+ NB INCOMPLETE - does not cover execterm, cleanup
+
+ client (C wrapper) connects to server
+ (including reading ack byte)
+ if fails or garbage
+ === acquires lock ===
+ makes new listening socket
+ makes watcher pipes
+ forks watcher and awaits
+ makes first-instance socketpair
+ forks setup (script, sock fds indicated in env)
+ fd0, fd1, fd2: from-outer
+ other fd: call(client-end)(fake)
+ reaps setup (and reports error)
+ (implicitly releases lock)
+
+ watcher fd[012]: watcher pipes
+ starts watch on socket path
+ sets stderr to line buffered
+ sets stdin to nonblocking
+ daemonises (one fork, becomes session leader)
+ when socket stat changes, quit
+
+ setup (pre-exec) fd0: null,
+ fd[12]: fd2-from-outer
+ env fds: listener, call(server-end)(fake),
+ watcher read, watcher write
+ close fd: lockfile
+ possibly clean env, argv
+
+ setup (script) runs initialisation parts of the script
+ at prefork establishment point:
+ setup (pm) [1] opens syslog
+ forks for server
+ [2] exits
+
+ server (pm) [1] [fd0: null],
+ [fd[12]: fd2-from-outer]
+ setsid
+ right away, forks init monitor
+ [2] closes outer caller fds and call(fake)
+ [server (pm)] fd[012]: null
+ other fds: listener, syslog
+ runs in loop accepting and forking,
+ reaping and limiting children (incl init monitor)
+ reports failures of monitors to syslog
+
+ [client (C wrapper)] if client connect succeeds:
+ now fd: call(client-end)
+ sends message with: cmdline, env
+ sends fds
+
+ [server (script)] accepts, forks subseq monitor
+
+ monitor [1] [fd0: null]
+ (init [fd[12]: init: fd2-from-outer; subseq: null]
+ or errors: init: fd2; subseq: syslog
+ subseq) other fds: syslog, call(server-end)
+ sends ack byte
+ receives args, env, fds
+ forks executor
+
+ executor sorts out fds:
+ fd0, fd1, fd2: from-outer
+ close fds: call(server-end)
+ retained fds: syslog
+
+ sets cmdline, env
+ runs main part of script
+ exits normally
+
+ [monitor] [fd[012]: null]
+ [fd[12]: init: fd2-from-outer; subseq: null]
+ [errors: init: fd2; subseq: syslog]
+ reaps executor
+ reports status via socket
+
+ [client (C wrapper)] [fd0, fd1, fd2: from-outer]
+ [other fd: call(client-end)]
+ receives status, exits appropriately
+ (if was bad signal, reports to stderr, exits 127)
+
+***************************************************************************
+\f
+*/
#include <arpa/inet.h>
+#include <uv.h>
+
#include "prefork.h"
const char our_name[] = "prefork-interp";
}
static int laundering;
+static int mode; // one of the MODE_* values below; assigned through the cmdinfos option table
+static int max_sockets = 100; // maximum entries in the run dir is 2x this
+
+static struct stat initial_stab; // stat() of our freshly bound listening socket; the watcher compares the socket path against it
+
+#define MODE_NORMAL 0 // default: try to connect to an existing server first
+#define MODE_KILL 'k' // remove the socket and lock files, then exit
+#define MODE_FRESH 'f' // skip connecting to an existing server (connect_existing returns early)
const struct cmdinfo cmdinfos[]= {
PREFORK_CMDINFOS
- { 0, 'U', 0, .iassignto= &laundering, .arg= 'U' },
+ { 0, 'U', 0, .iassignto= &laundering, .arg= 'U' },
+ { "kill", 0, 0, .iassignto= &mode, .arg= MODE_KILL }, // NOTE(review): presumably the long option --kill; stores MODE_KILL into mode
+ { 0, 'f', 0, .iassignto= &mode, .arg= MODE_FRESH }, // NOTE(review): presumably the short option -f; stores MODE_FRESH into mode
{ 0 }
};
r = sigaction(sig, &sa, 0);
if (r) diee("failed to reset signal handler while propagating %s",
signame);
-
+
sigset_t sset;
sigemptyset(&sset);
sigaddset(&sset, sig);
die("%s failed with weird wait status %d 0x%x", what, status, status);
}
+typedef struct { // one run-directory candidate collected by preclean()
+ char *name_hash; // directory entry name minus its leading 'l'/'s' character; heap-allocated, freed by preclean()
+ time_t atime; // st_atime of the matching "s" path, or 0 if that path did not exist
+} PrecleanEntry;
+
+// qsort comparator: orders PrecleanEntry records by name_hash (strcmp order).
+static int preclean_entry_compar_name(const void *av, const void *bv) {
+  const char *lhs = ((const PrecleanEntry *)av)->name_hash;
+  const char *rhs = ((const PrecleanEntry *)bv)->name_hash;
+  return strcmp(lhs, rhs);
+}
+
+// qsort comparator: orders PrecleanEntry records by ascending atime
+// (oldest first); equal times compare equal.
+static int preclean_entry_compar_atime(const void *av, const void *bv) {
+  const PrecleanEntry *lhs = av;
+  const PrecleanEntry *rhs = bv;
+  if (lhs->atime > rhs->atime) return +1;
+  if (lhs->atime < rhs->atime) return -1;
+  return 0;
+}
+
+// Returns the access time of s_path (lstat, so symlinks are not followed),
+// or 0 if nothing exists at that path.  Any other lstat failure is fatal.
+static time_t preclean_stat_atime(const char *s_path) {
+  struct stat sb;
+  if (lstat(s_path, &sb) == 0)
+    return sb.st_atime;
+  if (errno != ENOENT)
+    diee("pre-cleanup: stat socket (%s)", s_path);
+  return 0;
+}
+
+// Tidy the run directory before we create a new socket.
+//
+// Scans run_base for entries whose names start with 'l' (lock file) or
+// 's' (socket); the rest of the name is a hash identifying one server.
+// If more than max_sockets distinct hashes are present, the least
+// recently accessed ones (by the socket's st_atime; a missing socket
+// counts as time 0) are removed, socket and lock both, until
+// max_sockets would remain.  Each removal is done holding that entry's
+// lock and is skipped if the socket's atime changed meanwhile.
+//
+// A missing run directory is not an error.  Other failures are fatal.
+static void preclean(void) {
+  DIR *dir = opendir(run_base);
+  if (!dir) {
+    if (errno == ENOENT) return;
+    diee("pre-cleanup: open run dir (%s)", run_base);
+  }
+
+  PrecleanEntry *entries=0;
+  size_t avail_entries=0;
+  size_t used_entries=0;
+
+  struct dirent *de;
+  // errno is cleared before each readdir so that end-of-directory
+  // can be told apart from a read error afterwards.
+  while ((errno = 0, de = readdir(dir))) {
+    char c0 = de->d_name[0];
+    if (!(c0 == 'l' || c0 == 's')) continue;
+    char *name_hash = m_asprintf("%s", de->d_name+1);
+    char *s_path = m_asprintf("%s/s%s", run_base, name_hash);
+    time_t atime = preclean_stat_atime(s_path);
+    free(s_path); // only needed for the stat
+
+    if (avail_entries == used_entries) {
+      assert(avail_entries < INT_MAX / 4 / sizeof(PrecleanEntry));
+      avail_entries <<= 1;
+      avail_entries += 10;
+      PrecleanEntry *grown =
+        realloc(entries, avail_entries * sizeof(PrecleanEntry));
+      if (!grown) diee("pre-cleanup: allocate entry table");
+      entries = grown;
+    }
+    entries[used_entries].name_hash = name_hash;
+    entries[used_entries].atime = atime;
+    used_entries++;
+  }
+  if (errno) diee("pre-cleanup: read run dir (%s)", run_base);
+  closedir(dir);
+
+  // Nothing found: entries is still null, so do not hand it to qsort.
+  if (!used_entries) {
+    free(entries);
+    return;
+  }
+
+  // First we dedupe (after sorting by path)
+  qsort(entries, used_entries, sizeof(PrecleanEntry),
+        preclean_entry_compar_name);
+  PrecleanEntry *p, *q;
+  for (p=entries, q=entries; p < entries + used_entries; p++) {
+    if (q > entries && !strcmp(p->name_hash, (q-1)->name_hash)) {
+      // Same hash seen as both 'l' and 's': drop the duplicate record.
+      free(p->name_hash);
+      continue;
+    }
+    *q++ = *p;
+  }
+  used_entries = q - entries;
+
+  // Now maybe delete some things
+  //
+  // Actually this has an off-by-one error since we are about
+  // to create a socket, so the actual number of sockets is one more.
+  // But, *actually*, since there might be multiple of us running at once,
+  // we might have even more than that.  This doesn't really matter.
+  if (used_entries > max_sockets) {
+    // Only the surplus over the limit is evicted, oldest first.
+    size_t excess = used_entries - max_sockets;
+    qsort(entries, used_entries, sizeof(PrecleanEntry),
+          preclean_entry_compar_atime);
+    for (p=entries; p < entries + excess; p++) {
+      char *l_path = m_asprintf("%s/l%s", run_base, p->name_hash);
+      char *s_path = m_asprintf("%s/s%s", run_base, p->name_hash);
+      int lock_fd = flock_file(l_path);
+      // Recheck atime - we might have raced!
+      time_t atime = preclean_stat_atime(s_path);
+      if (atime != p->atime) {
+        // Raced.  This will leave us deleting too few things.  Whatever.
+      } else {
+        int r= unlink(s_path);
+        if (r && errno!=ENOENT) diee("preclean: delete stale (%s)", s_path);
+        r= unlink(l_path);
+        if (r) diee("preclean: delete stale lock (%s)", l_path);
+        // NB we don't hold the lock any more now.
+      }
+      close(lock_fd);
+      free(l_path);
+      free(s_path);
+    }
+  }
+
+  for (p=entries; p < entries + used_entries; p++)
+    free(p->name_hash);
+  free(entries);
+}
+
static __attribute((noreturn)) void die_data_overflow(void) {
die("cannot handle data with length >2^32");
}
*buf += dl;
}
}
-
+
static void prepare_length(size_t *len, char **buf, size_t dl_sz) {
if (dl_sz > UINT32_MAX) die_data_overflow();
uint32_t dl = htonl(dl_sz);
send_fd(1);
send_fd(2);
- size_t len = 4;
+ size_t len = 0;
prepare_message(&len, 0);
- char *m = malloc(len);
- if (!m) diee("failed to allocate for message");
+
+ size_t tlen = len + 4;
+ char *m = xmalloc(tlen);
char *p = m;
- prepare_length(0, &p, len - 4);
+ prepare_length(0, &p, len);
prepare_message(0, &p);
- assert(p == m + len);
+ assert(p == m + tlen);
+
+ ssize_t sr = fwrite(m, tlen, 1, call_sock);
+ if (sr != 1) diee("write request (buffer)");
- ssize_t sr = fwrite(p, len, 1, call_sock);
- if (sr != 1) diee("write request");
+ if (fflush(call_sock)) diee("write request");
}
static FILE *call_sock_from_fd(int fd) {
// Returns -1 on EOF
static int protocol_read_maybe(void *data, size_t sz) {
+ if (!sz) return 0;
size_t sr = fread(data, sz, 1, call_sock);
if (sr != 1) {
if (was_eof(call_sock)) return -1;
- diee("read() on monitor call socket");
+ diee("read() on monitor call socket (%zd)", sz);
}
return 0;
}
int r;
int fd = -1;
+ if (mode != MODE_NORMAL) return 0;
+
fd = socket(AF_UNIX, SOCK_STREAM, 0);
if (fd==-1) diee("socket() for client");
return 0;
}
+// libuv poll callback for the watcher's fd 0.
+//
+// A nonzero status from libuv is fatal.  Otherwise one byte is read
+// from fd 0: any completed read (data or end-of-file) makes the watcher
+// exit with status 0.  If the read would block, we return so the event
+// loop can keep servicing its other handles and call us again later,
+// rather than spinning here.  Other read errors are fatal.
+static void watcher_cb_stdin(uv_poll_t *handle, int status, int events) {
+  char c;
+  int r;
+
+  if ((errno = -status)) diee("watcher: poll stdin");
+  for (;;) {
+    r= read(0, &c, 1);
+    if (r!=-1) _exit(0);
+    if (errno==EINTR) continue; // interrupted: just retry the read
+    if (errno==EWOULDBLOCK || errno==EAGAIN) return; // nothing yet
+    diee("watcher: read sentinel stdin");
+  }
+}
+
+// libuv fs-event callback for socket_path.
+//
+// A nonzero status from libuv is fatal.  Otherwise socket_path is
+// stat()ed: if it no longer exists, or no longer refers to the inode
+// recorded in initial_stab, the watcher exits with status 0.  If it is
+// still the same inode the event was uninteresting and we return to the
+// event loop (rather than re-stat()ing in a tight loop).
+static void watcher_cb_sockpath(uv_fs_event_t *handle, const char *filename,
+                                int events, int status) {
+  int r;
+  struct stat now_stab;
+
+  if ((errno = -status)) diee("watcher: poll sockpath");
+  for (;;) {
+    r= stat(socket_path, &now_stab);
+    if (r!=-1) break;
+    if (errno==ENOENT) _exit(0);
+    if (errno==EINTR) continue; // retry only for an interrupted call
+    diee("stat socket: %s", socket_path);
+  }
+  if (!stabs_same_inode(&now_stab, &initial_stab))
+    _exit(0);
+  // Same socket still in place: keep watching.
+}
+
+// Body of the socket watcher process; does not return.
+//
+// On entry, stderr is still inherited, but 0 and 1 are the pipes.
+// Registers a poll on fd 0 (watcher_cb_stdin) and a filesystem watch on
+// socket_path (watcher_cb_sockpath) with a libuv loop, then redirects
+// stderr onto fd 1 with line buffering, forks once (the parent exits),
+// calls setsid() in the child and runs the loop.  Any failure is fatal.
+static __attribute__((noreturn))
+void become_watcher(void) {
+  uv_loop_t watch_loop;
+  uv_poll_t stdin_poll;
+  uv_fs_event_t sockpath_event;
+
+  nonblock(0);
+
+  // libuv reports failures as negated errno values.
+  if ((errno= -uv_loop_init(&watch_loop)))
+    diee("watcher: uv_loop_init");
+
+  if ((errno= -uv_poll_init(&watch_loop, &stdin_poll, 0)))
+    diee("watcher: uv_poll_init");
+  if ((errno= -uv_poll_start(&stdin_poll,
+                             UV_READABLE | UV_WRITABLE | UV_DISCONNECT,
+                             watcher_cb_stdin)))
+    diee("watcher: uv_poll_start");
+
+  if ((errno= -uv_fs_event_init(&watch_loop, &sockpath_event)))
+    diee("watcher: uv_fs_event_init");
+
+  if ((errno= -uv_fs_event_start(&sockpath_event, watcher_cb_sockpath,
+                                 socket_path, 0)))
+    diee("watcher: uv_fs_event_start");
+
+  // OK everything is set up, let us daemonise
+  if (dup2(1,2) != 2) diee("watcher: set daemonised stderr");
+  if (setvbuf(stderr, 0, _IOLBF, BUFSIZ))
+    diee("watcher: setvbuf stderr");
+
+  pid_t forked = fork();
+  if (forked == (pid_t)-1) diee("watcher: fork");
+  if (forked != 0) _exit(0);
+
+  if (setsid() == (pid_t)-1) diee("watcher: setsid");
+
+  int run_result = uv_run(&watch_loop, UV_RUN_DEFAULT);
+  die("uv_run returned (%d)", run_result);
+}
+
static __attribute__((noreturn))
-void become_setup(int sfd, int fake_pair[2]) {
+void become_setup(int sfd, int lockfd, int fake_pair[2],
+ int watcher_stdin, int watcher_stderr) {
+ close(lockfd);
close(fake_pair[0]);
int call_fd = fake_pair[1];
int null_0 = open("/dev/null", O_RDONLY); if (null_0 < 0) diee("open null");
if (dup2(null_0, 0)) diee("dup2 /dev/null onto stdin");
+ close(null_0);
if (dup2(2, 1) != 1) die("dup2 stderr onto stdout");
+ nonblock(sfd);
+
// Extension could work like this:
//
// We advertise a new protocol (perhaps one which is nearly entirely
// different after the connect) by putting a name for it comma-separated
// next to "v1". Simple extension can be done by having the script
// side say something about it in the ack xdata, which we currently ignore.
- putenv(m_asprintf("PREFORK_INTERP=v1 %d,%d %s",
- sfd, call_fd, socket_path));
+ putenv(m_asprintf("PREFORK_INTERP=v1 %d,%d,%d,%d",
+ sfd, call_fd, watcher_stdin, watcher_stderr));
execvp(executor_argv[0], (char**)executor_argv);
diee("execute %s", executor_argv[0]);
call_sock = connect_existing();
if (call_sock) return;
+ // We're going to make a new one, so clean out old ones
+ preclean();
+
int lockfd = acquire_lock();
+
+ if (mode == MODE_KILL) {
+ r= unlink(socket_path);
+ if (r && errno != ENOENT) diee("remove socket %s", socket_path);
+
+ r= unlink(lock_path);
+ if (r) diee("rmeove lock %s", lock_path);
+ _exit(0);
+ }
+
call_sock = connect_existing();
if (call_sock) { close(lockfd); return; }
if (r<0 && errno!=ENOENT)
diee("failed to remove stale socket %s", socket_path);
- int fake_pair[2];
- r = socketpair(AF_UNIX, SOCK_STREAM, 0, fake_pair);
- if (r<0) diee("socketpair() for fake initial connection");
-
int sfd = socket(AF_UNIX, SOCK_STREAM, 0);
if (sfd<0) diee("socket() for new listener");
r= bind(sfd, (const struct sockaddr*)&sockaddr_sun, salen);
if (r<0) diee("bind() on new listener");
+ r= stat(socket_path, &initial_stab);
+ if (r<0) diee("stat() fresh socket");
+
// We never want callers to get ECONNREFUSED. But:
// There is a race here: from my RTFM they may get ECONNREFUSED
// if they try between our bind() and listen(). But if they do, they'll
r = listen(sfd, INT_MAX);
if (r<0) diee("listen() for new listener");
+ // Fork watcher
+
+ int watcher_stdin[2];
+ int watcher_stderr[2];
+ if (pipe(watcher_stdin) || pipe(watcher_stderr))
+ diee("pipe() for socket inode watcher");
+
+ pid_t watcher = fork();
+ if (watcher == (pid_t)-1) diee("fork for watcher");
+ if (!watcher) {
+ close(sfd);
+ close(lockfd);
+ close(watcher_stdin[1]);
+ close(watcher_stderr[0]);
+ if (dup2(watcher_stdin[0], 0) != 0 ||
+ dup2(watcher_stderr[1], 1) != 1)
+ diee("initial dup2() for watcher");
+ close(watcher_stdin[0]);
+ close(watcher_stderr[1]);
+ become_watcher();
+ }
+
+ close(watcher_stdin[0]);
+ close(watcher_stderr[1]);
+ nonblock(watcher_stderr[0]);
+
+ // Fork setup
+
+ int fake_pair[2];
+ r = socketpair(AF_UNIX, SOCK_STREAM, 0, fake_pair);
+ if (r<0) diee("socketpair() for fake initial connection");
+
pid_t setup_pid = fork();
if (setup_pid == (pid_t)-1) diee("fork for spawn setup");
- if (!setup_pid) become_setup(sfd, fake_pair);
+ if (!setup_pid) become_setup(sfd, lockfd, fake_pair,
+ watcher_stdin[1], watcher_stderr[0]);
close(fake_pair[1]);
close(sfd);
EACH_NEW_ARG( *out++ = arg; );
*out++ = 0;
-}
+}
int main(int argc_unused, const char *const *argv) {
process_opts(&argv);