2 * "Interpreter" that you can put in #! like this
3 * #!/usr/bin/prefork-interp [<options>] <interpreter>
6 * prefork-interp [<option> ..] <interpreter> [<script> [<args> ...]]
7 * prefork-interp [<option>,..],<interpreter> <script> [<args> ...]
8 * prefork-interp '[<option> ..] <interpreter>' <script> [<args> ...]
10 * Options must specify argument laundering mode.
11 * Currently the only mode supported is:
12 * -U unlaundered: setup and executor both get all arguments and env vars
13 * ident covers only env vars specified with -E
14 * ident covers only arguments interpreter and (if present) script
18 ***************************************************************************
20 State during service execution, process parentage and key fds
25 || listen watch-err/in
26 || call (accept) \ ,------2
27 || ,-----------------------------. SERVER -----0 WATCHER(C)
28 CLIENT 2--=fdpassed>=---------. \ || && | &&
29 (C) 1--=fdpassed>=---------. \ \ || inotify
30 0--=fdpassed>=---------. \ \ \ || sockpath
48 && session leader (daemon)
49 & process group leader
51 ***************************************************************************
53 Control flow and causality
61 attempt to connect, and read greeting
64 tidy up stale /run entries *1 (continue from send_fds, below)
67 retry attempt to connect, and read greeting
70 create listening socket release lock
73 | `------------------.
76 make "fake" initial call socketpair (C)
78 fork/exec #########################################################
79 | `-------------. application
82 | # script initialisation
84 | # ########|#############################################
86 | # identify fds from environment (Perl)
91 waitpid # fork for initial service
94 | # | SCRIPT [server] &&
96 | # | ** accept / event loop **
98 | # | / \ watch\ \idle
99 | # | fork child \stderr\ \timeout?
100 | # | _________/ | | |
102 | # SCRIPT [monitor] | eof?| |
105 read ,....<.....send greeting | | |
106 greeting # | ___________________
114 | # fork for executor (Perl)
115 | # |parent? \child? prefork-interp
116 | # | ######\############################
117 | # | # SCRIPT (executor) application
118 | # | # execute service
119 | # wait for read # |
120 | # (select) # terminates
122 | # | # kernel closes execterm
123 | # | ,......<....../|
126 | # | | ,......<...../
127 | # waitpid # _______________
129 | ,....<....,..send status #
130 read status # ________________ #
134 ********** Or, if client is killed **********
136 | # | # execute service
137 terminates # wait for read # |
142 _____________ # \|call? # |
144 # kill whole pgrp... # killed
146 # | | ,......<....../
147 # waitpid # _______________
150 # _____SIGPIPE______ #
152 | - \ / process control flow
153 ... < > causes mediated by fds or other IPC etc.
154 && session leader (daemon)
155 & process group leader
156 # language/implementation boundary
157 *1 line continued elsewhere
159 ______ process termination (after reaping, if shown)
161 ***************************************************************************
163 Sequence of events and fd plumbing.
164 NB INCOMPLETE - does not cover execterm, cleanup
166 client (C wrapper) connects to server
167 (including reading ack byte)
169 === acquires lock ===
170 makes new listening socket
172 forks watcher and awaits
173 makes first-instance socketpair
174 forks setup (script, sock fds indicated in env)
175 fd0, fd1, fd2: from-outer
176 other fd: call(client-end)(fake)
177 reaps setup (and reports error)
178 (implicitly releases lock)
180 watcher fd[012]: watcher pipes
181 starts watch on socket path
182 sets stderr to line buffered
183 sets stdin to nonblocking
184 daemonises (one fork, becomes session leader)
185 when socket stat changes, quit
187 setup (pre-exec) fd0: null,
188 fd[12]: fd2-from-outer
189 env fds: listener, call(server-end)(fake),
190 watcher read, watcher write
192 possibly clean env, argv
194 setup (script) runs initialisation parts of the script
195 at prefork establishment point:
196 setup (pm) [1] opens syslog
200 server (pm) [1] [fd0: null],
201 [fd[12]: fd2-from-outer]
203 right away, forks init monitor
204 [2] closes outer caller fds and call(fake)
205 [server (pm)] fd[012]: null
206 other fds: listener, syslog
207 runs in loop accepting and forking,
208 reaping and limiting children (incl init monitor)
209 reports failures of monitors to syslog
211 [client (C wrapper)] if client connect succeeds:
212 now fd: call(client-end)
213 sends message with: cmdline, env
216 [server (script)] accepts, forks subseq monitor
218 monitor [1] [fd0: null]
219 (init [fd[12]: init: fd2-from-outer; subseq: null]
220 or errors: init: fd2; subseq: syslog
221 subseq) other fds: syslog, call(server-end)
223 receives args, env, fds
226 executor sorts out fds:
227 fd0, fd1, fd2: from-outer
228 close fds: call(server-end)
232 runs main part of script
235 [monitor] [fd[012]: null]
236 [fd[12]: init: fd2-from-outer; subseq: null]
237 [errors: init: fd2; subseq: syslog]
239 reports status via socket
241 [client (C wrapper)] [fd0, fd1, fd2: from-outer]
242 [other fd: call(client-end)]
243 receives status, exits appropriately
244 (if was bad signal, reports to stderr, exits 127)
246 ***************************************************************************
250 #include <arpa/inet.h>
256 const char our_name[] = "prefork-interp";
258 static struct sockaddr_un sockaddr_sun;
259 static FILE *call_sock;
261 #define ACK_BYTE '\n'
263 static const char *const *executor_argv;
265 static const char header_magic[4] = "PFI\n";
267 void fusagemessage(FILE *f) {
268 fprintf(f, "usage: #!/usr/bin/prefork-interp [<options>]\n");
271 static int laundering;
273 static int max_sockets = 100; // maximum entries in the run dir is 2x this
275 static struct stat initial_stab;
277 #define MODE_NORMAL 0
278 #define MODE_KILL 'k'
279 #define MODE_FRESH 'f'
281 const struct cmdinfo cmdinfos[]= {
283 { 0, 'U', 0, .iassignto= &laundering, .arg= 'U' },
284 { "kill", 0, 0, .iassignto= &mode, .arg= MODE_KILL },
285 { 0, 'f', 0, .iassignto= &mode, .arg= MODE_FRESH },
289 void ident_addinit(void) {
290 char ident_magic[1] = { 0 };
291 sha256_update(&identsc, sizeof(ident_magic), ident_magic);
294 static void propagate_exit_status(int status, const char *what) {
297 if (WIFEXITED(status)) {
301 if (WIFSIGNALED(status)) {
302 int sig = WTERMSIG(status);
303 const char *signame = strsignal(sig);
304 if (signame == 0) signame = "unknown signal";
306 if (! WCOREDUMP(status) &&
313 sa.sa_handler = SIG_DFL;
314 r = sigaction(sig, &sa, 0);
315 if (r) diee("failed to reset signal handler while propagating %s",
320 sigaddset(&sset, sig);
321 r = sigprocmask(SIG_UNBLOCK, &sset, 0);
322 if (r) diee("failed to reset signal block while propagating %s",
326 die("unexpectedly kept running after raising (to propagate) %s",
330 die("%s failed due to signal %d %s%s", what, sig, signame,
331 WCOREDUMP(status) ? " (core dumped)" : "");
334 die("%s failed with weird wait status %d 0x%x", what, status, status);
342 static int preclean_entry_compar_name(const void *av, const void *bv) {
343 const PrecleanEntry *a = av;
344 const PrecleanEntry *b = bv;
345 return strcmp(a->name_hash, b->name_hash);
348 static int preclean_entry_compar_atime(const void *av, const void *bv) {
349 const PrecleanEntry *ae = av; time_t a = ae->atime;
350 const PrecleanEntry *be = bv; time_t b = be->atime;
355 static time_t preclean_stat_atime(const char *s_path) {
357 int r= lstat(s_path, &stab);
359 if (errno!=ENOENT) diee("pre-cleanup: stat socket (%s)", s_path);
362 return stab.st_atime;
365 static void preclean(void) {
366 DIR *dir = opendir(run_base);
368 if (errno == ENOENT) return;
369 diee("pre-cleanup: open run dir (%s)", run_base);
372 PrecleanEntry *entries=0;
373 size_t avail_entries=0;
374 size_t used_entries=0;
377 while ((errno = 0, de = readdir(dir))) {
378 char c0 = de->d_name[0];
379 if (!(c0 == 'l' || c0 == 's')) continue;
380 char *name_hash = m_asprintf("%s", de->d_name+1);
381 char *s_path = m_asprintf("%s/s%s", run_base, name_hash);
382 time_t atime = preclean_stat_atime(s_path);
384 if (avail_entries == used_entries) {
385 assert(avail_entries < INT_MAX / 4 / sizeof(PrecleanEntry));
388 entries = realloc(entries, avail_entries * sizeof(PrecleanEntry));
390 entries[used_entries].name_hash = name_hash;
391 entries[used_entries].atime = atime;
394 if (errno) diee("pre-cleanup: read run dir (%s)", run_base);
396 // First we dedupe (after sorting by path)
397 qsort(entries, used_entries, sizeof(PrecleanEntry),
398 preclean_entry_compar_name);
399 PrecleanEntry *p, *q;
400 for (p=entries, q=entries; p < entries + used_entries; p++) {
401 if (q > entries && !strcmp(p->name_hash, (q-1)->name_hash))
405 used_entries = q - entries;
407 // Now maybe delete some things
409 // Actually this has an off-by-one error since we are about
410 // to create a socket, so the actual number of sockets is one more.
411 // But, *actually*, since there might be multiple of us running at once,
412 // we might have even more than that. This doesn't really matter.
413 if (used_entries > max_sockets) {
414 qsort(entries, used_entries, sizeof(PrecleanEntry),
415 preclean_entry_compar_atime);
416 for (p=entries; p < entries + max_sockets; p++) {
417 char *l_path = m_asprintf("%s/l%s", run_base, p->name_hash);
418 char *s_path = m_asprintf("%s/s%s", run_base, p->name_hash);
419 int lock_fd = flock_file(l_path);
420 // Recheck atime - we might have raced!
421 time_t atime = preclean_stat_atime(s_path);
422 if (atime != p->atime) {
423 // Raced. This will leave us deleting too few things. Whatever.
425 int r= unlink(s_path);
426 if (r && errno!=ENOENT) diee("preclean: delete stale (%s)", s_path);
428 if (r) diee("preclean: delete stale lock (%s)", s_path);
429 // NB we don't hold the lock any more now.
437 for (p=entries; p < entries + used_entries; p++)
442 static __attribute((noreturn)) void die_data_overflow(void) {
443 die("cannot handle data with length >2^32");
446 static void prepare_data(size_t *len, char **buf,
447 const void *data, size_t dl) {
449 if (dl >= SIZE_MAX - *len)
454 memcpy(*buf, data, dl);
459 static void prepare_length(size_t *len, char **buf, size_t dl_sz) {
460 if (dl_sz > UINT32_MAX) die_data_overflow();
461 uint32_t dl = htonl(dl_sz);
462 prepare_data(len, buf, &dl, sizeof(dl));
465 static void prepare_string(size_t *len, char **buf, const char *s) {
466 size_t sl = strlen(s);
467 prepare_data(len, buf, s, sl+1);
470 static void prepare_message(size_t *len, char **buf) {
473 const char *const *p = (void*)environ;
476 prepare_string(len, buf, s);
479 prepare_string(len, buf, "");
483 prepare_string(len, buf, s);
486 static void send_fd(int payload_fd) {
487 int via_fd = fileno(call_sock);
490 struct cmsghdr align;
491 char buf[CMSG_SPACE(sizeof(payload_fd))];
501 iov.iov_base = &dummy_byte;
507 msg.msg_control = cmsg_buf.buf;
508 msg.msg_controllen = sizeof(cmsg_buf.buf);
510 struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
511 cmsg->cmsg_level = SOL_SOCKET;
512 cmsg->cmsg_type = SCM_RIGHTS;
513 cmsg->cmsg_len = CMSG_LEN(sizeof(payload_fd));
514 *(int*)CMSG_DATA(cmsg) = payload_fd;
516 msg.msg_controllen = sizeof(cmsg_buf.buf);
519 ssize_t r = sendmsg(via_fd, &msg, 0);
521 if (errno == EINTR) continue;
529 static void send_request(void) {
530 // Sending these first makes it easier for the script to
531 // use buffered IO for the message.
537 prepare_message(&len, 0);
539 size_t tlen = len + 4;
540 char *m = xmalloc(tlen);
542 prepare_length(0, &p, len);
543 prepare_message(0, &p);
544 assert(p == m + tlen);
546 ssize_t sr = fwrite(m, tlen, 1, call_sock);
547 if (sr != 1) diee("write request (buffer)");
549 if (fflush(call_sock)) diee("write request");
552 static FILE *call_sock_from_fd(int fd) {
555 FILE *call_sock = fdopen(fd, "r+");
556 if (!call_sock) diee("fdopen socket");
558 r = setvbuf(call_sock, 0, _IONBF, 0);
559 if (r) die("setvbuf socket");
564 static bool was_eof(FILE *call_sock) {
565 return feof(call_sock) || errno==ECONNRESET;
569 static int protocol_read_maybe(void *data, size_t sz) {
571 size_t sr = fread(data, sz, 1, call_sock);
573 if (was_eof(call_sock)) return -1;
574 diee("read() on monitor call socket (%zd)", sz);
579 static void protocol_read(void *data, size_t sz) {
580 if (protocol_read_maybe(data, sz) < 0)
581 die("monitor process quit unexpectedly");
584 // Returns 0 if OK, error msg if peer was garbage.
585 static const char *read_greeting(void) {
586 char got_magic[sizeof(header_magic)];
588 if (protocol_read_maybe(&got_magic, sizeof(got_magic)) < 0)
589 return "initial monitor process quit";
591 if (memcmp(got_magic, header_magic, sizeof(header_magic)))
592 die("got unexpected protocol magic 0x%02x%02x%02x%02x",
593 got_magic[0], got_magic[1], got_magic[2], got_magic[3]);
596 protocol_read(&xdata_len, sizeof(xdata_len));
597 void *xdata = xmalloc(xdata_len);
598 protocol_read(xdata, xdata_len);
603 // Returns: call(client-end), or 0 to mean "is garbage"
604 // find_socket_path must have been called
605 static FILE *connect_existing(void) {
609 if (mode != MODE_NORMAL) return 0;
611 fd = socket(AF_UNIX, SOCK_STREAM, 0);
612 if (fd==-1) diee("socket() for client");
614 socklen_t salen = sizeof(sockaddr_sun);
615 r = connect(fd, (const struct sockaddr*)&sockaddr_sun, salen);
617 if (errno==ECONNREFUSED || errno==ENOENT) goto x_garbage;
618 diee("connect() %s", socket_path);
621 call_sock = call_sock_from_fd(fd);
630 if (call_sock) { fclose(call_sock); call_sock=0; }
631 if (fd >= 0) close(fd);
635 static void watcher_cb_stdin(uv_poll_t *handle, int status, int events) {
639 if ((errno = -status)) diee("watcher: poll stdin");
643 if (!(errno==EINTR || errno==EWOULDBLOCK || errno==EAGAIN))
644 diee("watcher: read sentinel stdin");
648 static void watcher_cb_sockpath(uv_fs_event_t *handle, const char *filename,
649 int events, int status) {
651 struct stat now_stab;
653 if ((errno = -status)) diee("watcher: poll stdin");
655 r= stat(socket_path, &now_stab);
657 if (errno==ENOENT) _exit(0);
658 if (errno==EINTR) continue;
659 diee("stat socket: %s", socket_path);
661 if (!stabs_same_inode(&now_stab, &initial_stab))
666 // On entry, stderr is still inherited, but 0 and 1 are the pipes
667 static __attribute__((noreturn))
668 void become_watcher(void) {
670 uv_poll_t uvhandle_stdin;
671 uv_fs_event_t uvhandle_sockpath;
676 errno= -uv_loop_init(&loop);
677 if (errno) diee("watcher: uv_loop_init");
679 errno= -uv_poll_init(&loop, &uvhandle_stdin, 0);
680 if (errno) diee("watcher: uv_poll_init");
681 errno= -uv_poll_start(&uvhandle_stdin,
682 UV_READABLE | UV_WRITABLE | UV_DISCONNECT,
684 if (errno) diee("watcher: uv_poll_start");
686 errno= -uv_fs_event_init(&loop, &uvhandle_sockpath);
687 if (errno) diee("watcher: uv_fs_event_init");
689 errno= -uv_fs_event_start(&uvhandle_sockpath, watcher_cb_sockpath,
691 if (errno) diee("watcher: uv_fs_event_start");
693 // OK everything is set up, let us daemonise
694 if (dup2(1,2) != 2) diee("watcher: set daemonised stderr");
695 r= setvbuf(stderr, 0, _IOLBF, BUFSIZ);
696 if (r) diee("watcher: setvbuf stderr");
698 pid_t child = fork();
699 if (child == (pid_t)-1) diee("watcher: fork");
702 if (setsid() == (pid_t)-1) diee("watcher: setsid");
704 r= uv_run(&loop, UV_RUN_DEFAULT);
705 die("uv_run returned (%d)", r);
708 static __attribute__((noreturn))
709 void become_setup(int sfd, int lockfd, int fake_pair[2],
710 int watcher_stdin, int watcher_stderr) {
713 int call_fd = fake_pair[1];
715 int null_0 = open("/dev/null", O_RDONLY); if (null_0 < 0) diee("open null");
716 if (dup2(null_0, 0)) diee("dup2 /dev/null onto stdin");
718 if (dup2(2, 1) != 1) die("dup2 stderr onto stdout");
722 // Extension could work like this:
724 // We advertise a new protocol (perhaps one which is nearly entirely
725 // different after the connect) by putting a name for it comma-separated
726 // next to "v1". Simple extension can be done by having the script
727 // side say something about it in the ack xdata, which we currently ignore.
728 putenv(m_asprintf("PREFORK_INTERP=v1 %d,%d,%d,%d",
729 sfd, call_fd, watcher_stdin, watcher_stderr));
731 execvp(executor_argv[0], (char**)executor_argv);
732 diee("execute %s", executor_argv[0]);
735 static void connect_or_spawn(void) {
738 call_sock = connect_existing();
739 if (call_sock) return;
741 // We're going to make a new one, so clean out old ones
744 int lockfd = acquire_lock();
746 if (mode == MODE_KILL) {
747 r= unlink(socket_path);
748 if (r && errno != ENOENT) diee("remove socket %s", socket_path);
750 r= unlink(lock_path);
751 if (r) diee("rmeove lock %s", lock_path);
755 call_sock = connect_existing();
756 if (call_sock) { close(lockfd); return; }
758 // We must start a fresh one, and we hold the lock
760 r = unlink(socket_path);
761 if (r<0 && errno!=ENOENT)
762 diee("failed to remove stale socket %s", socket_path);
764 int sfd = socket(AF_UNIX, SOCK_STREAM, 0);
765 if (sfd<0) diee("socket() for new listener");
767 socklen_t salen = sizeof(sockaddr_sun);
768 r= bind(sfd, (const struct sockaddr*)&sockaddr_sun, salen);
769 if (r<0) diee("bind() on new listener");
771 r= stat(socket_path, &initial_stab);
772 if (r<0) diee("stat() fresh socket");
774 // We never want callers to get ECONNREFUSED. But:
775 // There is a race here: from my RTFM they may get ECONNREFUSED
776 // if they try between our bind() and listen(). But if they do, they'll
777 // acquire the lock (serialising with us) and retry, and then it will work.
778 r = listen(sfd, INT_MAX);
779 if (r<0) diee("listen() for new listener");
783 int watcher_stdin[2];
784 int watcher_stderr[2];
785 if (pipe(watcher_stdin) || pipe(watcher_stderr))
786 diee("pipe() for socket inode watcher");
788 pid_t watcher = fork();
789 if (watcher == (pid_t)-1) diee("fork for watcher");
793 close(watcher_stdin[1]);
794 close(watcher_stderr[0]);
795 if (dup2(watcher_stdin[0], 0) != 0 ||
796 dup2(watcher_stderr[1], 1) != 1)
797 diee("initial dup2() for watcher");
798 close(watcher_stdin[0]);
799 close(watcher_stderr[1]);
803 close(watcher_stdin[0]);
804 close(watcher_stderr[1]);
805 nonblock(watcher_stderr[0]);
810 r = socketpair(AF_UNIX, SOCK_STREAM, 0, fake_pair);
811 if (r<0) diee("socketpair() for fake initial connection");
813 pid_t setup_pid = fork();
814 if (setup_pid == (pid_t)-1) diee("fork for spawn setup");
815 if (!setup_pid) become_setup(sfd, lockfd, fake_pair,
816 watcher_stdin[1], watcher_stderr[0]);
820 call_sock = call_sock_from_fd(fake_pair[0]);
823 pid_t got = waitpid(setup_pid, &status, 0);
824 if (got == (pid_t)-1) diee("waitpid setup [%ld]", (long)setup_pid);
825 if (got != setup_pid) diee("waitpid setup [%ld] gave [%ld]!",
826 (long)setup_pid, (long)got);
827 if (status != 0) propagate_exit_status(status, "setup");
829 const char *emsg = read_greeting();
830 if (emsg) die("setup failed: %s", emsg);
836 static void make_executor_argv(const char *const *argv) {
837 switch (laundering) {
839 default: die("need -U (specifying unlaundered argument handling)");
843 #define EACH_NEW_ARG(EACH) { \
844 arg = interp; { EACH } \
845 if ((arg = script)) { EACH } \
846 const char *const *walk = argv; \
847 while ((arg = *walk++)) { EACH } \
851 EACH_NEW_ARG( (void)arg; count++; );
853 const char **out = calloc(count, sizeof(char*));
854 executor_argv = (const char* const*)out;
855 if (!executor_argv) diee("allocate for arguments");
857 EACH_NEW_ARG( *out++ = arg; );
861 int main(int argc_unused, const char *const *argv) {
868 // which ought to be passed on to the actual executor.
869 make_executor_argv(argv);
872 FILLZERO(sockaddr_sun);
873 sockaddr_sun.sun_family = AF_UNIX;
874 assert(strlen(socket_path) <= sizeof(sockaddr_sun.sun_path));
875 strncpy(sockaddr_sun.sun_path, socket_path, sizeof(sockaddr_sun.sun_path));
879 // We're committed now, send the request (or bail out)
883 protocol_read(&status, sizeof(status));
885 status = ntohl(status);
886 if (status > INT_MAX) die("status 0x%lx does not fit in an int",
887 (unsigned long)status);
889 propagate_exit_status(status, "invocation");