2 * "Interpreter" that you can put in #! like this
3 * #!/usr/bin/prefork-interp [<options>] <interpreter>
6 * prefork-interp [<option> ..] <interpreter> [<script> [<args> ...]]
7 * prefork-interp [<option>,..],<interpreter> <script> [<args> ...]
8 * prefork-interp '[<option> ..] <interpreter>' <script> [<args> ...]
10 * Options must specify argument laundering mode.
11 * Currently the only mode supported is:
12 * -U unlaundered: setup and executor both get all arguments and env vars
13 * ident covers only env vars specified with -E
14 * ident covers only arguments interpreter and (if present) script
18 ***************************************************************************
20 State during service execution, process parentage and key fds
25 || listen watch-err/in
26 || call (accept) \ ,------2
27 || ,-----------------------------. SERVER -----0 WATCHER(C)
28 CLIENT 2--=fdpassed>=---------. \ || && | &&
29 (C) 1--=fdpassed>=---------. \ \ || inotify
30 0--=fdpassed>=---------. \ \ \ || sockpath
48 && session leader (daemon)
49 & process group leader
51 ***************************************************************************
53 Control flow and causality
61 attempt to connect, and read greeting
64 tidy up stale /run entries *1 (continue from send_fds, below)
67 retry attempt to connect, and read greeting
70 create listening socket release lock
73 | `------------------.
76 make "fake" initial call socketpair (C)
78 fork/exec #########################################################
79 | `-------------. application
82 | # script initialisation
84 | # ########|#############################################
86 | # identify fds from environment (Perl)
91 waitpid # fork for initial service
94 | # | SCRIPT [server] &&
96 | # | ** accept / event loop **
98 | # | / \ watch\ \idle
99 | # | fork child \stderr\ \timeout?
100 | # | _________/ | | |
102 | # SCRIPT [monitor] | eof?| |
105 read ,....<.....send greeting | | |
106 greeting # | ___________________
114 | # fork for executor (Perl)
115 | # |parent? \child? prefork-interp
116 | # | ######\############################
117 | # | # SCRIPT (executor) application
118 | # | # execute service
119 | # wait for read # |
120 | # (select) # terminates
122 | # | # kernel closes execterm
123 | # | ,......<....../|
126 | # | | ,......<...../
127 | # waitpid # _______________
129 | ,....<....,..send status #
130 read status # ________________ #
134 ********** Or, if client is killed **********
136 | # | # execute service
137 terminates # wait for read # |
142 _____________ # \|call? # |
144 # kill whole pgrp... # killed
146 # | | ,......<....../
147 # waitpid # _______________
150 # _____SIGPIPE______ #
152 | - \ / process control flow
153 ... < > causes mediated by fds or other IPC etc.
154 && session leader (daemon)
155 & process group leader
156 # language/implementation boundary
157 *1 line continued elsewhere
159 ______ process termination (after reaping, if shown)
161 ***************************************************************************
163 Sequence of events and fd plumbing.
164 NB INCOMPLETE - does not cover execterm, cleanup
166 client (C wrapper) connects to server
167 (including reading ack byte)
169 === acquires lock ===
170 makes new listening socket
172 forks watcher and awaits
173 makes first-instance socketpair
174 forks setup (script, sock fds indicated in env)
175 fd0, fd1, fd2: from-outer
176 other fd: call(client-end)(fake)
177 reaps setup (and reports error)
178 (implicitly releases lock)
180 watcher fd[012]: watcher pipes
181 starts watch on socket path
182 sets stderr to line buffered
183 sets stdin to nonblocking
184 daemonises (one fork, becomes session leader)
185 when socket stat changes, quit
187 setup (pre-exec) fd0: null,
188 fd[12]: fd2-from-outer
189 env fds: listener, call(server-end)(fake),
190 watcher read, watcher write
192 possibly clean env, argv
194 setup (script) runs initialisation parts of the script
195 at prefork establishment point:
196 setup (pm) [1] opens syslog
200 server (pm) [1] [fd0: null],
201 [fd[12]: fd2-from-outer]
203 right away, forks init monitor
204 [2] closes outer caller fds and call(fake)
205 [server (pm)] fd[012]: null
206 other fds: listener, syslog
207 runs in loop accepting and forking,
208 reaping and limiting children (incl init monitor)
209 reports failures of monitors to syslog
211 [client (C wrapper)] if client connect succeeds:
212 now fd: call(client-end)
213 sends message with: cmdline, env
216 [server (script)] accepts, forks subseq monitor
218 monitor [1] [fd0: null]
219 (init [fd[12]: init: fd2-from-outer; subseq: null]
220 or errors: init: fd2; subseq: syslog
221 subseq) other fds: syslog, call(server-end)
223 receives args, env, fds
226 executor sorts out fds:
227 fd0, fd1, fd2: from-outer
228 close fds: call(server-end)
232 runs main part of script
235 [monitor] [fd[012]: null]
236 [fd[12]: init: fd2-from-outer; subseq: null]
237 [errors: init: fd2; subseq: syslog]
239 reports status via socket
241 [client (C wrapper)] [fd0, fd1, fd2: from-outer]
242 [other fd: call(client-end)]
243 receives status, exits appropriately
244 (if was bad signal, reports to stderr, exits 127)
246 ***************************************************************************
250 #include <arpa/inet.h>
256 const char our_name[] = "prefork-interp";
258 static struct sockaddr_un sockaddr_sun;
259 static FILE *call_sock;
261 #define ACK_BYTE '\n'
263 static const char *const *executor_argv;
265 static const char header_magic[4] = "PFI\n";
267 void fusagemessage(FILE *f) {
268 fprintf(f, "usage: #!/usr/bin/prefork-interp [<options>]\n");
271 #define MODE_NORMAL 0
272 #define MODE_KILL 'k'
273 #define MODE_FRESH 'f'
275 #define MEDIATION_UNSPECIFIED 0
276 #define MEDIATION_UNLAUNDERED 'U'
278 static int mediation = MEDIATION_UNSPECIFIED;
279 static int mode = MODE_NORMAL;
280 static int max_sockets = 100; // maximum entries in the run dir is 2x this
282 static struct stat initial_stab;
284 const struct cmdinfo cmdinfos[]= {
286 { 0, 'U', 0, .iassignto= &mediation, .arg= MEDIATION_UNLAUNDERED },
287 { "kill", 0, 0, .iassignto= &mode, .arg= MODE_KILL },
288 { 0, 'f', 0, .iassignto= &mode, .arg= MODE_FRESH },
292 void ident_addinit(void) {
293 char ident_magic[1] = { 0 };
294 sha256_update(&identsc, sizeof(ident_magic), ident_magic);
297 static void propagate_exit_status(int status, const char *what) {
300 if (WIFEXITED(status)) {
304 if (WIFSIGNALED(status)) {
305 int sig = WTERMSIG(status);
306 const char *signame = strsignal(sig);
307 if (signame == 0) signame = "unknown signal";
309 if (! WCOREDUMP(status) &&
316 sa.sa_handler = SIG_DFL;
317 r = sigaction(sig, &sa, 0);
318 if (r) diee("failed to reset signal handler while propagating %s",
323 sigaddset(&sset, sig);
324 r = sigprocmask(SIG_UNBLOCK, &sset, 0);
325 if (r) diee("failed to reset signal block while propagating %s",
329 die("unexpectedly kept running after raising (to propagate) %s",
333 die("%s failed due to signal %d %s%s", what, sig, signame,
334 WCOREDUMP(status) ? " (core dumped)" : "");
337 die("%s failed with weird wait status %d 0x%x", what, status, status);
345 static int preclean_entry_compar_name(const void *av, const void *bv) {
346 const PrecleanEntry *a = av;
347 const PrecleanEntry *b = bv;
348 return strcmp(a->name_hash, b->name_hash);
351 static int preclean_entry_compar_atime(const void *av, const void *bv) {
352 const PrecleanEntry *ae = av; time_t a = ae->atime;
353 const PrecleanEntry *be = bv; time_t b = be->atime;
358 static time_t preclean_stat_atime(const char *s_path) {
360 int r= lstat(s_path, &stab);
362 if (errno!=ENOENT) diee("pre-cleanup: stat socket (%s)", s_path);
365 return stab.st_atime;
368 static void preclean(void) {
369 DIR *dir = opendir(run_base);
371 if (errno == ENOENT) return;
372 diee("pre-cleanup: open run dir (%s)", run_base);
375 PrecleanEntry *entries=0;
376 size_t avail_entries=0;
377 size_t used_entries=0;
380 while ((errno = 0, de = readdir(dir))) {
381 char c0 = de->d_name[0];
382 if (!(c0 == 'l' || c0 == 's')) continue;
383 char *name_hash = m_asprintf("%s", de->d_name+1);
384 char *s_path = m_asprintf("%s/s%s", run_base, name_hash);
385 time_t atime = preclean_stat_atime(s_path);
387 if (avail_entries == used_entries) {
388 assert(avail_entries < INT_MAX / 4 / sizeof(PrecleanEntry));
391 entries = realloc(entries, avail_entries * sizeof(PrecleanEntry));
393 entries[used_entries].name_hash = name_hash;
394 entries[used_entries].atime = atime;
397 if (errno) diee("pre-cleanup: read run dir (%s)", run_base);
399 // First we dedupe (after sorting by path)
400 qsort(entries, used_entries, sizeof(PrecleanEntry),
401 preclean_entry_compar_name);
402 PrecleanEntry *p, *q;
403 for (p=entries, q=entries; p < entries + used_entries; p++) {
404 if (q > entries && !strcmp(p->name_hash, (q-1)->name_hash))
408 used_entries = q - entries;
410 // Now maybe delete some things
412 // Actually this has an off-by-one error since we are about
413 // to create a socket, so the actual number of sockets is one more.
414 // But, *actually*, since there might be multiple of us running at once,
415 // we might have even more than that. This doesn't really matter.
416 if (used_entries > max_sockets) {
417 qsort(entries, used_entries, sizeof(PrecleanEntry),
418 preclean_entry_compar_atime);
419 for (p=entries; p < entries + max_sockets; p++) {
420 char *l_path = m_asprintf("%s/l%s", run_base, p->name_hash);
421 char *s_path = m_asprintf("%s/s%s", run_base, p->name_hash);
422 int lock_fd = flock_file(l_path);
423 // Recheck atime - we might have raced!
424 time_t atime = preclean_stat_atime(s_path);
425 if (atime != p->atime) {
426 // Raced. This will leave us deleting too few things. Whatever.
428 int r= unlink(s_path);
429 if (r && errno!=ENOENT) diee("preclean: delete stale (%s)", s_path);
431 if (r) diee("preclean: delete stale lock (%s)", s_path);
432 // NB we don't hold the lock any more now.
440 for (p=entries; p < entries + used_entries; p++)
445 static __attribute((noreturn)) void die_data_overflow(void) {
446 die("cannot handle data with length >2^32");
449 static void prepare_data(size_t *len, char **buf,
450 const void *data, size_t dl) {
452 if (dl >= SIZE_MAX - *len)
457 memcpy(*buf, data, dl);
462 static void prepare_length(size_t *len, char **buf, size_t dl_sz) {
463 if (dl_sz > UINT32_MAX) die_data_overflow();
464 uint32_t dl = htonl(dl_sz);
465 prepare_data(len, buf, &dl, sizeof(dl));
468 static void prepare_string(size_t *len, char **buf, const char *s) {
469 size_t sl = strlen(s);
470 prepare_data(len, buf, s, sl+1);
473 static void prepare_message(size_t *len, char **buf) {
476 const char *const *p = (void*)environ;
479 prepare_string(len, buf, s);
482 prepare_string(len, buf, "");
486 prepare_string(len, buf, s);
489 static void send_fd(int payload_fd) {
490 int via_fd = fileno(call_sock);
493 struct cmsghdr align;
494 char buf[CMSG_SPACE(sizeof(payload_fd))];
504 iov.iov_base = &dummy_byte;
510 msg.msg_control = cmsg_buf.buf;
511 msg.msg_controllen = sizeof(cmsg_buf.buf);
513 struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
514 cmsg->cmsg_level = SOL_SOCKET;
515 cmsg->cmsg_type = SCM_RIGHTS;
516 cmsg->cmsg_len = CMSG_LEN(sizeof(payload_fd));
517 *(int*)CMSG_DATA(cmsg) = payload_fd;
519 msg.msg_controllen = sizeof(cmsg_buf.buf);
522 ssize_t r = sendmsg(via_fd, &msg, 0);
524 if (errno == EINTR) continue;
532 static void send_request(void) {
534 ssize_t sr = fwrite(&ibyte, 1, 1, call_sock);
535 if (sr != 1) diee("write signalling byte");
537 // Sending these before the big message makes it easier for the script to
538 // use buffered IO for the message.
544 prepare_message(&len, 0);
546 size_t tlen = len + 4;
547 char *m = xmalloc(tlen);
549 prepare_length(0, &p, len);
550 prepare_message(0, &p);
551 assert(p == m + tlen);
553 sr = fwrite(m, tlen, 1, call_sock);
554 if (sr != 1) diee("write request (buffer)");
556 if (fflush(call_sock)) diee("write request");
559 static FILE *call_sock_from_fd(int fd) {
562 FILE *call_sock = fdopen(fd, "r+");
563 if (!call_sock) diee("fdopen socket");
565 r = setvbuf(call_sock, 0, _IONBF, 0);
566 if (r) die("setvbuf socket");
571 static bool was_eof(FILE *call_sock) {
572 return feof(call_sock) || errno==ECONNRESET;
576 static int protocol_read_maybe(void *data, size_t sz) {
578 size_t sr = fread(data, sz, 1, call_sock);
580 if (was_eof(call_sock)) return -1;
581 diee("read() on monitor call socket (%zd)", sz);
586 static void protocol_read(void *data, size_t sz) {
587 if (protocol_read_maybe(data, sz) < 0)
588 die("monitor process quit unexpectedly");
591 // Returns 0 if OK, error msg if peer was garbage.
592 static const char *read_greeting(void) {
593 char got_magic[sizeof(header_magic)];
595 if (protocol_read_maybe(&got_magic, sizeof(got_magic)) < 0)
596 return "initial monitor process quit";
598 if (memcmp(got_magic, header_magic, sizeof(header_magic)))
599 die("got unexpected protocol magic 0x%02x%02x%02x%02x",
600 got_magic[0], got_magic[1], got_magic[2], got_magic[3]);
603 protocol_read(&xdata_len, sizeof(xdata_len));
604 void *xdata = xmalloc(xdata_len);
605 protocol_read(xdata, xdata_len);
610 // Returns: call(client-end), or 0 to mean "is garbage"
611 // find_socket_path must have been called
612 static FILE *connect_existing(void) {
616 if (mode != MODE_NORMAL) return 0;
618 fd = socket(AF_UNIX, SOCK_STREAM, 0);
619 if (fd==-1) diee("socket() for client");
621 socklen_t salen = sizeof(sockaddr_sun);
622 r = connect(fd, (const struct sockaddr*)&sockaddr_sun, salen);
624 if (errno==ECONNREFUSED || errno==ENOENT) goto x_garbage;
625 diee("connect() %s", socket_path);
628 call_sock = call_sock_from_fd(fd);
637 if (call_sock) { fclose(call_sock); call_sock=0; }
638 if (fd >= 0) close(fd);
642 static void watcher_cb_stdin(uv_poll_t *handle, int status, int events) {
646 if ((errno = -status)) diee("watcher: poll stdin");
650 if (!(errno==EINTR || errno==EWOULDBLOCK || errno==EAGAIN))
651 diee("watcher: read sentinel stdin");
655 static void watcher_cb_sockpath(uv_fs_event_t *handle, const char *filename,
656 int events, int status) {
658 struct stat now_stab;
660 if ((errno = -status)) diee("watcher: poll stdin");
662 r= stat(socket_path, &now_stab);
664 if (errno==ENOENT) _exit(0);
665 if (errno==EINTR) continue;
666 diee("stat socket: %s", socket_path);
668 if (!stabs_same_inode(&now_stab, &initial_stab))
673 // On entry, stderr is still inherited, but 0 and 1 are the pipes
674 static __attribute__((noreturn))
675 void become_watcher(void) {
677 uv_poll_t uvhandle_stdin;
678 uv_fs_event_t uvhandle_sockpath;
683 errno= -uv_loop_init(&loop);
684 if (errno) diee("watcher: uv_loop_init");
686 errno= -uv_poll_init(&loop, &uvhandle_stdin, 0);
687 if (errno) diee("watcher: uv_poll_init");
688 errno= -uv_poll_start(&uvhandle_stdin,
689 UV_READABLE | UV_WRITABLE | UV_DISCONNECT,
691 if (errno) diee("watcher: uv_poll_start");
693 errno= -uv_fs_event_init(&loop, &uvhandle_sockpath);
694 if (errno) diee("watcher: uv_fs_event_init");
696 errno= -uv_fs_event_start(&uvhandle_sockpath, watcher_cb_sockpath,
698 if (errno) diee("watcher: uv_fs_event_start");
700 // OK everything is set up, let us daemonise
701 if (dup2(1,2) != 2) diee("watcher: set daemonised stderr");
702 r= setvbuf(stderr, 0, _IOLBF, BUFSIZ);
703 if (r) diee("watcher: setvbuf stderr");
705 pid_t child = fork();
706 if (child == (pid_t)-1) diee("watcher: fork");
709 if (setsid() == (pid_t)-1) diee("watcher: setsid");
711 r= uv_run(&loop, UV_RUN_DEFAULT);
712 die("uv_run returned (%d)", r);
715 static __attribute__((noreturn))
716 void become_setup(int sfd, int lockfd, int fake_pair[2],
717 int watcher_stdin, int watcher_stderr) {
720 int call_fd = fake_pair[1];
722 int null_0 = open("/dev/null", O_RDONLY); if (null_0 < 0) diee("open null");
723 if (dup2(null_0, 0)) diee("dup2 /dev/null onto stdin");
725 if (dup2(2, 1) != 1) die("dup2 stderr onto stdout");
729 // Extension could work like this:
731 // We could advertise a new protocol (perhaps one which is nearly entirely
732 // different after the connect) by putting a name for it comma-separated
733 // next to "v1". Simple extension can be done by having the script
734 // side say something about it in the ack xdata, which we currently ignore.
735 // Or we could add other extra data after v1.
736 putenv(m_asprintf("PREFORK_INTERP=v1,%jd.%09ld %d,%d,%d,%d",
737 (intmax_t)initial_stab.st_mtim.tv_sec,
738 (long)initial_stab.st_mtim.tv_nsec,
739 sfd, call_fd, watcher_stdin, watcher_stderr));
741 execvp(executor_argv[0], (char**)executor_argv);
742 diee("execute %s", executor_argv[0]);
745 static void connect_or_spawn(void) {
748 call_sock = connect_existing();
749 if (call_sock) return;
751 // We're going to make a new one, so clean out old ones
754 int lockfd = acquire_lock();
756 if (mode == MODE_KILL) {
757 r= unlink(socket_path);
758 if (r && errno != ENOENT) diee("remove socket %s", socket_path);
760 r= unlink(lock_path);
761 if (r) diee("rmeove lock %s", lock_path);
765 call_sock = connect_existing();
766 if (call_sock) { close(lockfd); return; }
768 // We must start a fresh one, and we hold the lock
770 r = unlink(socket_path);
771 if (r<0 && errno!=ENOENT)
772 diee("failed to remove stale socket %s", socket_path);
774 int sfd = socket(AF_UNIX, SOCK_STREAM, 0);
775 if (sfd<0) diee("socket() for new listener");
777 socklen_t salen = sizeof(sockaddr_sun);
778 r= bind(sfd, (const struct sockaddr*)&sockaddr_sun, salen);
779 if (r<0) diee("bind() on new listener");
781 r= stat(socket_path, &initial_stab);
782 if (r<0) diee("stat() fresh socket");
784 // We never want callers to get ECONNREFUSED. But:
785 // There is a race here: from my RTFM they may get ECONNREFUSED
786 // if they try between our bind() and listen(). But if they do, they'll
787 // acquire the lock (serialising with us) and retry, and then it will work.
788 r = listen(sfd, INT_MAX);
789 if (r<0) diee("listen() for new listener");
793 int watcher_stdin[2];
794 int watcher_stderr[2];
795 if (pipe(watcher_stdin) || pipe(watcher_stderr))
796 diee("pipe() for socket inode watcher");
798 pid_t watcher = fork();
799 if (watcher == (pid_t)-1) diee("fork for watcher");
803 close(watcher_stdin[1]);
804 close(watcher_stderr[0]);
805 if (dup2(watcher_stdin[0], 0) != 0 ||
806 dup2(watcher_stderr[1], 1) != 1)
807 diee("initial dup2() for watcher");
808 close(watcher_stdin[0]);
809 close(watcher_stderr[1]);
813 close(watcher_stdin[0]);
814 close(watcher_stderr[1]);
815 nonblock(watcher_stderr[0]);
820 r = socketpair(AF_UNIX, SOCK_STREAM, 0, fake_pair);
821 if (r<0) diee("socketpair() for fake initial connection");
823 pid_t setup_pid = fork();
824 if (setup_pid == (pid_t)-1) diee("fork for spawn setup");
825 if (!setup_pid) become_setup(sfd, lockfd, fake_pair,
826 watcher_stdin[1], watcher_stderr[0]);
830 call_sock = call_sock_from_fd(fake_pair[0]);
833 pid_t got = waitpid(setup_pid, &status, 0);
834 if (got == (pid_t)-1) diee("waitpid setup [%ld]", (long)setup_pid);
835 if (got != setup_pid) diee("waitpid setup [%ld] gave [%ld]!",
836 (long)setup_pid, (long)got);
837 if (status != 0) propagate_exit_status(status, "setup");
839 const char *emsg = read_greeting();
840 if (emsg) die("setup failed: %s", emsg);
846 static void make_executor_argv(const char *const *argv) {
848 case MEDIATION_UNLAUNDERED: break;
849 default: die("need -U (specifying unlaundered argument handling)");
853 #define EACH_NEW_ARG(EACH) { \
854 arg = interp; { EACH } \
855 if ((arg = script)) { EACH } \
856 const char *const *walk = argv; \
857 while ((arg = *walk++)) { EACH } \
861 EACH_NEW_ARG( (void)arg; count++; );
863 const char **out = calloc(count, sizeof(char*));
864 executor_argv = (const char* const*)out;
865 if (!executor_argv) diee("allocate for arguments");
867 EACH_NEW_ARG( *out++ = arg; );
871 int main(int argc_unused, const char *const *argv) {
878 // which ought to be passed on to the actual executor.
879 make_executor_argv(argv);
882 FILLZERO(sockaddr_sun);
883 sockaddr_sun.sun_family = AF_UNIX;
884 assert(strlen(socket_path) <= sizeof(sockaddr_sun.sun_path));
885 strncpy(sockaddr_sun.sun_path, socket_path, sizeof(sockaddr_sun.sun_path));
889 // We're committed now, send the request (or bail out)
893 protocol_read(&status, sizeof(status));
895 status = ntohl(status);
896 if (status > INT_MAX) die("status 0x%lx does not fit in an int",
897 (unsigned long)status);
899 propagate_exit_status(status, "invocation");