2 * "Interpreter" that you can put in #! like this
3 * #!/usr/bin/prefork-interp [<options>] <interpreter>
6 * prefork-interp [<option> ..] <interpreter> [<script> [<args> ...]]
7 * prefork-interp [<option>,..],<interpreter> <script> [<args> ...]
8 * prefork-interp '[<option> ..] <interpreter>' <script> [<args> ...]
10 * Options must specify argument laundering mode.
11 * Currently the only mode supported is:
12 * -U unlaundered: setup and executor both get all arguments and env vars
13 * ident covers only env vars specified with -E
14 * ident covers only arguments interpreter and (if present) script
18 ***************************************************************************
20 State during service execution, process parentage and key fds
25 || listen watch-err/in
26 || call (accept) \ ,------2
27 || ,-----------------------------. SERVER -----0 WATCHER(C)
28 CLIENT 2--=fdpassed>=---------. \ || && | &&
29 (C) 1--=fdpassed>=---------. \ \ || inotify
30 0--=fdpassed>=---------. \ \ \ || sockpath
48 && session leader (daemon)
49 & process group leader
51 ***************************************************************************
53 Control flow and causality
61 attempt to connect, and read greeting
64 tidy up stale /run entries *1 (continue from send_fds, below)
67 retry attempt to connect, and read greeting
70 create listening socket release lock
73 | `------------------.
76 make "fake" initial call socketpair (C)
78 fork/exec #########################################################
79 | `-------------. application
82 | # script initialisation
84 | # ########|#############################################
86 | # identify fds from environment (Perl)
91 waitpid # fork for initial service
94 | # | SCRIPT [server] &&
96 | # | ** accept / event loop **
98 | # | / \ watch\ \idle
99 | # | fork child \stderr\ \timeout?
100 | # | _________/ | | |
102 | # SCRIPT [monitor] | eof?| |
105 read ,....<.....send greeting | | |
106 greeting # | ___________________
114 | # fork for executor (Perl)
115 | # |parent? \child? prefork-interp
116 | # | ######\############################
117 | # | # SCRIPT (executor) application
118 | # | # execute service
119 | # wait for read # |
120 | # (select) # terminates
122 | # | # kernel closes execterm
123 | # | ,......<....../|
126 | # | | ,......<...../
127 | # waitpid # _______________
129 | ,....<....,..send status #
130 read status # ________________ #
134 ********** Or, if client is killed **********
136 | # | # execute service
137 terminates # wait for read # |
142 _____________ # \|call? # |
144 # kill whole pgrp... # killed
146 # | | ,......<....../
147 # waitpid # _______________
150 # _____SIGPIPE______ #
152 | - \ / process control flow
153 ... < > causes mediated by fds or other IPC etc.
154 && session leader (daemon)
155 & process group leader
156 # language/implementation boundary
157 *1 line continued elsewhere
159 ______ process termination (after reaping, if shown)
161 ***************************************************************************
163 Sequence of events and fd plumbing.
164 NB INCOMPLETE - does not cover execterm, cleanup
166 client (C wrapper) connects to server
167 (including reading ack byte)
169 === acquires lock ===
170 makes new listening socket
172 forks watcher and awaits
173 makes first-instance socketpair
174 forks setup (script, sock fds indicated in env)
175 fd0, fd1, fd2: from-outer
176 other fd: call(client-end)(fake)
177 reaps setup (and reports error)
178 (implicitly releases lock)
180 watcher fd[012]: watcher pipes
181 starts watch on socket path
182 sets stderr to line buffered
183 sets stdin to nonblocking
184 daemonises (one fork, becomes session leader)
185 when socket stat changes, quit
187 setup (pre-exec) fd0: null,
188 fd[12]: fd2-from-outer
189 env fds: listener, call(server-end)(fake),
190 watcher read, watcher write
192 possibly clean env, argv
194 setup (script) runs initialisation parts of the script
195 at prefork establishment point:
196 setup (pm) [1] opens syslog
200 server (pm) [1] [fd0: null],
201 [fd[12]: fd2-from-outer]
203 right away, forks init monitor
204 [2] closes outer caller fds and call(fake)
205 [server (pm)] fd[012]: null
206 other fds: listener, syslog
207 runs in loop accepting and forking,
208 reaping and limiting children (incl init monitor)
209 reports failures of monitors to syslog
211 [client (C wrapper)] if client connect succeeds:
212 now fd: call(client-end)
213 sends message with: cmdline, env
216 [server (script)] accepts, forks subseq monitor
218 monitor [1] [fd0: null]
219 (init [fd[12]: init: fd2-from-outer; subseq: null]
220 or errors: init: fd2; subseq: syslog
221 subseq) other fds: syslog, call(server-end)
223 receives args, env, fds
226 executor sorts out fds:
227 fd0, fd1, fd2: from-outer
228 close fds: call(server-end)
232 runs main part of script
235 [monitor] [fd[012]: null]
236 [fd[12]: init: fd2-from-outer; subseq: null]
237 [errors: init: fd2; subseq: syslog]
239 reports status via socket
241 [client (C wrapper)] [fd0, fd1, fd2: from-outer]
242 [other fd: call(client-end)]
243 receives status, exits appropriately
244 (if was bad signal, reports to stderr, exits 127)
246 ***************************************************************************
250 #include <arpa/inet.h>
// Program name string. It has external linkage, so it is presumably consumed
// by shared helpers outside this view (TODO confirm).
256 const char our_name[] = "prefork-interp";
// Address of the server socket: filled in by main(), then passed to
// connect() in connect_existing() and bind() in connect_or_spawn().
258 static struct sockaddr_un sockaddr_sun;
// Stream wrapping the call socket. send_fd() takes its fd via fileno();
// send_request() and protocol_read_maybe() write to and read from it.
259 static FILE *call_sock;
// Acknowledgement byte value; its use is not in the visible lines.
261 #define ACK_BYTE '\n'
// Argument vector built by make_executor_argv() and handed to execvp()
// in become_setup().
263 static const char *const *executor_argv;
// Protocol magic that read_greeting() compares against the first bytes it
// reads. The array is exactly 4 bytes, so the literal's NUL is not stored.
265 static const char header_magic[4] = "PFI\n";
// Write the usage message to the stream f.
267 void fusagemessage(FILE *f) {
268 fprintf(f, "usage: #!/usr/bin/prefork-interp [<options>]\n");
// Values for `mode`. connect_existing() only attempts a connection when
// mode is MODE_NORMAL; connect_or_spawn() removes the socket and lock
// files when mode is MODE_KILL.
271 #define MODE_NORMAL 0
272 #define MODE_KILL 'k'
273 #define MODE_FRESH 'f'
// Values for `mediation` (the argument laundering mode). The switch in
// make_executor_argv() accepts only MEDIATION_UNLAUNDERED and dies otherwise.
275 #define MEDIATION_UNSPECIFIED 0
276 #define MEDIATION_UNLAUNDERED 'U'
// Both of these are targets of .iassignto entries in the cmdinfos table.
278 static int mediation = MEDIATION_UNSPECIFIED;
279 static int mode = MODE_NORMAL;
280 static int max_sockets = 100; // maximum entries in the run dir is 2x this
// stat() result for the freshly bound socket, recorded in connect_or_spawn();
// watcher_cb_sockpath() compares later stat() results against it.
282 static struct stat initial_stab;
// Command-line option table. struct cmdinfo is declared elsewhere; the
// entries visible here presumably store .arg into *.iassignto when the
// option is seen (TODO confirm against the option parser):
//   'U'    -> mediation = MEDIATION_UNLAUNDERED
//   "kill" -> mode = MODE_KILL
//   'f'    -> mode = MODE_FRESH
284 const struct cmdinfo cmdinfos[]= {
286 { 0, 'U', 0, .iassignto= &mediation, .arg= MEDIATION_UNLAUNDERED },
287 { "kill", 0, 0, .iassignto= &mode, .arg= MODE_KILL },
288 { 0, 'f', 0, .iassignto= &mode, .arg= MODE_FRESH },
// Feed a fixed one-byte magic value (a single zero byte) into the identsc
// SHA-256 context. identsc is defined outside this view.
292 void ident_addinit(void) {
293 char ident_magic[1] = { 0 };
294 sha256_update(&identsc, sizeof(ident_magic), ident_magic);
// Act on a child's wait status: `status` is the value filled in by waitpid(),
// `what` names the child in error messages. The signal and fallback branches
// visible here all end in die()/diee().
297 static void propagate_exit_status(int status, const char *what) {
// Normal-exit case; its handling is not in the visible lines.
300 if (WIFEXITED(status)) {
// Child was terminated by a signal.
304 if (WIFSIGNALED(status)) {
305 int sig = WTERMSIG(status);
306 const char *signame = strsignal(sig);
307 if (signame == 0) signame = "unknown signal";
// When there was no core dump (and a further condition not visible here
// holds), prepare to die from the same signal: restore the default
// disposition and unblock the signal.
309 if (! WCOREDUMP(status) &&
316 sa.sa_handler = SIG_DFL;
317 r = sigaction(sig, &sa, 0);
318 if (r) diee("failed to reset signal handler while propagating %s",
323 sigaddset(&sset, sig);
324 r = sigprocmask(SIG_UNBLOCK, &sset, 0);
325 if (r) diee("failed to reset signal block while propagating %s",
// Reached only if the signal (presumably raised on a line not shown) did
// not terminate the process.
329 die("unexpectedly kept running after raising (to propagate) %s",
// Otherwise report the signal, noting whether core was dumped.
333 die("%s failed due to signal %d %s%s", what, sig, signame,
334 WCOREDUMP(status) ? " (core dumped)" : "");
// Neither exited nor signalled: report the raw status in decimal and hex.
337 die("%s failed with weird wait status %d 0x%x", what, status, status);
// qsort() comparator: orders PrecleanEntry elements by name_hash using
// strcmp(), so entries with equal hashes end up adjacent.
345 static int preclean_entry_compar_name(const void *av, const void *bv) {
346 const PrecleanEntry *a = av;
347 const PrecleanEntry *b = bv;
348 return strcmp(a->name_hash, b->name_hash);
// qsort() comparator keyed on the atime field of PrecleanEntry. The actual
// comparison (and hence the sort direction) is on lines not visible here.
351 static int preclean_entry_compar_atime(const void *av, const void *bv) {
352 const PrecleanEntry *ae = av; time_t a = ae->atime;
353 const PrecleanEntry *be = bv; time_t b = be->atime;
// Return the access time of s_path as reported by lstat() (so a symlink
// itself is examined, not its target). Any lstat() error other than ENOENT
// is fatal; the value returned for ENOENT is on a line not visible here.
358 static time_t preclean_stat_atime(const char *s_path) {
360 int r= lstat(s_path, &stab);
362 if (errno!=ENOENT) diee("pre-cleanup: stat socket (%s)", s_path);
365 return stab.st_atime;
// Pre-cleanup of the run directory (run_base): collect the entries whose
// names start with 'l' or 's', dedupe them by the rest of the name, and,
// when there are more than max_sockets of them, delete some stale ones.
368 static void preclean(void) {
369 DIR *dir = opendir(run_base);
// A missing run directory just means there is nothing to clean.
371 if (errno == ENOENT) return;
372 diee("pre-cleanup: open run dir (%s)", run_base);
// Growable array of (name_hash, atime) records.
375 PrecleanEntry *entries=0;
376 size_t avail_entries=0;
377 size_t used_entries=0;
// errno is cleared before each readdir() so that end-of-directory can be
// told apart from a read error after the loop.
380 while ((errno = 0, de = readdir(dir))) {
381 char c0 = de->d_name[0];
382 if (!(c0 == 'l' || c0 == 's')) continue;
// The name minus its first character identifies the instance; the atime is
// always taken from the corresponding "s" path, whichever file was listed.
383 char *name_hash = m_asprintf("%s", de->d_name+1);
384 char *s_path = m_asprintf("%s/s%s", run_base, name_hash);
385 time_t atime = preclean_stat_atime(s_path);
// Array full: grow it (the new capacity is computed on lines not shown).
// NOTE(review): no check of realloc()'s result is visible here, and the
// old pointer is overwritten directly -- confirm failure is handled.
387 if (avail_entries == used_entries) {
388 assert(avail_entries < INT_MAX / 4 / sizeof(PrecleanEntry));
391 entries = realloc(entries, avail_entries * sizeof(PrecleanEntry));
393 entries[used_entries].name_hash = name_hash;
394 entries[used_entries].atime = atime;
397 if (errno) diee("pre-cleanup: read run dir (%s)", run_base);
399 // First we dedupe (after sorting by path)
400 qsort(entries, used_entries, sizeof(PrecleanEntry),
401 preclean_entry_compar_name);
// p scans every entry, q is the write position for kept entries; an entry
// equal to the last kept one is a duplicate.
402 PrecleanEntry *p, *q;
403 for (p=entries, q=entries; p < entries + used_entries; p++) {
404 if (q > entries && !strcmp(p->name_hash, (q-1)->name_hash))
408 used_entries = q - entries;
410 // Now maybe delete some things
412 // Actually this has an off-by-one error since we are about
413 // to create a socket, so the actual number of sockets is one more.
414 // But, *actually*, since there might be multiple of us running at once,
415 // we might have even more than that. This doesn't really matter.
416 if (used_entries > max_sockets) {
417 qsort(entries, used_entries, sizeof(PrecleanEntry),
418 preclean_entry_compar_atime);
// Walk the first max_sockets entries of the atime-sorted array, taking each
// entry's lock file before touching its socket.
419 for (p=entries; p < entries + max_sockets; p++) {
420 char *l_path = m_asprintf("%s/l%s", run_base, p->name_hash);
421 char *s_path = m_asprintf("%s/s%s", run_base, p->name_hash);
422 int lock_fd = flock_file(l_path);
423 // Recheck atime - we might have raced!
424 time_t atime = preclean_stat_atime(s_path);
425 if (atime != p->atime) {
426 // Raced. This will leave us deleting too few things. Whatever.
428 int r= unlink(s_path);
429 if (r && errno!=ENOENT) diee("preclean: delete stale (%s)", s_path);
// NOTE(review): this message is about the lock but prints s_path; the call
// whose result is tested is on a line not shown -- confirm which path is
// intended.
431 if (r) diee("preclean: delete stale lock (%s)", s_path);
432 // NB we don't hold the lock any more now.
// Final pass over all remaining entries (loop body not visible here).
440 for (p=entries; p < entries + used_entries; p++)
// Fatal error used when a request item is too long for the protocol's
// length fields; declared noreturn and implemented via die().
445 static __attribute((noreturn)) void die_data_overflow(void) {
446 die("cannot handle data with length >2^32");
// Account for and/or emit dl bytes of `data` as part of a message being
// assembled. send_request() calls the prepare_* family once with only `len`
// set (sizing) and once with only `buf` set (writing), so the accesses to
// *len and *buf below are presumably guarded by null checks on lines not
// visible here (TODO confirm).
449 static void prepare_data(size_t *len, char **buf,
450 const void *data, size_t dl) {
// Reject a length that would overflow the running total.
452 if (dl >= SIZE_MAX - *len)
// Copy the payload to the current output position.
457 memcpy(*buf, data, dl);
// Add a length field to the message: dl_sz is converted with htonl() into a
// uint32_t in network byte order and passed to prepare_data(). Values above
// UINT32_MAX are fatal (die_data_overflow()).
462 static void prepare_length(size_t *len, char **buf, size_t dl_sz) {
463 if (dl_sz > UINT32_MAX) die_data_overflow();
464 uint32_t dl = htonl(dl_sz);
465 prepare_data(len, buf, &dl, sizeof(dl));
// Add the string s to the message, including its terminating NUL byte
// (hence the sl+1).
468 static void prepare_string(size_t *len, char **buf, const char *s) {
469 size_t sl = strlen(s);
470 prepare_data(len, buf, s, sl+1);
// Assemble the request body out of NUL-terminated strings. Strings are taken
// starting from `environ`, followed by an empty string, followed by further
// strings; the loops that select the strings are on lines not visible here.
// (The header comment describes the message as carrying cmdline and env.)
473 static void prepare_message(size_t *len, char **buf) {
476 const char *const *p = (void*)environ;
479 prepare_string(len, buf, s);
// An empty string follows the first group of strings.
482 prepare_string(len, buf, "");
486 prepare_string(len, buf, s);
// Pass the file descriptor payload_fd to the peer of call_sock, using
// sendmsg() with an SCM_RIGHTS control message at level SOL_SOCKET.
// sendmsg() is retried when it fails with EINTR.
489 static void send_fd(int payload_fd) {
490 int via_fd = fileno(call_sock);
// Control buffer sized for one int; the struct cmsghdr member alongside it
// (presumably in a union declared on lines not shown) serves alignment.
493 struct cmsghdr align;
494 char buf[CMSG_SPACE(sizeof(payload_fd))];
// The ordinary data part of the message is a dummy byte.
504 iov.iov_base = &dummy_byte;
510 msg.msg_control = cmsg_buf.buf;
511 msg.msg_controllen = sizeof(cmsg_buf.buf);
513 struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
514 cmsg->cmsg_level = SOL_SOCKET;
515 cmsg->cmsg_type = SCM_RIGHTS;
516 cmsg->cmsg_len = CMSG_LEN(sizeof(payload_fd));
517 *(int*)CMSG_DATA(cmsg) = payload_fd;
// Same value as assigned to msg_controllen above.
519 msg.msg_controllen = sizeof(cmsg_buf.buf);
522 ssize_t r = sendmsg(via_fd, &msg, 0);
524 if (errno == EINTR) continue;
// Build the request message and write it to call_sock. The message is
// produced in two passes over prepare_message(): the first only measures it,
// the second fills a buffer of exactly that size plus a 4-byte length prefix.
532 static void send_request(void) {
// (The step this comment refers to is on lines not visible here.)
533 // Sending these first makes it easier for the script to
534 // use buffered IO for the message.
// Pass 1: compute the body length only (no output buffer).
540 prepare_message(&len, 0);
// 4 extra bytes for the uint32 length written by prepare_length().
542 size_t tlen = len + 4;
543 char *m = xmalloc(tlen);
// Pass 2: write the length prefix, then the body, advancing p.
545 prepare_length(0, &p, len);
546 prepare_message(0, &p);
// Both passes must agree on the size.
547 assert(p == m + tlen);
549 ssize_t sr = fwrite(m, tlen, 1, call_sock);
550 if (sr != 1) diee("write request (buffer)");
552 if (fflush(call_sock)) diee("write request");
// Wrap the socket descriptor fd in a read/write stdio stream with buffering
// turned off (_IONBF). Failure of fdopen() or setvbuf() is fatal.
// Note: the local named call_sock shadows the file-scope variable of the
// same name; the global is not assigned by the lines visible here.
555 static FILE *call_sock_from_fd(int fd) {
558 FILE *call_sock = fdopen(fd, "r+");
559 if (!call_sock) diee("fdopen socket");
561 r = setvbuf(call_sock, 0, _IONBF, 0);
562 if (r) die("setvbuf socket");
// True if the stream is at end-of-file or errno is ECONNRESET, i.e. a
// connection reset is treated the same as EOF. Relies on errno still holding
// the value from the preceding failed read.
567 static bool was_eof(FILE *call_sock) {
568 return feof(call_sock) || errno==ECONNRESET;
// Read exactly sz bytes from call_sock into data, as one fread() item.
// Returns -1 if the read failed because of EOF (see was_eof()); any other
// failure is fatal. The success return is on a line not visible here.
572 static int protocol_read_maybe(void *data, size_t sz) {
574 size_t sr = fread(data, sz, 1, call_sock);
576 if (was_eof(call_sock)) return -1;
// NOTE(review): sz is a size_t, for which the matching conversion is %zu;
// %zd is for the signed counterpart.
577 diee("read() on monitor call socket (%zd)", sz);
// Like protocol_read_maybe(), but EOF is also fatal: the peer is reported
// as having quit unexpectedly.
582 static void protocol_read(void *data, size_t sz) {
583 if (protocol_read_maybe(data, sz) < 0)
584 die("monitor process quit unexpectedly");
// Read and check the greeting sent on call_sock.
587 // Returns 0 if OK, error msg if peer was garbage.
588 static const char *read_greeting(void) {
589 char got_magic[sizeof(header_magic)];
// EOF before the magic arrives is the recoverable case: return a message.
591 if (protocol_read_maybe(&got_magic, sizeof(got_magic)) < 0)
592 return "initial monitor process quit";
// Wrong magic is fatal; the four bytes received are shown in hex.
594 if (memcmp(got_magic, header_magic, sizeof(header_magic)))
595 die("got unexpected protocol magic 0x%02x%02x%02x%02x",
596 got_magic[0], got_magic[1], got_magic[2], got_magic[3]);
// Then a length followed by that many bytes of extra data, read into a
// freshly allocated buffer. Any byte-order conversion of xdata_len is on a
// line not visible here.
599 protocol_read(&xdata_len, sizeof(xdata_len));
600 void *xdata = xmalloc(xdata_len);
601 protocol_read(xdata, xdata_len);
// Try to connect to an already-running server at sockaddr_sun.
606 // Returns: call(client-end), or 0 to mean "is garbage"
607 // find_socket_path must have been called
608 static FILE *connect_existing(void) {
// In any mode other than MODE_NORMAL no connection is attempted.
612 if (mode != MODE_NORMAL) return 0;
614 fd = socket(AF_UNIX, SOCK_STREAM, 0);
615 if (fd==-1) diee("socket() for client");
617 socklen_t salen = sizeof(sockaddr_sun);
618 r = connect(fd, (const struct sockaddr*)&sockaddr_sun, salen);
// A refused connection or a missing socket file means there is no usable
// server ("garbage"); other connect() errors are fatal.
620 if (errno==ECONNREFUSED || errno==ENOENT) goto x_garbage;
621 diee("connect() %s", socket_path);
624 call_sock = call_sock_from_fd(fd);
// Cleanup: close the stream if one was made, and the descriptor if it is
// still valid.
633 if (call_sock) { fclose(call_sock); call_sock=0; }
634 if (fd >= 0) close(fd);
// libuv poll callback for the watcher's stdin. A nonzero status is turned
// into errno (libuv reports errors as negated errno values) and is fatal.
638 static void watcher_cb_stdin(uv_poll_t *handle, int status, int events) {
642 if ((errno = -status)) diee("watcher: poll stdin");
// A read failure (the read itself is on a line not shown) is fatal unless
// it was EINTR, EWOULDBLOCK or EAGAIN.
646 if (!(errno==EINTR || errno==EWOULDBLOCK || errno==EAGAIN))
647 diee("watcher: read sentinel stdin");
// libuv fs-event callback for the socket path: re-stat socket_path and
// compare the result with initial_stab.
651 static void watcher_cb_sockpath(uv_fs_event_t *handle, const char *filename,
652 int events, int status) {
654 struct stat now_stab;
// NOTE(review): this is the fs-event callback, yet the message says
// "poll stdin" -- looks copied from watcher_cb_stdin; confirm wording.
656 if ((errno = -status)) diee("watcher: poll stdin");
658 r= stat(socket_path, &now_stab);
// Socket gone: exit successfully. EINTR: retry. Anything else is fatal.
660 if (errno==ENOENT) _exit(0);
661 if (errno==EINTR) continue;
662 diee("stat socket: %s", socket_path);
// A different inode now lives at the path (action on a line not shown).
664 if (!stabs_same_inode(&now_stab, &initial_stab))
// Turn this process into the watcher: set up a libuv loop with a poll
// handle on fd 0 and an fs-event handle, then daemonise and run the loop.
// Declared noreturn; if uv_run() returns that is reported via die().
669 // On entry, stderr is still inherited, but 0 and 1 are the pipes
670 static __attribute__((noreturn))
671 void become_watcher(void) {
673 uv_poll_t uvhandle_stdin;
674 uv_fs_event_t uvhandle_sockpath;
// libuv calls return 0 or a negated errno; negating lets diee() report it.
679 errno= -uv_loop_init(&loop);
680 if (errno) diee("watcher: uv_loop_init");
// Poll file descriptor 0 (stdin).
682 errno= -uv_poll_init(&loop, &uvhandle_stdin, 0);
683 if (errno) diee("watcher: uv_poll_init");
684 errno= -uv_poll_start(&uvhandle_stdin,
685 UV_READABLE | UV_WRITABLE | UV_DISCONNECT,
687 if (errno) diee("watcher: uv_poll_start");
689 errno= -uv_fs_event_init(&loop, &uvhandle_sockpath);
690 if (errno) diee("watcher: uv_fs_event_init");
692 errno= -uv_fs_event_start(&uvhandle_sockpath, watcher_cb_sockpath,
694 if (errno) diee("watcher: uv_fs_event_start");
696 // OK everything is set up, let us daemonise
// From here stderr is a copy of fd 1 (a pipe, per the comment above),
// line-buffered.
697 if (dup2(1,2) != 2) diee("watcher: set daemonised stderr");
698 r= setvbuf(stderr, 0, _IOLBF, BUFSIZ);
699 if (r) diee("watcher: setvbuf stderr");
// One fork followed by setsid(); what the parent side does is on lines
// not visible here.
701 pid_t child = fork();
702 if (child == (pid_t)-1) diee("watcher: fork");
705 if (setsid() == (pid_t)-1) diee("watcher: setsid");
707 r= uv_run(&loop, UV_RUN_DEFAULT);
708 die("uv_run returned (%d)", r);
// Child side of the spawn: arrange fds 0 and 1, publish the inherited fd
// numbers in the PREFORK_INTERP environment variable, and exec the
// interpreter (executor_argv). Declared noreturn: exec failure is fatal.
//   sfd            listening socket
//   lockfd         lock fd (not used in the visible lines)
//   fake_pair      socketpair; element [1] is the end passed to the script
//   watcher_stdin, watcher_stderr
//                  fds for the watcher's pipes
711 static __attribute__((noreturn))
712 void become_setup(int sfd, int lockfd, int fake_pair[2],
713 int watcher_stdin, int watcher_stderr) {
716 int call_fd = fake_pair[1];
// stdin becomes /dev/null. dup2() returns the target fd on success, so a
// zero result is the success case here.
718 int null_0 = open("/dev/null", O_RDONLY); if (null_0 < 0) diee("open null");
719 if (dup2(null_0, 0)) diee("dup2 /dev/null onto stdin");
// stdout becomes a copy of stderr.
721 if (dup2(2, 1) != 1) die("dup2 stderr onto stdout");
725 // Extension could work like this:
727 // We advertise a new protocol (perhaps one which is nearly entirely
728 // different after the connect) by putting a name for it comma-separated
729 // next to "v1". Simple extension can be done by having the script
730 // side say something about it in the ack xdata, which we currently ignore.
// Format: "v1" then a space, then four comma-separated fd numbers in the
// order listener, call, watcher stdin, watcher stderr.
731 putenv(m_asprintf("PREFORK_INTERP=v1 %d,%d,%d,%d",
732 sfd, call_fd, watcher_stdin, watcher_stderr));
// The cast drops const because execvp() takes char *const[].
734 execvp(executor_argv[0], (char**)executor_argv);
735 diee("execute %s", executor_argv[0]);
// Establish call_sock: use an existing server if one answers; otherwise,
// under the lock, create a new listening socket, fork a watcher and a setup
// process, and read the greeting over a socketpair.
738 static void connect_or_spawn(void) {
// Fast path: a server is already there.
741 call_sock = connect_existing();
742 if (call_sock) return;
744 // We're going to make a new one, so clean out old ones
747 int lockfd = acquire_lock();
// Kill mode: remove the socket (which may already be gone) and the lock.
749 if (mode == MODE_KILL) {
750 r= unlink(socket_path);
751 if (r && errno != ENOENT) diee("remove socket %s", socket_path);
753 r= unlink(lock_path);
// NOTE(review): "rmeove" in this runtime message looks like a typo for
// "remove".
754 if (r) diee("rmeove lock %s", lock_path);
// Retry now that the lock is held: another process may have started the
// server in the meantime.
758 call_sock = connect_existing();
759 if (call_sock) { close(lockfd); return; }
761 // We must start a fresh one, and we hold the lock
763 r = unlink(socket_path);
764 if (r<0 && errno!=ENOENT)
765 diee("failed to remove stale socket %s", socket_path);
767 int sfd = socket(AF_UNIX, SOCK_STREAM, 0);
768 if (sfd<0) diee("socket() for new listener");
770 socklen_t salen = sizeof(sockaddr_sun);
771 r= bind(sfd, (const struct sockaddr*)&sockaddr_sun, salen);
772 if (r<0) diee("bind() on new listener");
// Record the new socket's stat data; watcher_cb_sockpath() compares
// against it.
774 r= stat(socket_path, &initial_stab);
775 if (r<0) diee("stat() fresh socket");
777 // We never want callers to get ECONNREFUSED. But:
778 // There is a race here: from my RTFM they may get ECONNREFUSED
779 // if they try between our bind() and listen(). But if they do, they'll
780 // acquire the lock (serialising with us) and retry, and then it will work.
781 r = listen(sfd, INT_MAX);
782 if (r<0) diee("listen() for new listener");
// Two pipes for the watcher: one read by it as stdin, one written by it
// as stdout.
786 int watcher_stdin[2];
787 int watcher_stderr[2];
788 if (pipe(watcher_stdin) || pipe(watcher_stderr))
789 diee("pipe() for socket inode watcher");
791 pid_t watcher = fork();
792 if (watcher == (pid_t)-1) diee("fork for watcher");
// Watcher side (the branch condition is on a line not shown): drop the
// unused pipe ends and install the others as fds 0 and 1.
796 close(watcher_stdin[1]);
797 close(watcher_stderr[0]);
798 if (dup2(watcher_stdin[0], 0) != 0 ||
799 dup2(watcher_stderr[1], 1) != 1)
800 diee("initial dup2() for watcher");
801 close(watcher_stdin[0]);
802 close(watcher_stderr[1]);
// Other side: keep the write end of the stdin pipe and the read end of the
// stderr pipe, the latter made non-blocking.
806 close(watcher_stdin[0]);
807 close(watcher_stderr[1]);
808 nonblock(watcher_stderr[0]);
// Socketpair standing in for the first ("fake") client connection.
813 r = socketpair(AF_UNIX, SOCK_STREAM, 0, fake_pair);
814 if (r<0) diee("socketpair() for fake initial connection");
816 pid_t setup_pid = fork();
817 if (setup_pid == (pid_t)-1) diee("fork for spawn setup");
// Child: never returns from become_setup().
818 if (!setup_pid) become_setup(sfd, lockfd, fake_pair,
819 watcher_stdin[1], watcher_stderr[0]);
// Parent keeps end [0] of the socketpair as its call socket.
823 call_sock = call_sock_from_fd(fake_pair[0]);
// Reap the setup process; a nonzero wait status is handed to
// propagate_exit_status().
826 pid_t got = waitpid(setup_pid, &status, 0);
827 if (got == (pid_t)-1) diee("waitpid setup [%ld]", (long)setup_pid);
828 if (got != setup_pid) diee("waitpid setup [%ld] gave [%ld]!",
829 (long)setup_pid, (long)got);
830 if (status != 0) propagate_exit_status(status, "setup");
832 const char *emsg = read_greeting();
833 if (emsg) die("setup failed: %s", emsg);
// Build executor_argv from the interpreter, the script (if any) and the
// remaining entries of argv. Only unlaundered mediation is accepted.
839 static void make_executor_argv(const char *const *argv) {
841 case MEDIATION_UNLAUNDERED: break;
842 default: die("need -U (specifying unlaundered argument handling)");
// EACH_NEW_ARG runs the statement(s) EACH once per output argument, with
// `arg` set in turn to interp, script (skipped when null), and each argv
// entry up to the terminating null pointer.
846 #define EACH_NEW_ARG(EACH) { \
847 arg = interp; { EACH } \
848 if ((arg = script)) { EACH } \
849 const char *const *walk = argv; \
850 while ((arg = *walk++)) { EACH } \
// First use: count the arguments. Room for the terminating null that
// execvp() needs is presumably included in count's initial value, which is
// on a line not shown (TODO confirm).
854 EACH_NEW_ARG( (void)arg; count++; );
// calloc() zero-fills, so slots not written below stay null.
856 const char **out = calloc(count, sizeof(char*));
857 executor_argv = (const char* const*)out;
858 if (!executor_argv) diee("allocate for arguments");
// Second use: store each argument in order.
860 EACH_NEW_ARG( *out++ = arg; );
// Entry point. The steps visible here: build the executor argv, fill in the
// socket address, read the 32-bit status sent by the server, and act on it
// via propagate_exit_status().
864 int main(int argc_unused, const char *const *argv) {
871 // which ought to be passed on to the actual executor.
872 make_executor_argv(argv);
// Zero the address, then copy the socket path in.
// NOTE(review): the assert permits strlen(socket_path) to equal
// sizeof(sun_path); in that case strncpy() leaves sun_path with no NUL
// terminator -- confirm that is intended.
875 FILLZERO(sockaddr_sun);
876 sockaddr_sun.sun_family = AF_UNIX;
877 assert(strlen(socket_path) <= sizeof(sockaddr_sun.sun_path));
878 strncpy(sockaddr_sun.sun_path, socket_path, sizeof(sockaddr_sun.sun_path));
882 // We're committed now, send the request (or bail out)
// The status arrives in network byte order and must fit in an int before
// it is treated as a wait status.
886 protocol_read(&status, sizeof(status));
888 status = ntohl(status);
889 if (status > INT_MAX) die("status 0x%lx does not fit in an int",
890 (unsigned long)status);
892 propagate_exit_status(status, "invocation");