2 * "Interpreter" that you can put in #! like this
3 * #!/usr/bin/prefork-interp [<options>] <interpreter>
6 * prefork-interp [<option> ..] <interpreter> [<script> [<args> ...]]
7 * prefork-interp [<option>,..],<interpreter> <script> [<args> ...]
8 * prefork-interp '[<option> ..] <interpreter>' <script> [<args> ...]
10 * Options must specify argument laundering mode.
11 * Currently the only mode supported is:
12 * -U unlaundered: setup and executor both get all arguments and env vars
13 * ident covers only env vars specified with -E
14 * ident covers only arguments interpreter and (if present) script
18 ***************************************************************************
20 State during service execution, process parentage and key fds
25 || listen watch-err/in
26 || call (accept) \ ,------2
27 || ,-----------------------------. SERVER -----0 WATCHER(C)
28 CLIENT 2--=fdpassed>=---------. \ || && | &&
29 (C) 1--=fdpassed>=---------. \ \ || inotify
30 0--=fdpassed>=---------. \ \ \ || sockpath
48 && session leader (daemon)
49 & process group leader
51 ***************************************************************************
53 Control flow and causality
61 attempt to connect, and read greeting
64 tidy up stale /run entries *1 (continue from send_fds, below)
67 retry attempt to connect, and read greeting
70 create listening socket release lock
73 | `------------------.
76 make "fake" initial call socketpair (C)
78 fork/exec #########################################################
79 | `-------------. application
82 | # script initialisation
84 | # ########|#############################################
86 | # identify fds from environment (Perl)
91 waitpid # fork for initial service
94 | # | SCRIPT [server] &&
96 | # | ** accept / event loop **
98 | # | / \ watch\ \idle
99 | # | fork child \stderr\ \timeout?
100 | # | _________/ | | |
102 | # SCRIPT [monitor] | eof?| |
105 read ,....<.....send greeting | | |
106 greeting # | ___________________
114 | # fork for executor (Perl)
115 | # |parent? \child? prefork-interp
116 | # | ######\############################
117 | # | # SCRIPT (executor) application
118 | # | # execute service
119 | # wait for read # |
120 | # (select) # terminates
122 | # | # kernel closes execterm
123 | # | ,......<....../|
126 | # | | ,......<...../
127 | # waitpid # _______________
129 | ,....<....,..send status #
130 read status # ________________ #
134 ********** Or, if client is killed **********
136 | # | # execute service
137 terminates # wait for read # |
142 _____________ # \|call? # |
144 # kill whole pgrp... # killed
146 # | | ,......<....../
147 # waitpid # _______________
150 # _____SIGPIPE______ #
152 | - \ / process control flow
153 ... < > causes mediated by fds or other IPC etc.
154 && session leader (daemon)
155 & process group leader
156 # language/implementation boundary
157 *1 line continued elsewhere
159 ______ process termination (after reaping, if shown)
161 ***************************************************************************
163 Sequence of events and fd plumbing.
164 NB INCOMPLETE - does not cover execterm, cleanup
166 client (C wrapper) connects to server
167 (including reading ack byte)
169 === acquires lock ===
170 makes new listening socket
172 forks watcher and awaits
173 makes first-instance socketpair
174 forks setup (script, sock fds indicated in env)
175 fd0, fd1, fd2: from-outer
176 other fd: call(client-end)(fake)
177 reaps setup (and reports error)
178 (implicitly releases lock)
180 watcher fd[012]: watcher pipes
181 starts watch on socket path
182 sets stderr to line buffered
183 sets stdin to nonblocking
184 daemonises (one fork, becomes session leader)
185 when socket stat changes, quit
187 setup (pre-exec) fd0: null,
188 fd[12]: fd2-from-outer
189 env fds: listener, call(server-end)(fake),
190 watcher read, watcher write
192 possibly clean env, argv
194 setup (script) runs initialisation parts of the script
195 at prefork establishment point:
196 setup (pm) [1] opens syslog
200 server (pm) [1] [fd0: null],
201 [fd[12]: fd2-from-outer]
203 right away, forks init monitor
204 [2] closes outer caller fds and call(fake)
205 [server (pm)] fd[012]: null
206 other fds: listener, syslog
207 runs in loop accepting and forking,
208 reaping and limiting children (incl init monitor)
209 reports failures of monitors to syslog
211 [client (C wrapper)] if client connect succeeds:
212 now fd: call(client-end)
213 sends message with: cmdline, env
216 [server (script)] accepts, forks subseq monitor
218 monitor [1] [fd0: null]
219 (init [fd[12]: init: fd2-from-outer; subseq: null]
220 or errors: init: fd2; subseq: syslog
221 subseq) other fds: syslog, call(server-end)
223 receives args, env, fds
226 executor sorts out fds:
227 fd0, fd1, fd2: from-outer
228 close fds: call(server-end)
232 runs main part of script
235 [monitor] [fd[012]: null]
236 [fd[12]: init: fd2-from-outer; subseq: null]
237 [errors: init: fd2; subseq: syslog]
239 reports status via socket
241 [client (C wrapper)] [fd0, fd1, fd2: from-outer]
242 [other fd: call(client-end)]
243 receives status, exits appropriately
244 (if was bad signal, reports to stderr, exits 127)
246 ***************************************************************************
250 #include <arpa/inet.h>
256 const char our_name[] = "prefork-interp";
258 static struct sockaddr_un sockaddr_sun;
259 static FILE *call_sock;
261 #define ACK_BYTE '\n'
263 static const char *const *executor_argv;
265 static const char header_magic[4] = "PFI\n";
267 void fusagemessage(FILE *f) {
268 fprintf(f, "usage: #!/usr/bin/prefork-interp [<options>]\n");
271 static int laundering;
272 static int max_sockets = 100; // maximum entries in the run dir is 2x this
274 static struct stat initial_stab;
276 const struct cmdinfo cmdinfos[]= {
278 { 0, 'U', 0, .iassignto= &laundering, .arg= 'U' },
// Feeds the one-byte ident_magic value (currently 0) into the SHA-256
// context identsc (declared outside this excerpt).
// NOTE(review): presumably this is the first contribution to the ident
// hash - confirm at the call site.
282 void ident_addinit(void) {
283 char ident_magic[1] = { 0 };
284 sha256_update(&identsc, sizeof(ident_magic), ident_magic);
287 static void propagate_exit_status(int status, const char *what) {
290 if (WIFEXITED(status)) {
294 if (WIFSIGNALED(status)) {
295 int sig = WTERMSIG(status);
296 const char *signame = strsignal(sig);
297 if (signame == 0) signame = "unknown signal";
299 if (! WCOREDUMP(status) &&
306 sa.sa_handler = SIG_DFL;
307 r = sigaction(sig, &sa, 0);
308 if (r) diee("failed to reset signal handler while propagating %s",
313 sigaddset(&sset, sig);
314 r = sigprocmask(SIG_UNBLOCK, &sset, 0);
315 if (r) diee("failed to reset signal block while propagating %s",
319 die("unexpectedly kept running after raising (to propagate) %s",
323 die("%s failed due to signal %d %s%s", what, sig, signame,
324 WCOREDUMP(status) ? " (core dumped)" : "");
327 die("%s failed with weird wait status %d 0x%x", what, status, status);
// qsort() comparator: orders PrecleanEntry elements by their name_hash
// string using strcmp().  preclean() sorts with this before its dedupe
// pass, which compares each entry's name_hash with the previous one kept.
335 static int preclean_entry_compar_name(const void *av, const void *bv) {
336 const PrecleanEntry *a = av;
337 const PrecleanEntry *b = bv;
338 return strcmp(a->name_hash, b->name_hash);
341 static int preclean_entry_compar_atime(const void *av, const void *bv) {
342 const PrecleanEntry *ae = av; time_t a = ae->atime;
343 const PrecleanEntry *be = bv; time_t b = be->atime;
348 static time_t preclean_stat_atime(const char *s_path) {
350 int r= lstat(s_path, &stab);
352 if (errno!=ENOENT) diee("pre-cleanup: stat socket (%s)", s_path);
355 return stab.st_atime;
358 static void preclean(void) {
359 DIR *dir = opendir(run_base);
361 if (errno == ENOENT) return;
362 diee("pre-cleanup: open run dir (%s)", run_base);
365 PrecleanEntry *entries=0;
366 size_t avail_entries=0;
367 size_t used_entries=0;
370 while ((errno = 0, de = readdir(dir))) {
371 char c0 = de->d_name[0];
372 if (!(c0 == 'l' || c0 == 's')) continue;
373 char *name_hash = m_asprintf("%s", de->d_name+1);
374 char *s_path = m_asprintf("%s/s%s", run_base, name_hash);
375 time_t atime = preclean_stat_atime(s_path);
377 if (avail_entries == used_entries) {
378 assert(avail_entries < INT_MAX / 4 / sizeof(PrecleanEntry));
381 entries = realloc(entries, avail_entries * sizeof(PrecleanEntry));
383 entries[used_entries].name_hash = name_hash;
384 entries[used_entries].atime = atime;
387 if (errno) diee("pre-cleanup: read run dir (%s)", run_base);
389 // First we dedupe (after sorting by path)
390 qsort(entries, used_entries, sizeof(PrecleanEntry),
391 preclean_entry_compar_name);
392 PrecleanEntry *p, *q;
393 for (p=entries, q=entries; p < entries + used_entries; p++) {
394 if (q > entries && !strcmp(p->name_hash, (q-1)->name_hash))
398 used_entries = q - entries;
400 // Now maybe delete some things
402 // Actually this has an off-by-one error since we are about
403 // to create a socket, so the actual number of sockets is one more.
404 // But, *actually*, since there might be multiple of us running at once,
405 // we might have even more than that. This doesn't really matter.
406 if (used_entries > max_sockets) {
407 qsort(entries, used_entries, sizeof(PrecleanEntry),
408 preclean_entry_compar_atime);
409 for (p=entries; p < entries + max_sockets; p++) {
410 char *l_path = m_asprintf("%s/l%s", run_base, p->name_hash);
411 char *s_path = m_asprintf("%s/s%s", run_base, p->name_hash);
412 int lock_fd = flock_file(l_path);
413 // Recheck atime - we might have raced!
414 time_t atime = preclean_stat_atime(s_path);
415 if (atime != p->atime) {
416 // Raced. This will leave us deleting too few things. Whatever.
418 int r= unlink(s_path);
419 if (r && errno!=ENOENT) diee("preclean: delete stale (%s)", s_path);
421 if (r) diee("preclean: delete stale lock (%s)", s_path);
422 // NB we don't hold the lock any more now.
430 for (p=entries; p < entries + used_entries; p++)
// Fatal-error helper for data that is too long to encode; reports via
// die() and is declared noreturn.
// NOTE(review): the visible caller (prepare_length) rejects any length
// greater than UINT32_MAX, i.e. >= 2^32, so ">2^32" in the message is
// slightly imprecise.
435 static __attribute((noreturn)) void die_data_overflow(void) {
436 die("cannot handle data with length >2^32");
439 static void prepare_data(size_t *len, char **buf,
440 const void *data, size_t dl) {
442 if (dl >= SIZE_MAX - *len)
447 memcpy(*buf, data, dl);
// Appends a 32-bit length field to the message being prepared.
//
// dl_sz is the length to encode.  Values above UINT32_MAX are rejected
// via die_data_overflow().  The value is converted to network byte order
// with htonl() and handed to prepare_data() as sizeof(uint32_t) raw bytes.
// len and buf are passed through to prepare_data() unchanged.
452 static void prepare_length(size_t *len, char **buf, size_t dl_sz) {
453 if (dl_sz > UINT32_MAX) die_data_overflow();
454 uint32_t dl = htonl(dl_sz);
455 prepare_data(len, buf, &dl, sizeof(dl));
// Appends the NUL-terminated string s to the message being prepared,
// including its terminating NUL (sl+1 bytes), via prepare_data().
// len and buf are passed through to prepare_data() unchanged.
458 static void prepare_string(size_t *len, char **buf, const char *s) {
459 size_t sl = strlen(s);
460 prepare_data(len, buf, s, sl+1);
463 static void prepare_message(size_t *len, char **buf) {
466 const char *const *p = (void*)environ;
469 prepare_string(len, buf, s);
472 prepare_string(len, buf, "");
476 prepare_string(len, buf, s);
479 static void send_fd(int payload_fd) {
480 int via_fd = fileno(call_sock);
483 struct cmsghdr align;
484 char buf[CMSG_SPACE(sizeof(payload_fd))];
494 iov.iov_base = &dummy_byte;
500 msg.msg_control = cmsg_buf.buf;
501 msg.msg_controllen = sizeof(cmsg_buf.buf);
503 struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
504 cmsg->cmsg_level = SOL_SOCKET;
505 cmsg->cmsg_type = SCM_RIGHTS;
506 cmsg->cmsg_len = CMSG_LEN(sizeof(payload_fd));
507 *(int*)CMSG_DATA(cmsg) = payload_fd;
509 msg.msg_controllen = sizeof(cmsg_buf.buf);
512 ssize_t r = sendmsg(via_fd, &msg, 0);
514 if (errno == EINTR) continue;
522 static void send_request(void) {
523 // Sending these first makes it easier for the script to
524 // use buffered IO for the message.
530 prepare_message(&len, 0);
532 size_t tlen = len + 4;
533 char *m = xmalloc(tlen);
535 prepare_length(0, &p, len);
536 prepare_message(0, &p);
537 assert(p == m + tlen);
539 ssize_t sr = fwrite(m, tlen, 1, call_sock);
540 if (sr != 1) diee("write request (buffer)");
542 if (fflush(call_sock)) diee("write request");
545 static FILE *call_sock_from_fd(int fd) {
548 FILE *call_sock = fdopen(fd, "r+");
549 if (!call_sock) diee("fdopen socket");
551 r = setvbuf(call_sock, 0, _IONBF, 0);
552 if (r) die("setvbuf socket");
// Returns true if the stream's EOF indicator is set, or if errno is
// currently ECONNRESET (a connection reset is treated the same as EOF).
// Because it inspects errno, the result is only meaningful if errno still
// holds the value left by the stdio call being diagnosed.
// Note: the parameter shadows the file-scope call_sock variable.
557 static bool was_eof(FILE *call_sock) {
558 return feof(call_sock) || errno==ECONNRESET;
// Attempts to read one sz-byte item from the file-scope call_sock stream
// into data, using fread().
// On the failure path visible here: returns -1 if was_eof() reports
// EOF/ECONNRESET, otherwise dies via diee().
// (The success path and the test on sr are not visible in this excerpt.)
562 static int protocol_read_maybe(void *data, size_t sz) {
564 size_t sr = fread(data, sz, 1, call_sock);
566 if (was_eof(call_sock)) return -1;
// NOTE(review): sz is a size_t but is formatted with %zd (the signed
// conversion); %zu would match the type.  The message also says "read()"
// although the call above is fread().
567 diee("read() on monitor call socket (%zd)", sz);
// Mandatory variant of protocol_read_maybe(): reads one sz-byte item into
// data, and treats a negative return (EOF or connection reset on the call
// socket) as fatal, dying with "monitor process quit unexpectedly".
572 static void protocol_read(void *data, size_t sz) {
573 if (protocol_read_maybe(data, sz) < 0)
574 die("monitor process quit unexpectedly");
577 // Returns 0 if OK, error msg if peer was garbage.
// Reads the greeting from the call socket: first sizeof(header_magic)
// bytes of magic, then a length word followed by that many bytes of xdata.
// If the peer goes away before the magic has been read, returns a static
// error message string.
// NOTE(review): a magic mismatch does not return an error message; it
// calls die().  Only EOF while reading the magic takes the "return
// message" path seen here.
578 static const char *read_greeting(void) {
579 char got_magic[sizeof(header_magic)];
581 if (protocol_read_maybe(&got_magic, sizeof(got_magic)) < 0)
582 return "initial monitor process quit";
584 if (memcmp(got_magic, header_magic, sizeof(header_magic)))
// NOTE(review): got_magic elements are plain char; if char is signed and
// a byte is >= 0x80 the %02x output will be sign-extended.
585 die("got unexpected protocol magic 0x%02x%02x%02x%02x",
586 got_magic[0], got_magic[1], got_magic[2], got_magic[3]);
// NOTE(review): xdata_len is used as read from the wire, with no byte
// order conversion visible between the read and its use, whereas main()
// applies ntohl() to the status word.  Confirm what byte order the peer
// sends the xdata length in.
589 protocol_read(&xdata_len, sizeof(xdata_len));
590 void *xdata = xmalloc(xdata_len);
591 protocol_read(xdata, xdata_len);
596 // Returns: call(client-end), or 0 to mean "is garbage"
597 // find_socket_path must have been called
598 static FILE *connect_existing(void) {
602 fd = socket(AF_UNIX, SOCK_STREAM, 0);
603 if (fd==-1) diee("socket() for client");
605 socklen_t salen = sizeof(sockaddr_sun);
606 r = connect(fd, (const struct sockaddr*)&sockaddr_sun, salen);
608 if (errno==ECONNREFUSED || errno==ENOENT) goto x_garbage;
609 diee("connect() %s", socket_path);
612 call_sock = call_sock_from_fd(fd);
621 if (call_sock) { fclose(call_sock); call_sock=0; }
622 if (fd >= 0) close(fd);
626 static void watcher_cb_stdin(uv_poll_t *handle, int status, int events) {
630 if ((errno = -status)) diee("watcher: poll stdin");
634 if (!(errno==EINTR || errno==EWOULDBLOCK || errno==EAGAIN))
635 diee("watcher: read sentinel stdin");
// libuv fs-event callback for the watch on the socket path (registered in
// become_watcher() via uv_fs_event_start).
// A nonzero status is treated as fatal.  Otherwise the socket path is
// stat()ed: ENOENT makes the watcher _exit(0), EINTR retries, any other
// error is fatal.  The result is then compared with initial_stab using
// stabs_same_inode(); the action taken on a mismatch is outside this
// excerpt (the file header says "when socket stat changes, quit").
639 static void watcher_cb_sockpath(uv_fs_event_t *handle, const char *filename,
640 int events, int status) {
642 struct stat now_stab;
// NOTE(review): this message says "poll stdin" but this is the socket
// path fs-event callback - looks copy-pasted from watcher_cb_stdin.
644 if ((errno = -status)) diee("watcher: poll stdin");
646 r= stat(socket_path, &now_stab);
// Socket path has gone away: nothing left to watch.
648 if (errno==ENOENT) _exit(0);
649 if (errno==EINTR) continue;
650 diee("stat socket: %s", socket_path);
652 if (!stabs_same_inode(&now_stab, &initial_stab))
657 // On entry, stderr is still inherited, but 0 and 1 are the pipes
658 static __attribute__((noreturn))
659 void become_watcher(void) {
661 uv_poll_t uvhandle_stdin;
662 uv_fs_event_t uvhandle_sockpath;
667 errno= -uv_loop_init(&loop);
668 if (errno) diee("watcher: uv_loop_init");
670 errno= -uv_poll_init(&loop, &uvhandle_stdin, 0);
671 if (errno) diee("watcher: uv_poll_init");
672 errno= -uv_poll_start(&uvhandle_stdin,
673 UV_READABLE | UV_WRITABLE | UV_DISCONNECT,
675 if (errno) diee("watcher: uv_poll_start");
677 errno= -uv_fs_event_init(&loop, &uvhandle_sockpath);
678 if (errno) diee("watcher: uv_fs_event_init");
680 errno= -uv_fs_event_start(&uvhandle_sockpath, watcher_cb_sockpath,
682 if (errno) diee("watcher: uv_fs_event_start");
684 // OK everything is set up, let us daemonise
685 if (dup2(1,2) != 2) diee("watcher: set daemonised stderr");
686 r= setvbuf(stderr, 0, _IOLBF, BUFSIZ);
687 if (r) diee("watcher: setvbuf stderr");
689 pid_t child = fork();
690 if (child == (pid_t)-1) diee("watcher: fork");
693 if (setsid() == (pid_t)-1) diee("watcher: setsid");
695 r= uv_run(&loop, UV_RUN_DEFAULT);
696 die("uv_run returned (%d)", r);
699 static __attribute__((noreturn))
700 void become_setup(int sfd, int lockfd, int fake_pair[2],
701 int watcher_stdin, int watcher_stderr) {
704 int call_fd = fake_pair[1];
706 int null_0 = open("/dev/null", O_RDONLY); if (null_0 < 0) diee("open null");
707 if (dup2(null_0, 0)) diee("dup2 /dev/null onto stdin");
709 if (dup2(2, 1) != 1) die("dup2 stderr onto stdout");
713 // Extension could work like this:
715 // We advertise a new protocol (perhaps one which is nearly entirely
716 // different after the connect) by putting a name for it comma-separated
717 // next to "v1". Simple extension can be done by having the script
718 // side say something about it in the ack xdata, which we currently ignore.
719 putenv(m_asprintf("PREFORK_INTERP=v1 %d,%d,%d,%d",
720 sfd, call_fd, watcher_stdin, watcher_stderr));
722 execvp(executor_argv[0], (char**)executor_argv);
723 diee("execute %s", executor_argv[0]);
726 static void connect_or_spawn(void) {
729 call_sock = connect_existing();
730 if (call_sock) return;
732 // We're going to make a new one, so clean out old ones
735 int lockfd = acquire_lock();
736 call_sock = connect_existing();
737 if (call_sock) { close(lockfd); return; }
739 // We must start a fresh one, and we hold the lock
741 r = unlink(socket_path);
742 if (r<0 && errno!=ENOENT)
743 diee("failed to remove stale socket %s", socket_path);
745 int sfd = socket(AF_UNIX, SOCK_STREAM, 0);
746 if (sfd<0) diee("socket() for new listener");
748 socklen_t salen = sizeof(sockaddr_sun);
749 r= bind(sfd, (const struct sockaddr*)&sockaddr_sun, salen);
750 if (r<0) diee("bind() on new listener");
752 r= stat(socket_path, &initial_stab);
753 if (r<0) diee("stat() fresh socket");
755 // We never want callers to get ECONNREFUSED. But:
756 // There is a race here: from my RTFM they may get ECONNREFUSED
757 // if they try between our bind() and listen(). But if they do, they'll
758 // acquire the lock (serialising with us) and retry, and then it will work.
759 r = listen(sfd, INT_MAX);
760 if (r<0) diee("listen() for new listener");
764 int watcher_stdin[2];
765 int watcher_stderr[2];
766 if (pipe(watcher_stdin) || pipe(watcher_stderr))
767 diee("pipe() for socket inode watcher");
769 pid_t watcher = fork();
770 if (watcher == (pid_t)-1) diee("fork for watcher");
774 close(watcher_stdin[1]);
775 close(watcher_stderr[0]);
776 if (dup2(watcher_stdin[0], 0) != 0 ||
777 dup2(watcher_stderr[1], 1) != 1)
778 diee("initial dup2() for watcher");
779 close(watcher_stdin[0]);
780 close(watcher_stderr[1]);
784 close(watcher_stdin[0]);
785 close(watcher_stderr[1]);
786 nonblock(watcher_stderr[0]);
791 r = socketpair(AF_UNIX, SOCK_STREAM, 0, fake_pair);
792 if (r<0) diee("socketpair() for fake initial connection");
794 pid_t setup_pid = fork();
795 if (setup_pid == (pid_t)-1) diee("fork for spawn setup");
796 if (!setup_pid) become_setup(sfd, lockfd, fake_pair,
797 watcher_stdin[1], watcher_stderr[0]);
801 call_sock = call_sock_from_fd(fake_pair[0]);
804 pid_t got = waitpid(setup_pid, &status, 0);
805 if (got == (pid_t)-1) diee("waitpid setup [%ld]", (long)setup_pid);
806 if (got != setup_pid) diee("waitpid setup [%ld] gave [%ld]!",
807 (long)setup_pid, (long)got);
808 if (status != 0) propagate_exit_status(status, "setup");
810 const char *emsg = read_greeting();
811 if (emsg) die("setup failed: %s", emsg);
817 static void make_executor_argv(const char *const *argv) {
818 switch (laundering) {
820 default: die("need -U (specifying unlaundered argument handling)");
824 #define EACH_NEW_ARG(EACH) { \
825 arg = interp; { EACH } \
826 if ((arg = script)) { EACH } \
827 const char *const *walk = argv; \
828 while ((arg = *walk++)) { EACH } \
832 EACH_NEW_ARG( (void)arg; count++; );
834 const char **out = calloc(count, sizeof(char*));
835 executor_argv = (const char* const*)out;
836 if (!executor_argv) diee("allocate for arguments");
838 EACH_NEW_ARG( *out++ = arg; );
842 int main(int argc_unused, const char *const *argv) {
849 // which ought to be passed on to the actual executor.
850 make_executor_argv(argv);
853 FILLZERO(sockaddr_sun);
854 sockaddr_sun.sun_family = AF_UNIX;
855 assert(strlen(socket_path) <= sizeof(sockaddr_sun.sun_path));
856 strncpy(sockaddr_sun.sun_path, socket_path, sizeof(sockaddr_sun.sun_path));
860 // We're committed now, send the request (or bail out)
864 protocol_read(&status, sizeof(status));
866 status = ntohl(status);
867 if (status > INT_MAX) die("status 0x%lx does not fit in an int",
868 (unsigned long)status);
870 propagate_exit_status(status, "invocation");