2 * "Interpreter" that you can put in #! like this
3 * #!/usr/bin/prefork-interp [<options>] <interpreter>
6 * prefork-interp [<option> ..] <interpreter> [<script> [<args> ...]]
7 * prefork-interp [<option>,..],<interpreter> <script> [<args> ...]
8 * prefork-interp '[<option> ..] <interpreter>' <script> [<args> ...]
10 * Options must specify argument laundering mode.
11 * Currently the only mode supported is:
12 * -U unlaundered: setup and executor both get all arguments and env vars
13 * ident covers only env vars specified with -E
14 * ident covers only arguments interpreter and (if present) script
18 ***************************************************************************
20 State during service execution, process parentage and key fds
25 || listen watch-err/in
26 || call (accept) \ ,------2
27 || ,-----------------------------. SERVER -----0 WATCHER(C)
28 CLIENT 2--=fdpassed>=---------. \ || && | &&
29 (C) 1--=fdpassed>=---------. \ \ || inotify
30 0--=fdpassed>=---------. \ \ \ || sockpath
48 && session leader (daemon)
49 & process group leader
51 ***************************************************************************
53 Control flow and causality
61 attempt to connect, and read greeting
64 tidy up stale /run entries *1 (continue from send_fds, below)
67 retry attempt to connect, and read greeting
70 create listening socket release lock
73 | `------------------.
76 make "fake" initial call socketpair (C)
78 fork/exec #########################################################
79 | `-------------. application
82 | # script initialisation
84 | # ########|#############################################
86 | # identify fds from environment (Perl)
91 waitpid # fork for initial service
94 | # | SCRIPT [server] &&
96 | # | ** accept / event loop **
98 | # | / \ watch\ \idle
99 | # | fork child \stderr\ \timeout?
100 | # | _________/ | | |
102 | # SCRIPT [monitor] | eof?| |
105 read ,....<.....send greeting | | |
106 greeting # | ___________________
114 | # fork for executor (Perl)
115 | # |parent? \child? prefork-interp
116 | # | ######\############################
117 | # | # SCRIPT (executor) application
118 | # | # execute service
119 | # wait for read # |
120 | # (select) # terminates
122 | # | # kernel closes execterm
123 | # | ,......<....../|
126 | # | | ,......<...../
127 | # waitpid # _______________
129 | ,....<....,..send status #
130 read status # ________________ #
134 ********** Or, if client is killed **********
136 | # | # execute service
137 terminates # wait for read # |
142 _____________ # \|call? # |
144 # kill whole pgrp... # killed
146 # | | ,......<....../
147 # waitpid # _______________
150 # _____SIGPIPE______ #
152 | - \ / process control flow
153 ... < > causes mediated by fds or other IPC etc.
154 && session leader (daemon)
155 & process group leader
156 # language/implementation boundary
157 *1 line continued elsewhere
159 ______ process termination (after reaping, if shown)
161 ***************************************************************************
163 Sequence of events and fd plumbing.
164 NB INCOMPLETE - does not cover execterm, cleanup
166 client (C wrapper) connects to server
167 (including reading ack byte)
169 === acquires lock ===
170 makes new listening socket
172 forks watcher and awaits
173 makes first-instance socketpair
174 forks setup (script, sock fds indicated in env)
175 fd0, fd1, fd2: from-outer
176 other fd: call(client-end)(fake)
177 reaps setup (and reports error)
178 (implicitly releases lock)
180 watcher fd[012]: watcher pipes
181 starts watch on socket path
182 sets stderr to line buffered
183 sets stdin to nonblocking
184 daemonises (one fork, becomes session leader)
185 when socket stat changes, quit
187 setup (pre-exec) fd0: null,
188 fd[12]: fd2-from-outer
189 env fds: listener, call(server-end)(fake),
190 watcher read, watcher write
192 possibly clean env, argv
194 setup (script) runs initialisation parts of the script
195 at prefork establishment point:
196 setup (pm) [1] opens syslog
200 server (pm) [1] [fd0: null],
201 [fd[12]: fd2-from-outer]
203 right away, forks init monitor
204 [2] closes outer caller fds and call(fake)
205 [server (pm)] fd[012]: null
206 other fds: listener, syslog
207 runs in loop accepting and forking,
208 reaping and limiting children (incl init monitor)
209 reports failures of monitors to syslog
211 [client (C wrapper)] if client connect succeeds:
212 now fd: call(client-end)
213 sends message with: cmdline, env
216 [server (script)] accepts, forks subseq monitor
218 monitor [1] [fd0: null]
219 (init [fd[12]: init: fd2-from-outer; subseq: null]
220 or errors: init: fd2; subseq: syslog
221 subseq) other fds: syslog, call(server-end)
223 receives args, env, fds
226 executor sorts out fds:
227 fd0, fd1, fd2: from-outer
228 close fds: call(server-end)
232 runs main part of script
235 [monitor] [fd[012]: null]
236 [fd[12]: init: fd2-from-outer; subseq: null]
237 [errors: init: fd2; subseq: syslog]
239 reports status via socket
241 [client (C wrapper)] [fd0, fd1, fd2: from-outer]
242 [other fd: call(client-end)]
243 receives status, exits appropriately
244 (if was bad signal, reports to stderr, exits 127)
246 ***************************************************************************
250 #include <arpa/inet.h>
256 const char our_name[] = "prefork-interp";
258 static struct sockaddr_un sockaddr_sun;
259 static FILE *call_sock;
261 #define ACK_BYTE '\n'
263 static const char *const *executor_argv;
265 static const char header_magic[4] = "PFI\n";
// Write the one-line usage summary for prefork-interp to the stream f.
// The text shows the #! form of invocation.
267 void fusagemessage(FILE *f) {
268 fprintf(f, "usage: #!/usr/bin/prefork-interp [<options>]\n");
271 static int laundering;
272 static int max_sockets = 100; // maximum entries in the run dir is 2x this
274 static struct stat initial_stab;
276 const struct cmdinfo cmdinfos[]= {
278 { 0, 'U', 0, .iassignto= &laundering, .arg= 'U' },
// Feed the initial data into the identsc SHA-256 context: a single
// zero byte.
// NOTE(review): presumably this byte acts as a format marker for the
// ident hash, so that changing it invalidates old idents - confirm.
282 void ident_addinit(void) {
283 char ident_magic[1] = { 0 };
284 sha256_update(&identsc, sizeof(ident_magic), ident_magic);
287 static void propagate_exit_status(int status, const char *what) {
290 if (WIFEXITED(status)) {
294 if (WIFSIGNALED(status)) {
295 int sig = WTERMSIG(status);
296 const char *signame = strsignal(sig);
297 if (signame == 0) signame = "unknown signal";
299 if (! WCOREDUMP(status) &&
306 sa.sa_handler = SIG_DFL;
307 r = sigaction(sig, &sa, 0);
308 if (r) diee("failed to reset signal handler while propagating %s",
313 sigaddset(&sset, sig);
314 r = sigprocmask(SIG_UNBLOCK, &sset, 0);
315 if (r) diee("failed to reset signal block while propagating %s",
319 die("unexpectedly kept running after raising (to propagate) %s",
323 die("%s failed due to signal %d %s%s", what, sig, signame,
324 WCOREDUMP(status) ? " (core dumped)" : "");
327 die("%s failed with weird wait status %d 0x%x", what, status, status);
// qsort() comparator: orders two PrecleanEntry values by their
// name_hash strings, using strcmp.  preclean() sorts with this so that
// entries with the same name_hash end up adjacent for deduplication.
//   av, bv: pointers to the PrecleanEntry elements being compared
//   returns: <0, 0 or >0, as strcmp does
335 static int preclean_entry_compar_name(const void *av, const void *bv) {
336 const PrecleanEntry *a = av;
337 const PrecleanEntry *b = bv;
338 return strcmp(a->name_hash, b->name_hash);
341 static int preclean_entry_compar_atime(const void *av, const void *bv) {
342 const PrecleanEntry *ae = av; time_t a = ae->atime;
343 const PrecleanEntry *be = bv; time_t b = be->atime;
348 static time_t preclean_stat_atime(const char *s_path) {
350 int r= lstat(s_path, &stab);
352 if (errno!=ENOENT) diee("pre-cleanup: stat socket (%s)", s_path);
355 return stab.st_atime;
358 static void preclean(void) {
359 DIR *dir = opendir(run_base);
361 if (errno == ENOENT) return;
362 diee("pre-cleanup: open run dir (%s)", run_base);
365 PrecleanEntry *entries=0;
366 size_t avail_entries=0;
367 size_t used_entries=0;
370 while ((errno = 0, de = readdir(dir))) {
371 char c0 = de->d_name[0];
372 if (!(c0 == 'l' || c0 == 's')) continue;
373 char *name_hash = m_asprintf("%s", de->d_name+1);
374 char *s_path = m_asprintf("%s/s%s", run_base, name_hash);
375 time_t atime = preclean_stat_atime(s_path);
377 if (avail_entries == used_entries) {
378 assert(avail_entries < INT_MAX / 4 / sizeof(PrecleanEntry));
381 entries = realloc(entries, avail_entries * sizeof(PrecleanEntry));
383 entries[used_entries].name_hash = name_hash;
384 entries[used_entries].atime = atime;
387 if (errno) diee("pre-cleanup: read run dir (%s)", run_base);
389 // First we dedupe (after sorting by path)
390 qsort(entries, used_entries, sizeof(PrecleanEntry),
391 preclean_entry_compar_name);
392 PrecleanEntry *p, *q;
393 for (p=entries, q=entries; p < entries + used_entries; p++) {
394 if (q > entries && !strcmp(p->name_hash, (q-1)->name_hash))
398 used_entries = q - entries;
400 // Now maybe delete some things
402 // Actually this has an off-by-one error since we are about
403 // to create a socket, so the actual number of sockets is one more.
404 // But, *actually*, since there might be multiple of us running at once,
405 // we might have even more than that. This doesn't really matter.
406 if (used_entries > max_sockets) {
407 qsort(entries, used_entries, sizeof(PrecleanEntry),
408 preclean_entry_compar_atime);
409 for (p=entries; p < entries + max_sockets; p++) {
410 char *l_path = m_asprintf("%s/l%s", run_base, p->name_hash);
411 char *s_path = m_asprintf("%s/s%s", run_base, p->name_hash);
412 int lock_fd = flock_file(l_path);
413 // Recheck atime - we might have raced!
414 time_t atime = preclean_stat_atime(s_path);
415 if (atime != p->atime) {
416 // Raced. This will leave us deleting too few things. Whatever.
418 int r= unlink(s_path);
419 if (r && errno!=ENOENT) diee("preclean: delete stale (%s)", s_path);
421 if (r) diee("preclean: delete stale lock (%s)", s_path);
422 // NB we don't hold the lock any more now.
430 for (p=entries; p < entries + used_entries; p++)
// Fatal-error helper for data too long to be described by the
// protocol's length field (see the UINT32_MAX check in prepare_length).
// Declared noreturn; it reports the problem via die().
435 static __attribute((noreturn)) void die_data_overflow(void) {
436 die("cannot handle data with length >2^32");
439 static void prepare_data(size_t *len, char **buf,
440 const void *data, size_t dl) {
442 if (dl >= SIZE_MAX - *len)
447 memcpy(*buf, data, dl);
// Emit a length field: dl_sz encoded as a 32-bit integer in network
// byte order, passed on to prepare_data as sizeof(uint32_t) bytes.
//   len, buf: forwarded unchanged to prepare_data
//             (callers pass 0 for one or the other - presumably
//             prepare_data tolerates that; verify there)
//   dl_sz:    the length to encode; values above UINT32_MAX are fatal
452 static void prepare_length(size_t *len, char **buf, size_t dl_sz) {
453 if (dl_sz > UINT32_MAX) die_data_overflow();
454 uint32_t dl = htonl(dl_sz);
455 prepare_data(len, buf, &dl, sizeof(dl));
// Emit the string s, including its terminating NUL byte, by passing
// strlen(s)+1 bytes to prepare_data.
//   len, buf: forwarded unchanged to prepare_data
//   s:        NUL-terminated string to emit (must not be a null pointer,
//             since strlen is applied to it)
458 static void prepare_string(size_t *len, char **buf, const char *s) {
459 size_t sl = strlen(s);
460 prepare_data(len, buf, s, sl+1);
463 static void prepare_message(size_t *len, char **buf) {
466 const char *const *p = (void*)environ;
469 prepare_string(len, buf, s);
472 prepare_string(len, buf, "");
476 prepare_string(len, buf, s);
479 static void send_fd(int payload_fd) {
480 int via_fd = fileno(call_sock);
483 struct cmsghdr align;
484 char buf[CMSG_SPACE(sizeof(payload_fd))];
494 iov.iov_base = &dummy_byte;
500 msg.msg_control = cmsg_buf.buf;
501 msg.msg_controllen = sizeof(cmsg_buf.buf);
503 struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
504 cmsg->cmsg_level = SOL_SOCKET;
505 cmsg->cmsg_type = SCM_RIGHTS;
506 cmsg->cmsg_len = CMSG_LEN(sizeof(payload_fd));
507 *(int*)CMSG_DATA(cmsg) = payload_fd;
509 msg.msg_controllen = sizeof(cmsg_buf.buf);
512 ssize_t r = sendmsg(via_fd, &msg, 0);
514 if (errno == EINTR) continue;
522 static void send_request(void) {
523 // Sending these first makes it easier for the script to
524 // use buffered IO for the message.
530 prepare_message(&len, 0);
532 size_t tlen = len + 4;
533 char *m = xmalloc(tlen);
535 prepare_length(0, &p, len);
536 prepare_message(0, &p);
537 assert(p == m + tlen);
539 ssize_t sr = fwrite(m, tlen, 1, call_sock);
540 if (sr != 1) diee("write request (buffer)");
542 if (fflush(call_sock)) diee("write request");
545 static FILE *call_sock_from_fd(int fd) {
548 FILE *call_sock = fdopen(fd, "r+");
549 if (!call_sock) diee("fdopen socket");
551 r = setvbuf(call_sock, 0, _IONBF, 0);
552 if (r) die("setvbuf socket");
// Decide whether a failed read on call_sock should be treated as
// end-of-file: true if the stream's EOF indicator is set, or if errno
// is currently ECONNRESET.  errno is consulted as-is, so the result is
// only meaningful immediately after the failing read.
// The parameter shadows the file-level variable of the same name.
557 static bool was_eof(FILE *call_sock) {
558 return feof(call_sock) || errno==ECONNRESET;
562 static int protocol_read_maybe(void *data, size_t sz) {
564 size_t sr = fread(data, sz, 1, call_sock);
566 if (was_eof(call_sock)) return -1;
567 diee("read() on monitor call socket (%zd)", sz);
// Read sz bytes from the call socket into data, treating end-of-file
// as fatal.  This wraps protocol_read_maybe: a negative return from it
// (its EOF indication) makes us die with a message that the monitor
// process quit.
//   data: destination buffer, at least sz bytes
//   sz:   number of bytes to read
572 static void protocol_read(void *data, size_t sz) {
573 if (protocol_read_maybe(data, sz) < 0)
574 die("monitor process quit unexpectedly");
577 // Returns 0 if OK, error msg if peer was garbage.
578 static const char *read_greeting(void) {
579 char got_magic[sizeof(header_magic)];
581 if (protocol_read_maybe(&got_magic, sizeof(got_magic)) < 0)
582 return "initial monitor process quit";
584 if (memcmp(got_magic, header_magic, sizeof(header_magic)))
585 die("got unexpected protocol magic 0x%02x%02x%02x%02x",
586 got_magic[0], got_magic[1], got_magic[2], got_magic[3]);
589 protocol_read(&xdata_len, sizeof(xdata_len));
590 void *xdata = xmalloc(xdata_len);
591 protocol_read(xdata, xdata_len);
596 // Returns: call(client-end), or 0 to mean "is garbage"
597 // find_socket_path must have been called
598 static FILE *connect_existing(void) {
602 fd = socket(AF_UNIX, SOCK_STREAM, 0);
603 if (fd==-1) diee("socket() for client");
605 socklen_t salen = sizeof(sockaddr_sun);
606 r = connect(fd, (const struct sockaddr*)&sockaddr_sun, salen);
608 if (errno==ECONNREFUSED || errno==ENOENT) goto x_garbage;
609 diee("connect() %s", socket_path);
612 call_sock = call_sock_from_fd(fd);
621 if (call_sock) { fclose(call_sock); call_sock=0; }
622 if (fd >= 0) close(fd);
626 static void watcher_cb_stdin(uv_poll_t *handle, int status, int events) {
630 if ((errno = -status)) diee("watcher: poll stdin");
634 if (!(errno==EINTR || errno==EWOULDBLOCK || errno==EAGAIN))
635 diee("watcher: read sentinel stdin");
639 static void watcher_cb_sockpath(uv_fs_event_t *handle, const char *filename,
640 int events, int status) {
642 struct stat now_stab;
644 if ((errno = -status)) diee("watcher: poll stdin");
646 r= stat(socket_path, &now_stab);
648 if (errno==ENOENT) _exit(0);
649 if (errno==EINTR) continue;
650 diee("stat socket: %s", socket_path);
652 if (!stabs_same_inode(&now_stab, &initial_stab))
657 // On entry, stderr is still inherited, but 0 and 1 are the pipes
658 static __attribute__((noreturn))
659 void become_watcher(void) {
661 uv_poll_t uvhandle_stdin;
662 uv_fs_event_t uvhandle_sockpath;
667 errno= -uv_loop_init(&loop);
668 if (errno) diee("watcher: uv_loop_init");
670 errno= -uv_poll_init(&loop, &uvhandle_stdin, 0);
671 if (errno) diee("watcher: uv_poll_init");
672 errno= -uv_poll_start(&uvhandle_stdin,
673 UV_READABLE | UV_WRITABLE | UV_DISCONNECT,
675 if (errno) diee("watcher: uv_poll_start");
677 errno= -uv_fs_event_init(&loop, &uvhandle_sockpath);
678 if (errno) diee("watcher: uv_fs_event_init");
680 errno= -uv_fs_event_start(&uvhandle_sockpath, watcher_cb_sockpath,
682 if (errno) diee("watcher: uv_fs_event_start");
684 // OK everything is set up, let us daemonise
685 if (dup2(1,2) != 2) diee("watcher: set daemonised stderr");
686 r= setvbuf(stderr, 0, _IOLBF, BUFSIZ);
687 if (r) diee("watcher: setvbuf stderr");
689 pid_t child = fork();
690 if (child == (pid_t)-1) diee("watcher: fork");
693 if (setsid() == (pid_t)-1) diee("watcher: setsid");
695 r= uv_run(&loop, UV_RUN_DEFAULT);
696 die("uv_run returned (%d)", r);
699 static __attribute__((noreturn))
700 void become_setup(int sfd, int fake_pair[2],
701 int watcher_stdin, int watcher_stderr) {
703 int call_fd = fake_pair[1];
705 int null_0 = open("/dev/null", O_RDONLY); if (null_0 < 0) diee("open null");
706 if (dup2(null_0, 0)) diee("dup2 /dev/null onto stdin");
708 if (dup2(2, 1) != 1) die("dup2 stderr onto stdout");
712 // Extension could work like this:
714 // We advertise a new protocol (perhaps one which is nearly entirely
715 // different after the connect) by putting a name for it comma-separated
716 // next to "v1". Simple extension can be done by having the script
717 // side say something about it in the ack xdata, which we currently ignore.
718 putenv(m_asprintf("PREFORK_INTERP=v1 %d,%d,%d,%d",
719 sfd, call_fd, watcher_stdin, watcher_stderr));
721 execvp(executor_argv[0], (char**)executor_argv);
722 diee("execute %s", executor_argv[0]);
725 static void connect_or_spawn(void) {
728 call_sock = connect_existing();
729 if (call_sock) return;
731 // We're going to make a new one, so clean out old ones
734 int lockfd = acquire_lock();
735 call_sock = connect_existing();
736 if (call_sock) { close(lockfd); return; }
738 // We must start a fresh one, and we hold the lock
740 r = unlink(socket_path);
741 if (r<0 && errno!=ENOENT)
742 diee("failed to remove stale socket %s", socket_path);
744 int sfd = socket(AF_UNIX, SOCK_STREAM, 0);
745 if (sfd<0) diee("socket() for new listener");
747 socklen_t salen = sizeof(sockaddr_sun);
748 r= bind(sfd, (const struct sockaddr*)&sockaddr_sun, salen);
749 if (r<0) diee("bind() on new listener");
751 r= stat(socket_path, &initial_stab);
752 if (r<0) diee("stat() fresh socket");
754 // We never want callers to get ECONNREFUSED. But:
755 // There is a race here: from my RTFM they may get ECONNREFUSED
756 // if they try between our bind() and listen(). But if they do, they'll
757 // acquire the lock (serialising with us) and retry, and then it will work.
758 r = listen(sfd, INT_MAX);
759 if (r<0) diee("listen() for new listener");
763 int watcher_stdin[2];
764 int watcher_stderr[2];
765 if (pipe(watcher_stdin) || pipe(watcher_stderr))
766 diee("pipe() for socket inode watcher");
768 pid_t watcher = fork();
769 if (watcher == (pid_t)-1) diee("fork for watcher");
773 close(watcher_stdin[1]);
774 close(watcher_stderr[0]);
775 if (dup2(watcher_stdin[0], 0) != 0 ||
776 dup2(watcher_stderr[1], 1) != 1)
777 diee("initial dup2() for watcher");
778 close(watcher_stdin[0]);
779 close(watcher_stderr[1]);
783 close(watcher_stdin[0]);
784 close(watcher_stderr[1]);
785 nonblock(watcher_stderr[0]);
790 r = socketpair(AF_UNIX, SOCK_STREAM, 0, fake_pair);
791 if (r<0) diee("socketpair() for fake initial connection");
793 pid_t setup_pid = fork();
794 if (setup_pid == (pid_t)-1) diee("fork for spawn setup");
795 if (!setup_pid) become_setup(sfd, fake_pair,
796 watcher_stdin[1], watcher_stderr[0]);
800 call_sock = call_sock_from_fd(fake_pair[0]);
803 pid_t got = waitpid(setup_pid, &status, 0);
804 if (got == (pid_t)-1) diee("waitpid setup [%ld]", (long)setup_pid);
805 if (got != setup_pid) diee("waitpid setup [%ld] gave [%ld]!",
806 (long)setup_pid, (long)got);
807 if (status != 0) propagate_exit_status(status, "setup");
809 const char *emsg = read_greeting();
810 if (emsg) die("setup failed: %s", emsg);
816 static void make_executor_argv(const char *const *argv) {
817 switch (laundering) {
819 default: die("need -U (specifying unlaundered argument handling)");
823 #define EACH_NEW_ARG(EACH) { \
824 arg = interp; { EACH } \
825 if ((arg = script)) { EACH } \
826 const char *const *walk = argv; \
827 while ((arg = *walk++)) { EACH } \
831 EACH_NEW_ARG( (void)arg; count++; );
833 const char **out = calloc(count, sizeof(char*));
834 executor_argv = (const char* const*)out;
835 if (!executor_argv) diee("allocate for arguments");
837 EACH_NEW_ARG( *out++ = arg; );
841 int main(int argc_unused, const char *const *argv) {
848 // which ought to be passed on to the actual executor.
849 make_executor_argv(argv);
852 FILLZERO(sockaddr_sun);
853 sockaddr_sun.sun_family = AF_UNIX;
854 assert(strlen(socket_path) <= sizeof(sockaddr_sun.sun_path));
855 strncpy(sockaddr_sun.sun_path, socket_path, sizeof(sockaddr_sun.sun_path));
859 // We're committed now, send the request (or bail out)
863 protocol_read(&status, sizeof(status));
865 status = ntohl(status);
866 if (status > INT_MAX) die("status 0x%lx does not fit in an int",
867 (unsigned long)status);
869 propagate_exit_status(status, "invocation");