2 * "Interpreter" that you can put in #! like this
3 * #!/usr/bin/prefork-interp [<options>] <interpreter>
6 * prefork-interp [<option> ..] <interpreter> [<script> [<args> ...]]
7 * prefork-interp [<option>,..],<interpreter> <script> [<args> ...]
8 * prefork-interp '[<option> ..] <interpreter>' <script> [<args> ...]
10 * Options must specify argument laundering mode.
11 * Currently the only mode supported is:
12 * -U unlaundered: setup and executor both get all arguments and env vars
13 * ident covers only env vars specified with -E
14 * ident covers only arguments interpreter and (if present) script
21 State during service execution, process parentage and key fds
27 && session leader (daemon)
28 & process group leader
33 || listen watch-err/in
34 || call (accept) \ ,------2
35 || ,-----------------------------. SERVER -----0 WATCHER(C)
36 CLIENT 2--=fdpassed>=---------. \ || && | &&
37 (C) 1--=fdpassed>=---------. \ \ || inotify
38 0--=fdpassed>=---------. \ \ \ || sockpath
53 Control flow and causality
55 | - \ / process control flow
56 ... causes mediated by fds or other IPC etc.
57 && session leader (daemon)
58 & process group leader
66 attempt to connect, and read greeting
69 tidy up stale /run entries continue from send_fds, below
72 retry attempt to connect, and read greeting
75 create listening socket release lock
77 fork/daemonise continue from send_fds, below
78 | `------------------.
81 make "fake" initial call socketpair
83 fork/exec + + + + + + + + + + + + + + + + + + + + + + + + + + + +
84 | `------. Perl, application
87 | + script initialisation
88 | + | Perl, application
89 | + + + +| + + + + + + + + + + + + + + + + + + + + + + + +
90 | + | Perl, prefork-interp
91 | + identify fds from environment
97 | + fork for initial service
100 | + | SCRIPT [server] &&
102 | + | ** accept / event loop **
104 | + | accepted? \ \ \idle timeout?
106 | + | fork child \ \ \
107 | + | _________/ watch | |watch stderr eof?
109 | + SCRIPT [monitor] | | |
113 | + send greeting | | |
116 release lock | from read greeting success, above
117 | ________ | ________/
122 | + fork for executor
123 | + p |parent \child Perl, prefork-interp
124 | + | + + + + \+ + + + + + + + + + + + + + + +
125 | + | + execute service Perl, application
131 | + | + kernel closes execterm
138 | ,..,..send status +
142 Or, if client is killed
167 * client (C wrapper) connects to server
168 * (including reading ack byte)
169 * if fails or garbage
170 * === acquires lock ===
171 * makes new listening socket
172 * makes watcher pipes
173 * forks watcher and awaits
174 * makes first-instance socketpair
175 * forks setup (script, sock fds indicated in env)
176 * fd0, fd1, fd2: from-outer
177 * other fd: call(client-end)(fake)
178 * reaps setup (and reports error)
179 * (implicitly releases lock)
181 * watcher fd[012]: watcher pipes
182 * starts watch on socket path
183 * sets stderr to line buffered
184 * sets stdin to nonblocking
185 * daemonises (one fork, becomes session leader)
186 * when socket stat changes, quit
188 * setup (pre-exec) fd0: null,
189 * fd[12]: fd2-from-outer
190 * env fds: listener, call(server-end)(fake),
191 * watcher read, watcher write
193 * possibly clean env, argv
195 * setup (script) runs initialisation parts of the script
196 * at prefork establishment point:
197 * setup (pm) [1] opens syslog
201 * server (pm) [1] [fd0: null],
202 * [fd[12]: fd2-from-outer]
204 * right away, forks init monitor
205 * [2] closes outer caller fds and call(fake)
206 * [server (pm)] fd[012]: null
207 * other fds: listener, syslog
208 * runs in loop accepting and forking,
209 * reaping and limiting children (incl init monitor)
210 * reports failures of monitors to syslog
212 * [client (C wrapper)] if client connect succeeds:
213 * now fd: call(client-end)
214 * sends message with: cmdline, env
217 * [server (script)] accepts, forks subseq monitor
219 * monitor [1] [fd0: null]
220 * (init [fd[12]: init: fd2-from-outer; subseq: null]
221 * or errors: init: fd2; subseq: syslog
222 * subseq) other fds: syslog, call(server-end)
224 * receives args, env, fds
227 * executor sorts out fds:
228 * fd0, fd1, fd2: from-outer
229 * close fds: call(server-end)
230 * retained fds: syslog
233 * runs main part of script
236 * [monitor] [fd[012]: null]
237 * [fd[12]: init: fd2-from-outer; subseq: null]
238 * [errors: init: fd2; subseq: syslog]
240 * reports status via socket
242 * [client (C wrapper)] [fd0, fd1, fd2: from-outer]
243 * [other fd: call(client-end)]
244 * receives status, exits appropriately
245 * (if was bad signal, reports to stderr, exits 127)
248 #include <arpa/inet.h>
// Program name string.
// NOTE(review): not referenced elsewhere in this part of the file; presumably
// consumed by shared diagnostics code - confirm.
254 const char our_name[] = "prefork-interp";
// Address of the server's AF_UNIX socket.  main() fills it in from
// socket_path; it is then used for connect() and for bind().
256 static struct sockaddr_un sockaddr_sun;
// stdio stream wrapping the call socket; assigned by connect_or_spawn().
257 static FILE *call_sock;
// NOTE(review): no use of ACK_BYTE is visible in this part of the file;
// presumably the "ack byte" mentioned in the design comment - confirm.
259 #define ACK_BYTE '\n'
// Command line for the interpreter: built by make_executor_argv() and
// handed to execvp() in become_setup().
261 static const char *const *executor_argv;
// Protocol magic that read_greeting() expects at the start of the greeting.
// The array is exactly 4 bytes, so it holds no terminating NUL: compare it
// with memcmp(), never with string functions.
263 static const char header_magic[4] = "PFI\n";
// Prints the usage summary to the stream f.
265 void fusagemessage(FILE *f) {
266 fprintf(f, "usage: #!/usr/bin/prefork-interp [<options>]\n");
// Argument laundering mode selected on the command line; examined by
// make_executor_argv(), which dies unless a supported mode was given.
269 static int laundering;
270 static int max_sockets = 100; // maximum entries in the run dir is 2x this
// stat() of our freshly bound listening socket, recorded in
// connect_or_spawn(); the watcher compares the socket path against it.
272 static struct stat initial_stab;
// Command-line option table.
274 const struct cmdinfo cmdinfos[]= {
// -U: presumably stores 'U' into laundering via .iassignto/.arg -
// confirm against the struct cmdinfo definition.
276 { 0, 'U', 0, .iassignto= &laundering, .arg= 'U' },
// Feeds a single zero byte into the identsc SHA-256 context.
// NOTE(review): presumably a format/version marker for the ident hash, so
// that a future change can produce different idents - confirm.
280 void ident_addinit(void) {
281 char ident_magic[1] = { 0 };
282 sha256_update(&identsc, sizeof(ident_magic), ident_magic);
// Acts on a wait status obtained for `what` (a short description used in
// the diagnostics).  Handles three cases: normal exit, death by signal, and
// any other ("weird") status, which is fatal.
// NOTE(review): connect_or_spawn() carries on after calling this only when
// status == 0, so this presumably never returns for a nonzero status -
// confirm in the WIFEXITED branch.
285 static void propagate_exit_status(int status, const char *what) {
288 if (WIFEXITED(status)) {
292 if (WIFSIGNALED(status)) {
293 int sig = WTERMSIG(status);
294 const char *signame = strsignal(sig);
// strsignal() may yield a null pointer; fall back to a fixed description.
295 if (signame == 0) signame = "unknown signal";
// When no core was dumped (and the remaining conditions of this test hold),
// propagate the signal to ourselves: restore its default disposition and
// unblock it first, so that raising it actually terminates this process.
297 if (! WCOREDUMP(status) &&
304 sa.sa_handler = SIG_DFL;
305 r = sigaction(sig, &sa, 0);
306 if (r) diee("failed to reset signal handler while propagating %s",
311 sigaddset(&sset, sig);
312 r = sigprocmask(SIG_UNBLOCK, &sset, 0);
313 if (r) diee("failed to reset signal block while propagating %s",
// Only reached if the raised signal failed to kill us.
317 die("unexpectedly kept running after raising (to propagate) %s",
// Otherwise report the signal death as an ordinary failure.
321 die("%s failed due to signal %d %s%s", what, sig, signame,
322 WCOREDUMP(status) ? " (core dumped)" : "");
// Neither exited nor signalled: report the raw status in decimal and hex.
325 die("%s failed with weird wait status %d 0x%x", what, status, status);
// qsort() comparator: orders PrecleanEntry records by name_hash using
// strcmp(), so entries with equal hashes end up adjacent.
333 static int preclean_entry_compar_name(const void *av, const void *bv) {
334 const PrecleanEntry *a = av;
335 const PrecleanEntry *b = bv;
336 return strcmp(a->name_hash, b->name_hash);
// qsort() comparator on the atime field of PrecleanEntry records.
// NOTE(review): the sort direction (oldest first or newest first) follows
// from the return expression - confirm it matches what preclean() expects.
339 static int preclean_entry_compar_atime(const void *av, const void *bv) {
340 const PrecleanEntry *ae = av; time_t a = ae->atime;
341 const PrecleanEntry *be = bv; time_t b = be->atime;
// Returns the access time of s_path as reported by lstat() (so a symlink
// itself is examined, not its target).
// A failure other than ENOENT is fatal.
// NOTE(review): confirm which value is returned when the path does not exist.
346 static time_t preclean_stat_atime(const char *s_path) {
348 int r= lstat(s_path, &stab);
350 if (errno!=ENOENT) diee("pre-cleanup: stat socket (%s)", s_path);
353 return stab.st_atime;
// Pre-cleanup of the run directory (run_base): gathers the name hashes of
// the existing 'l...' and 's...' entries, removes duplicates, and if more
// than max_sockets remain, deletes socket and lock files for some of them.
356 static void preclean(void) {
357 DIR *dir = opendir(run_base);
// A missing run directory just means there is nothing to clean.
359 if (errno == ENOENT) return;
360 diee("pre-cleanup: open run dir (%s)", run_base);
// Growable array of entries: allocated capacity and number in use.
363 PrecleanEntry *entries=0;
364 size_t avail_entries=0;
365 size_t used_entries=0;
// errno is cleared before every readdir() so that end-of-directory can be
// distinguished from a read error; it is checked after the loop.
368 while ((errno = 0, de = readdir(dir))) {
369 char c0 = de->d_name[0];
// Only names starting with 'l' or 's' are of interest; the remainder of
// the name is taken as the hash.
370 if (!(c0 == 'l' || c0 == 's')) continue;
371 char *name_hash = m_asprintf("%s", de->d_name+1);
// The atime is always taken from the 's' path, whichever of the two
// directory entries we are currently looking at.
372 char *s_path = m_asprintf("%s/s%s", run_base, name_hash);
373 time_t atime = preclean_stat_atime(s_path);
// Array full: grow it.  The assert bounds the element count before it is
// multiplied by the element size.
// NOTE(review): the realloc() result is assigned straight back to entries;
// confirm that allocation failure is checked before entries is used.
375 if (avail_entries == used_entries) {
376 assert(avail_entries < INT_MAX / 4 / sizeof(PrecleanEntry));
379 entries = realloc(entries, avail_entries * sizeof(PrecleanEntry));
381 entries[used_entries].name_hash = name_hash;
382 entries[used_entries].atime = atime;
385 if (errno) diee("pre-cleanup: read run dir (%s)", run_base);
387 // First we dedupe (after sorting by path)
388 qsort(entries, used_entries, sizeof(PrecleanEntry),
389 preclean_entry_compar_name);
// p scans every entry; q marks the end of the deduplicated prefix, so
// (q-1) is the most recently kept entry.
390 PrecleanEntry *p, *q;
391 for (p=entries, q=entries; p < entries + used_entries; p++) {
392 if (q > entries && !strcmp(p->name_hash, (q-1)->name_hash))
396 used_entries = q - entries;
398 // Now maybe delete some things
400 // Actually this has an off-by-one error since we are about
401 // to create a socket, so the actual number of sockets is one more.
402 // But, *actually*, since there might be multiple of us running at once,
403 // we might have even more than that. This doesn't really matter.
404 if (used_entries > max_sockets) {
405 qsort(entries, used_entries, sizeof(PrecleanEntry),
406 preclean_entry_compar_atime);
// Visit the first max_sockets entries of the atime-sorted array.
// NOTE(review): which entries these are depends on the comparator's sort
// direction - confirm that the intended entries (and count) are visited.
407 for (p=entries; p < entries + max_sockets; p++) {
408 char *l_path = m_asprintf("%s/l%s", run_base, p->name_hash);
409 char *s_path = m_asprintf("%s/s%s", run_base, p->name_hash);
// Presumably takes this entry's lock so we do not race with a client that
// is starting a server for the same ident - confirm flock_file semantics.
410 int lock_fd = flock_file(l_path);
411 // Recheck atime - we might have raced!
412 time_t atime = preclean_stat_atime(s_path);
413 if (atime != p->atime) {
414 // Raced. This will leave us deleting too few things. Whatever.
// The socket having vanished already (ENOENT) is not an error.
416 int r= unlink(s_path);
417 if (r && errno!=ENOENT) diee("preclean: delete stale (%s)", s_path);
// NOTE(review): this message is about the lock file but prints s_path;
// presumably l_path was meant - confirm.
419 if (r) diee("preclean: delete stale lock (%s)", s_path);
420 // NB we don't hold the lock any more now.
// Final pass over all remaining entries.
428 for (p=entries; p < entries + used_entries; p++)
// Fatal-error helper for request data too large to be described by the
// protocol's 32-bit length fields; never returns.
433 static __attribute((noreturn)) void die_data_overflow(void) {
434 die("cannot handle data with length >2^32");
// Lowest-level serialisation primitive: accounts for and/or copies dl
// bytes of data into the message being built.
// send_request() runs the prepare_* functions twice, once with only len
// (to measure) and once with only buf (to write), so presumably each of
// len and buf may be null here - confirm the guards.
437 static void prepare_data(size_t *len, char **buf,
438 const void *data, size_t dl) {
// Reject a length that would overflow the running total.
440 if (dl >= SIZE_MAX - *len)
445 memcpy(*buf, data, dl);
// Appends dl_sz as a 32-bit length in network byte order.
// Dies (via die_data_overflow) if dl_sz does not fit in 32 bits.
450 static void prepare_length(size_t *len, char **buf, size_t dl_sz) {
451 if (dl_sz > UINT32_MAX) die_data_overflow();
452 uint32_t dl = htonl(dl_sz);
453 prepare_data(len, buf, &dl, sizeof(dl));
// Appends the string s including its terminating NUL byte, which serves
// as the delimiter between strings in the message.
456 static void prepare_string(size_t *len, char **buf, const char *s) {
457 size_t sl = strlen(s);
458 prepare_data(len, buf, s, sl+1);
// Serialises the request body as a sequence of NUL-terminated strings:
// strings taken from environ, then an empty string as a separator, then a
// further list of strings (presumably the arguments - confirm).
461 static void prepare_message(size_t *len, char **buf) {
464 const char *const *p = (void*)environ;
467 prepare_string(len, buf, s);
// An empty string marks the end of the environment section.
470 prepare_string(len, buf, "");
474 prepare_string(len, buf, s);
// Passes the file descriptor payload_fd to the peer of call_sock as
// SCM_RIGHTS ancillary data, accompanied by a dummy data byte.
// sendmsg() is retried when interrupted by a signal (EINTR).
477 static void send_fd(int payload_fd) {
478 int via_fd = fileno(call_sock);
// Control-message buffer sized for exactly one fd; the struct cmsghdr
// member is presumably there to give the buffer suitable alignment.
481 struct cmsghdr align;
482 char buf[CMSG_SPACE(sizeof(payload_fd))];
492 iov.iov_base = &dummy_byte;
498 msg.msg_control = cmsg_buf.buf;
499 msg.msg_controllen = sizeof(cmsg_buf.buf);
501 struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
502 cmsg->cmsg_level = SOL_SOCKET;
503 cmsg->cmsg_type = SCM_RIGHTS;
504 cmsg->cmsg_len = CMSG_LEN(sizeof(payload_fd));
505 *(int*)CMSG_DATA(cmsg) = payload_fd;
// NOTE(review): msg_controllen was already set to this same value above;
// this second assignment looks redundant.
507 msg.msg_controllen = sizeof(cmsg_buf.buf);
510 ssize_t r = sendmsg(via_fd, &msg, 0);
512 if (errno == EINTR) continue;
// Sends the request over call_sock.  The message is built in two passes
// over prepare_message(): first with only a length accumulator to measure
// it, then with only a write pointer to fill an exactly-sized buffer.
520 static void send_request(void) {
521 // Sending these first makes it easier for the script to
522 // use buffered IO for the message.
// Pass 1: measure.
528 prepare_message(&len, 0);
// The extra 4 bytes hold the 32-bit length prefix written by
// prepare_length().
530 size_t tlen = len + 4;
531 char *m = xmalloc(tlen);
// Pass 2: write the length prefix and then the message itself.
533 prepare_length(0, &p, len);
534 prepare_message(0, &p);
// Both passes must agree on the size.
535 assert(p == m + tlen);
// Written as a single item, so fwrite() returns 1 only when all of it
// was accepted.
537 ssize_t sr = fwrite(m, tlen, 1, call_sock);
538 if (sr != 1) diee("write request (buffer)");
540 if (fflush(call_sock)) diee("write request");
// Wraps the socket fd in a read/write stdio stream with buffering turned
// off; dies if either step fails.
// NOTE(review): the local named call_sock shadows the file-scope variable
// of the same name; the global is only set by the caller's assignment.
543 static FILE *call_sock_from_fd(int fd) {
546 FILE *call_sock = fdopen(fd, "r+");
547 if (!call_sock) diee("fdopen socket");
549 r = setvbuf(call_sock, 0, _IONBF, 0);
// NOTE(review): uses die() where neighbouring checks use diee(); confirm
// whether omitting the errno text is deliberate.
550 if (r) die("setvbuf socket");
// True if the last read on the stream hit end-of-file, or if errno says the
// peer reset the connection; both are treated as "the peer went away".
// The parameter shadows the file-scope call_sock.
555 static bool was_eof(FILE *call_sock) {
556 return feof(call_sock) || errno==ECONNRESET;
// Reads exactly sz bytes from call_sock into data.
// Returns -1 if the peer went away (see was_eof()); any other read
// failure is fatal.
560 static int protocol_read_maybe(void *data, size_t sz) {
// Read as a single item so that a short read counts as a failure.
562 size_t sr = fread(data, sz, 1, call_sock);
564 if (was_eof(call_sock)) return -1;
// NOTE(review): sz is a size_t but is printed with %zd (the signed
// conversion); %zu is the matching specifier.
565 diee("read() on monitor call socket (%zd)", sz);
// As protocol_read_maybe(), except that the peer going away is fatal too.
570 static void protocol_read(void *data, size_t sz) {
571 if (protocol_read_maybe(data, sz) < 0)
572 die("monitor process quit unexpectedly");
575 // Returns 0 if OK, error msg if peer was garbage.
// The greeting consists of the 4-byte header_magic, then a length word,
// then that many bytes of extension data ("xdata").
576 static const char *read_greeting(void) {
577 char got_magic[sizeof(header_magic)];
// The peer vanishing before sending the magic is the "garbage" case that
// is reported to the caller instead of being fatal.
579 if (protocol_read_maybe(&got_magic, sizeof(got_magic)) < 0)
580 return "initial monitor process quit";
// NOTE(review): a wrong magic is fatal here rather than being returned as
// an error message - confirm that this matches the contract stated above.
582 if (memcmp(got_magic, header_magic, sizeof(header_magic)))
583 die("got unexpected protocol magic 0x%02x%02x%02x%02x",
584 got_magic[0], got_magic[1], got_magic[2], got_magic[3]);
// NOTE(review): xdata_len is used exactly as read off the wire, with no
// byte-order conversion between the read and its use, whereas main()
// applies ntohl() to the status word - confirm the peer's encoding.
587 protocol_read(&xdata_len, sizeof(xdata_len));
588 void *xdata = xmalloc(xdata_len);
589 protocol_read(xdata, xdata_len);
594 // Returns: call(client-end), or 0 to mean "is garbage"
595 // find_socket_path must have been called
// Tries to connect to a server already listening at sockaddr_sun.
596 static FILE *connect_existing(void) {
600 fd = socket(AF_UNIX, SOCK_STREAM, 0);
601 if (fd==-1) diee("socket() for client");
603 socklen_t salen = sizeof(sockaddr_sun);
604 r = connect(fd, (const struct sockaddr*)&sockaddr_sun, salen);
// No listener (ECONNREFUSED) or no socket file (ENOENT) means there is no
// usable server; any other connect failure is fatal.
606 if (errno==ECONNREFUSED || errno==ENOENT) goto x_garbage;
607 diee("connect() %s", socket_path);
610 call_sock = call_sock_from_fd(fd);
// Cleanup for the "garbage" outcome.
// NOTE(review): fclose() on a stream made by fdopen() closes the underlying
// fd as well; confirm fd is invalidated once the stream owns it, otherwise
// the close() below closes it a second time.
619 if (call_sock) { fclose(call_sock); call_sock=0; }
620 if (fd >= 0) close(fd);
// libuv poll callback for the watcher's stdin.
// libuv reports failure as a negated errno value in status, which is
// converted back to errno for diee().
624 static void watcher_cb_stdin(uv_poll_t *handle, int status, int events) {
628 if ((errno = -status)) diee("watcher: poll stdin");
// Transient conditions (interrupted, nothing to read yet) are tolerated;
// any other read error is fatal.
632 if (!(errno==EINTR || errno==EWOULDBLOCK || errno==EAGAIN))
633 diee("watcher: read sentinel stdin");
// libuv filesystem-event callback for the socket path: re-stats the path
// and compares the result with initial_stab to see whether it is still the
// socket that was originally bound.
637 static void watcher_cb_sockpath(uv_fs_event_t *handle, const char *filename,
638 int events, int status) {
640 struct stat now_stab;
// NOTE(review): this message says "poll stdin" but this is the socket-path
// callback; it looks copied from watcher_cb_stdin - confirm and correct.
642 if ((errno = -status)) diee("watcher: poll stdin");
644 r= stat(socket_path, &now_stab);
// Socket path gone: the watcher's job is done, so exit quietly.
646 if (errno==ENOENT) _exit(0);
// Interrupted by a signal: retry the stat().
647 if (errno==EINTR) continue;
648 diee("stat socket: %s", socket_path);
649 
650 if (!stabs_same_inode(&now_stab, &initial_stab))
655 // On entry, stderr is still inherited, but 0 and 1 are the pipes
// Turns this process into the watcher: sets up a libuv loop that polls
// stdin and watches the socket path, daemonises, and runs the loop.
// Never returns.
656 static __attribute__((noreturn))
657 void become_watcher(void) {
659 uv_poll_t uvhandle_stdin;
660 uv_fs_event_t uvhandle_sockpath;
// libuv calls return 0 or a negated errno value; negating the result
// yields an errno that diee() can report.
665 errno= -uv_loop_init(&loop);
666 if (errno) diee("watcher: uv_loop_init");
// Poll fd 0 (the stdin pipe).
668 errno= -uv_poll_init(&loop, &uvhandle_stdin, 0);
669 if (errno) diee("watcher: uv_poll_init");
670 errno= -uv_poll_start(&uvhandle_stdin,
671 UV_READABLE | UV_WRITABLE | UV_DISCONNECT,
673 if (errno) diee("watcher: uv_poll_start");
// Filesystem-event watch, handled by watcher_cb_sockpath().
675 errno= -uv_fs_event_init(&loop, &uvhandle_sockpath);
676 if (errno) diee("watcher: uv_fs_event_init");
678 errno= -uv_fs_event_start(&uvhandle_sockpath, watcher_cb_sockpath,
680 if (errno) diee("watcher: uv_fs_event_start");
682 // OK everything is set up, let us daemonise
// From here on stderr is the pipe on fd 1, no longer the inherited stderr;
// it is made line buffered.
683 if (dup2(1,2) != 2) diee("watcher: set daemonised stderr");
684 r= setvbuf(stderr, 0, _IOLBF, BUFSIZ);
685 if (r) diee("watcher: setvbuf stderr");
// One fork followed by setsid(), as described in the design comment
// ("daemonises (one fork, becomes session leader)").
687 pid_t child = fork();
688 if (child == (pid_t)-1) diee("watcher: fork");
691 if (setsid() == (pid_t)-1) diee("watcher: setsid");
// The loop is not expected to finish; uv_run() returning is fatal.
693 r= uv_run(&loop, UV_RUN_DEFAULT);
694 die("uv_run returned (%d)", r);
// Runs in the forked setup child: arranges the standard fds, advertises
// the inherited fds through the PREFORK_INTERP environment variable, and
// execs the interpreter.  Never returns.
//   sfd             listening socket
//   fake_pair       socketpair for the "fake" initial call; element [1] is
//                   the end passed to the script
//   watcher_stdin,
//   watcher_stderr  pipe fds connected to the watcher
697 static __attribute__((noreturn))
698 void become_setup(int sfd, int fake_pair[2],
699 int watcher_stdin, int watcher_stderr) {
701 int call_fd = fake_pair[1];
// stdin comes from /dev/null.  dup2() returns the target fd, i.e. 0, on
// success, so any nonzero result is a failure.
703 int null_0 = open("/dev/null", O_RDONLY); if (null_0 < 0) diee("open null");
704 if (dup2(null_0, 0)) diee("dup2 /dev/null onto stdin");
// stdout is redirected to wherever stderr goes.
// NOTE(review): uses die() where the neighbouring checks use diee(), so the
// errno text is lost - confirm whether that is deliberate.
706 if (dup2(2, 1) != 1) die("dup2 stderr onto stdout");
710 // Extension could work like this:
712 // We advertise a new protocol (perhaps one which is nearly entirely
713 // different after the connect) by putting a name for it comma-separated
714 // next to "v1". Simple extension can be done by having the script
715 // side say something about it in the ack xdata, which we currently ignore.
// The string handed to putenv() becomes part of the environment itself, so
// it must not be freed.
716 putenv(m_asprintf("PREFORK_INTERP=v1 %d,%d,%d,%d",
717 sfd, call_fd, watcher_stdin, watcher_stderr));
// execvp() only returns on failure.
719 execvp(executor_argv[0], (char**)executor_argv);
720 diee("execute %s", executor_argv[0]);
// Leaves call_sock connected to a server for this ident: either an
// existing one, or a freshly spawned one (listening socket, watcher
// process and setup process), in which case call_sock is our end of the
// "fake" initial-call socketpair.
723 static void connect_or_spawn(void) {
// Fast path: a server is already running.
726 call_sock = connect_existing();
727 if (call_sock) return;
729 // We're going to make a new one, so clean out old ones
// Retry under the lock: another client may have started the server
// between our first attempt and our acquiring the lock.
732 int lockfd = acquire_lock();
733 call_sock = connect_existing();
734 if (call_sock) { close(lockfd); return; }
736 // We must start a fresh one, and we hold the lock
// Remove any stale socket file; its absence is not an error.
738 r = unlink(socket_path);
739 if (r<0 && errno!=ENOENT)
740 diee("failed to remove stale socket %s", socket_path);
742 int sfd = socket(AF_UNIX, SOCK_STREAM, 0);
743 if (sfd<0) diee("socket() for new listener");
745 socklen_t salen = sizeof(sockaddr_sun);
746 r= bind(sfd, (const struct sockaddr*)&sockaddr_sun, salen);
747 if (r<0) diee("bind() on new listener");
// Record the new socket's stat; the watcher compares the socket path
// against it later (see watcher_cb_sockpath()).
749 r= stat(socket_path, &initial_stab);
750 if (r<0) diee("stat() fresh socket");
752 // We never want callers to get ECONNREFUSED. But:
753 // There is a race here: from my RTFM they may get ECONNREFUSED
754 // if they try between our bind() and listen(). But if they do, they'll
755 // acquire the lock (serialising with us) and retry, and then it will work.
756 r = listen(sfd, INT_MAX);
757 if (r<0) diee("listen() for new listener");
// Two pipes connect the watcher to the processes started below.
761 int watcher_stdin[2];
762 int watcher_stderr[2];
763 if (pipe(watcher_stdin) || pipe(watcher_stderr))
764 diee("pipe() for socket inode watcher");
766 pid_t watcher = fork();
767 if (watcher == (pid_t)-1) diee("fork for watcher");
// Watcher side: keep the read end of the stdin pipe as fd 0 and the write
// end of the stderr pipe as fd 1, then close the original descriptors.
771 close(watcher_stdin[1]);
772 close(watcher_stderr[0]);
773 if (dup2(watcher_stdin[0], 0) != 0 ||
774 dup2(watcher_stderr[1], 1) != 1)
775 diee("initial dup2() for watcher");
776 close(watcher_stdin[0]);
777 close(watcher_stderr[1]);
// Our side: drop the watcher's ends of the pipes; the end we read from is
// made nonblocking.
781 close(watcher_stdin[0]);
782 close(watcher_stderr[1]);
783 nonblock(watcher_stderr[0]);
// Socketpair standing in for the first client connection.
788 r = socketpair(AF_UNIX, SOCK_STREAM, 0, fake_pair);
789 if (r<0) diee("socketpair() for fake initial connection");
// The setup child receives the listener, the socketpair and our ends of
// the watcher pipes; become_setup() does not return.
791 pid_t setup_pid = fork();
792 if (setup_pid == (pid_t)-1) diee("fork for spawn setup");
793 if (!setup_pid) become_setup(sfd, fake_pair,
794 watcher_stdin[1], watcher_stderr[0]);
// We keep element [0] of the socketpair as our call socket.
798 call_sock = call_sock_from_fd(fake_pair[0]);
// Wait for the setup process to finish and propagate any failure.
801 pid_t got = waitpid(setup_pid, &status, 0);
802 if (got == (pid_t)-1) diee("waitpid setup [%ld]", (long)setup_pid);
803 if (got != setup_pid) diee("waitpid setup [%ld] gave [%ld]!",
804 (long)setup_pid, (long)got);
805 if (status != 0) propagate_exit_status(status, "setup");
// Setup exited cleanly; the greeting on the fake call socket confirms that
// the server side is up.
807 const char *emsg = read_greeting();
808 if (emsg) die("setup failed: %s", emsg);
// Builds executor_argv, the command line for the interpreter, from the
// interpreter name, the script (if any) and the remaining arguments.
// Dies unless a laundering mode was selected on the command line.
814 static void make_executor_argv(const char *const *argv) {
815 switch (laundering) {
817 default: die("need -U (specifying unlaundered argument handling)");
// EACH_NEW_ARG(EACH) runs the statements EACH once for every output
// argument, with `arg` set to it: interp, then script if non-null, then
// each element of argv up to its null terminator.  It is expanded twice
// below, first to count the arguments and then to store them.
821 #define EACH_NEW_ARG(EACH) { \
822 arg = interp; { EACH } \
823 if ((arg = script)) { EACH } \
824 const char *const *walk = argv; \
825 while ((arg = *walk++)) { EACH } \
829 EACH_NEW_ARG( (void)arg; count++; );
// calloc() zero-fills the array, so unwritten slots are null pointers.
831 const char **out = calloc(count, sizeof(char*));
832 executor_argv = (const char* const*)out;
833 if (!executor_argv) diee("allocate for arguments");
835 EACH_NEW_ARG( *out++ = arg; );
// Entry point: builds the interpreter command line, fills in the socket
// address, sends the request, then reads the status word and acts on it
// via propagate_exit_status().
839 int main(int argc_unused, const char *const *argv) {
846 // which ought to be passed on to the actual executor.
847 make_executor_argv(argv);
// Fill in the AF_UNIX address.  The structure is zeroed first, so a path
// shorter than sun_path ends up NUL-terminated.
// NOTE(review): the assert permits strlen(socket_path) to equal
// sizeof(sun_path), in which case strncpy() leaves no terminating NUL -
// confirm that an unterminated sun_path is intended and supported.
850 FILLZERO(sockaddr_sun);
851 sockaddr_sun.sun_family = AF_UNIX;
852 assert(strlen(socket_path) <= sizeof(sockaddr_sun.sun_path));
853 strncpy(sockaddr_sun.sun_path, socket_path, sizeof(sockaddr_sun.sun_path));
857 // We're committed now, send the request (or bail out)
// The status arrives as a 32-bit word in network byte order.
861 protocol_read(&status, sizeof(status));
863 status = ntohl(status);
// propagate_exit_status() takes an int, so reject values beyond its range.
864 if (status > INT_MAX) die("status 0x%lx does not fit in an int",
865 (unsigned long)status);
867 propagate_exit_status(status, "invocation");