chiark / gitweb /
nspawn: use Barrier API instead of eventfd-util
author David Herrmann <dh.herrmann@gmail.com>
Sun, 13 Jul 2014 10:14:45 +0000 (12:14 +0200)
committer David Herrmann <dh.herrmann@gmail.com>
Thu, 17 Jul 2014 09:34:25 +0000 (11:34 +0200)
The Barrier-API simplifies cross-fork() synchronization a lot. Replace the
hard-coded eventfd-util implementation and drop it.

Compared to the old API, Barriers also handle exit() of the remote side as
abortion. This way, segfaults will not cause the parent to deadlock.

EINTR handling is currently ignored for any barrier-waits. This can easily
be added, but it isn't needed so far so I dropped it. EINTR handling in
general is ugly, anyway. You need to deal with pselect/ppoll/... variants
and make sure not to unblock signals at the wrong times. So generally,
there's little use in adding it.

Makefile.am
src/nspawn/nspawn.c
src/shared/eventfd-util.c [deleted file]
src/shared/eventfd-util.h [deleted file]

index f0d80ba..fe680b0 100644 (file)
@@ -845,8 +845,6 @@ libsystemd_shared_la_SOURCES = \
        src/shared/barrier.h \
        src/shared/async.c \
        src/shared/async.h \
-       src/shared/eventfd-util.c \
-       src/shared/eventfd-util.h \
        src/shared/copy.c \
        src/shared/copy.h \
        src/shared/base-filesystem.c \
index bad93a5..e75cc28 100644 (file)
@@ -40,7 +40,6 @@
 #include <sys/un.h>
 #include <sys/socket.h>
 #include <linux/netlink.h>
-#include <sys/eventfd.h>
 #include <net/if.h>
 #include <linux/veth.h>
 #include <sys/personality.h>
 #include "def.h"
 #include "rtnl-util.h"
 #include "udev-util.h"
-#include "eventfd-util.h"
 #include "blkid-util.h"
 #include "gpt.h"
 #include "siphash24.h"
 #include "copy.h"
 #include "base-filesystem.h"
+#include "barrier.h"
 
 #ifdef HAVE_SECCOMP
 #include "seccomp-util.h"
@@ -3074,12 +3073,18 @@ int main(int argc, char *argv[]) {
 
         for (;;) {
                 ContainerStatus container_status;
-                int eventfds[2] = { -1, -1 };
+                _barrier_destroy_ Barrier barrier = { };
                 struct sigaction sa = {
                         .sa_handler = nop_handler,
                         .sa_flags = SA_NOCLDSTOP,
                 };
 
+                r = barrier_init(&barrier);
+                if (r < 0) {
+                        log_error("Cannot initialize IPC barrier: %s", strerror(-r));
+                        goto finish;
+                }
+
                 /* Child can be killed before execv(), so handle SIGCHLD
                  * in order to interrupt parent's blocking calls and
                  * give it a chance to call wait() and terminate. */
@@ -3095,9 +3100,9 @@ int main(int argc, char *argv[]) {
                         goto finish;
                 }
 
-                pid = clone_with_eventfd(SIGCHLD|CLONE_NEWNS|
-                                         (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
-                                         (arg_private_network ? CLONE_NEWNET : 0), eventfds);
+                pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
+                                          (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
+                                          (arg_private_network ? CLONE_NEWNET : 0), NULL);
                 if (pid < 0) {
                         if (errno == EINVAL)
                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
@@ -3126,6 +3131,8 @@ int main(int argc, char *argv[]) {
                         };
                         char **env_use;
 
+                        barrier_set_role(&barrier, BARRIER_CHILD);
+
                         envp[n_env] = strv_find_prefix(environ, "TERM=");
                         if (envp[n_env])
                                 n_env ++;
@@ -3151,26 +3158,26 @@ int main(int argc, char *argv[]) {
                                 }
 
                                 log_error("Failed to open console: %s", strerror(-k));
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
                         }
 
                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
                                 log_error("Failed to duplicate console: %m");
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
                         }
 
                         if (setsid() < 0) {
                                 log_error("setsid() failed: %m");
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
                         }
 
                         if (reset_audit_loginuid() < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
                                 log_error("PR_SET_PDEATHSIG failed: %m");
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
                         }
 
                         /* Mark everything as slave, so that we still
@@ -3178,113 +3185,109 @@ int main(int argc, char *argv[]) {
                          * propagate mounts to the real root. */
                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
                                 log_error("MS_SLAVE|MS_REC failed: %m");
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
                         }
 
                         if (mount_devices(arg_directory,
                                           root_device, root_device_rw,
                                           home_device, home_device_rw,
                                           srv_device, srv_device_rw) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         /* Turn directory into bind mount */
                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
                                 log_error("Failed to make bind mount: %m");
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
                         }
 
                         r = setup_volatile(arg_directory);
                         if (r < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if (setup_volatile_state(arg_directory) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         r = base_filesystem_create(arg_directory);
                         if (r < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if (arg_read_only) {
                                 k = bind_remount_recursive(arg_directory, true);
                                 if (k < 0) {
                                         log_error("Failed to make tree read-only: %s", strerror(-k));
-                                        goto child_fail;
+                                        _exit(EXIT_FAILURE);
                                 }
                         }
 
                         if (mount_all(arg_directory) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if (copy_devnodes(arg_directory) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if (setup_ptmx(arg_directory) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         dev_setup(arg_directory);
 
                         if (setup_seccomp() < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if (setup_dev_console(arg_directory, console) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
 
                         if (setup_boot_id(arg_directory) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if (setup_timezone(arg_directory) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if (setup_resolv_conf(arg_directory) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if (setup_journal(arg_directory) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if (mount_binds(arg_directory, arg_bind, false) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if (mount_tmpfs(arg_directory) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         /* Tell the parent that we are ready, and that
                          * it can cgroupify us to that we lack access
                          * to certain devices and resources. */
-                        r = eventfd_send_state(eventfds[1],
-                                               EVENTFD_CHILD_SUCCEEDED);
-                        eventfds[1] = safe_close(eventfds[1]);
-                        if (r < 0)
-                                goto child_fail;
+                        barrier_place(&barrier);
 
                         if (chdir(arg_directory) < 0) {
                                 log_error("chdir(%s) failed: %m", arg_directory);
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
                         }
 
                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
                                 log_error("mount(MS_MOVE) failed: %m");
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
                         }
 
                         if (chroot(".") < 0) {
                                 log_error("chroot() failed: %m");
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
                         }
 
                         if (chdir("/") < 0) {
                                 log_error("chdir() failed: %m");
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
                         }
 
                         umask(0022);
@@ -3294,18 +3297,18 @@ int main(int argc, char *argv[]) {
 
                         if (drop_capabilities() < 0) {
                                 log_error("drop_capabilities() failed: %m");
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
                         }
 
                         r = change_uid_gid(&home);
                         if (r < 0)
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
 
                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
                                 log_oom();
-                                goto child_fail;
+                                _exit(EXIT_FAILURE);
                         }
 
                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
@@ -3313,7 +3316,7 @@ int main(int argc, char *argv[]) {
 
                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
                                         log_oom();
-                                        goto child_fail;
+                                        _exit(EXIT_FAILURE);
                                 }
                         }
 
@@ -3321,13 +3324,13 @@ int main(int argc, char *argv[]) {
                                 k = fdset_cloexec(fds, false);
                                 if (k < 0) {
                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
-                                        goto child_fail;
+                                        _exit(EXIT_FAILURE);
                                 }
 
                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
                                         log_oom();
-                                        goto child_fail;
+                                        _exit(EXIT_FAILURE);
                                 }
                         }
 
@@ -3336,12 +3339,12 @@ int main(int argc, char *argv[]) {
                         if (arg_personality != 0xffffffffLU) {
                                 if (personality(arg_personality) < 0) {
                                         log_error("personality() failed: %m");
-                                        goto child_fail;
+                                        _exit(EXIT_FAILURE);
                                 }
                         } else if (secondary) {
                                 if (personality(PER_LINUX32) < 0) {
                                         log_error("personality() failed: %m");
-                                        goto child_fail;
+                                        _exit(EXIT_FAILURE);
                                 }
                         }
 
@@ -3349,7 +3352,7 @@ int main(int argc, char *argv[]) {
                         if (arg_selinux_context)
                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
-                                        goto child_fail;
+                                        _exit(EXIT_FAILURE);
                                 }
 #endif
 
@@ -3359,7 +3362,7 @@ int main(int argc, char *argv[]) {
                                 n = strv_env_merge(2, envp, arg_setenv);
                                 if (!n) {
                                         log_oom();
-                                        goto child_fail;
+                                        _exit(EXIT_FAILURE);
                                 }
 
                                 env_use = n;
@@ -3367,10 +3370,8 @@ int main(int argc, char *argv[]) {
                                 env_use = (char**) envp;
 
                         /* Wait until the parent is ready with the setup, too... */
-                        r = eventfd_parent_succeeded(eventfds[0]);
-                        eventfds[0] = safe_close(eventfds[0]);
-                        if (r < 0)
-                                goto child_fail;
+                        if (!barrier_place_and_sync(&barrier))
+                                _exit(EXIT_FAILURE);
 
                         if (arg_boot) {
                                 char **a;
@@ -3399,29 +3400,15 @@ int main(int argc, char *argv[]) {
                         }
 
                         log_error("execv() failed: %m");
-
-                child_fail:
-                        /* Tell the parent that the setup failed, so he
-                         * can clean up resources and terminate. */
-                        if (eventfds[1] != -1)
-                                eventfd_send_state(eventfds[1],
-                                                   EVENTFD_CHILD_FAILED);
                         _exit(EXIT_FAILURE);
                 }
 
+                barrier_set_role(&barrier, BARRIER_PARENT);
                 fdset_free(fds);
                 fds = NULL;
 
-                /* Wait for the child event:
-                 * If EVENTFD_CHILD_FAILED, the child will terminate soon.
-                 * If EVENTFD_CHILD_SUCCEEDED, the child is reporting that
-                 * it is ready with all it needs to do with priviliges.
-                 * After we got the notification we can make the process
-                 * join its cgroup which might limit what it can do */
-                r = eventfd_child_succeeded(eventfds[1]);
-                eventfds[1] = safe_close(eventfds[1]);
-
-                if (r >= 0) {
+                /* wait for child-setup to be done */
+                if (barrier_place_and_sync(&barrier)) {
                         int ifi = 0;
 
                         r = move_network_interfaces(pid);
@@ -3458,10 +3445,7 @@ int main(int argc, char *argv[]) {
                         /* Notify the child that the parent is ready with all
                          * its setup, and that the child can now hand over
                          * control to the code to run inside the container. */
-                        r = eventfd_send_state(eventfds[0], EVENTFD_PARENT_SUCCEEDED);
-                        eventfds[0] = safe_close(eventfds[0]);
-                        if (r < 0)
-                                goto finish;
+                        barrier_place(&barrier);
 
                         k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
                         if (k < 0) {
diff --git a/src/shared/eventfd-util.c b/src/shared/eventfd-util.c
deleted file mode 100644 (file)
index 27b7cf7..0000000
+++ /dev/null
@@ -1,169 +0,0 @@
-/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
-
-/***
-  This file is part of systemd.
-
-  Copyright 2014 Djalal Harouni
-
-  systemd is free software; you can redistribute it and/or modify it
-  under the terms of the GNU Lesser General Public License as published by
-  the Free Software Foundation; either version 2.1 of the License, or
-  (at your option) any later version.
-
-  systemd is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-  Lesser General Public License for more details.
-
-  You should have received a copy of the GNU Lesser General Public License
-  along with systemd; If not, see <http://www.gnu.org/licenses/>.
-***/
-
-#include <assert.h>
-#include <errno.h>
-#include <unistd.h>
-#include <sys/eventfd.h>
-#include <sys/syscall.h>
-
-#include "eventfd-util.h"
-#include "util.h"
-
-
-/*
- * Use this to create processes that need to setup a full context
- * and sync it with their parents using cheap mechanisms.
- *
- * This will create two blocking eventfd(s). A pair for the parent and
- * the other for the child so they can be used as a notify mechanism.
- * Each process will gets its copy of the parent and child eventfds.
- *
- * This is useful in case:
- * 1) If the parent fails or dies, the child must die.
- * 2) Child will install PR_SET_PDEATHSIG as soon as possible.
- * 3) Parent and child need to sync using less resources.
- * 4) If parent is not able to install a SIGCHLD handler:
- *    parent will wait using a blocking eventfd_read() or
- *    eventfd_child_succeeded() call on the child eventfd.
- *
- *    * If the child setup succeeded, child should notify with an
- *      EVENTFD_CHILD_SUCCEEDED, parent will continue.
- *    * If the child setup failed, child should notify with an
- *      EVENTFD_CHILD_FAILED before any _exit(). This avoids blocking
- *      the parent.
- *
- * 5) If parent is able to install a SIGCHLD handler:
- *    An empty signal handler without SA_RESTART will do it, since the
- *    blocking eventfd_read() or eventfd_parent_succeeded() of the
- *    parent will be interrupted by SIGCHLD and the call will fail with
- *    EINTR. This is useful in case the child dies abnormaly and did
- *    not have a chance to notify its parent using EVENTFD_CHILD_FAILED.
- *
- * 6) Call wait*() in the main instead of the signal handler in order
- *    to: 1) reduce side effects and 2) have a better handling for
- *    child termination in order to reduce various race conditions.
- *
- *
- * The return value of clone_with_eventfd() is the same of clone().
- * On success the eventfds[] will contain the two eventfd(s). These
- * file descriptors can be closed later with safe_close(). On failure,
- * a negative value is returned in the caller's context, and errno will
- * be set appropriately.
- *
- * Extra preliminary work:
- * 1) Child can wait before starting its setup by using the
- *    eventfd_recv_start() call on the parent eventfd, in that case the
- *    parent must notify with EVENTFD_START, after doing any preliminary
- *    work.
- *
- * Note: this function depends on systemd internal functions
- * safe_close() and it should be used only by direct binaries, no
- * libraries.
- */
-pid_t clone_with_eventfd(int flags, int eventfds[2]) {
-        pid_t pid;
-
-        assert(eventfds);
-
-        eventfds[0] = eventfd(EVENTFD_INIT, EFD_CLOEXEC);
-        if (eventfds[0] < 0)
-                return -1;
-
-        eventfds[1] = eventfd(EVENTFD_INIT, EFD_CLOEXEC);
-        if (eventfds[1] < 0)
-                goto err_eventfd0;
-
-        pid = syscall(__NR_clone, flags, NULL);
-        if (pid < 0)
-                goto err_eventfd1;
-
-        return pid;
-
-err_eventfd1:
-        eventfds[1] = safe_close(eventfds[1]);
-err_eventfd0:
-        eventfds[0] = safe_close(eventfds[0]);
-        return -1;
-}
-
-int eventfd_send_state(int efd, eventfd_t s) {
-        return eventfd_write(efd, s);
-}
-
-/*
- * Receive an eventfd state on the eventfd file descriptor.
- *
- * If the third argument is set to a value other than zero, then this
- * function will compare the received value with this argument and set
- * the return value.
- *
- * On success return 0. On error, -1 will be returned, and errno will
- * be set appropriately.
- */
-int eventfd_recv_state(int efd, eventfd_t *e, eventfd_t s) {
-        int ret;
-
-        ret = eventfd_read(efd, e);
-        if (ret < 0)
-                return ret;
-        else if (s != 0 && *e != s) {
-                errno = EINVAL;
-                return -1;
-        }
-
-        return 0;
-}
-
-/*
- * Receive the EVENTFD_START state on the eventfd file descriptor.
- *
- * On Success return 0. On error, -1 will be returned, and errno will
- * be set appropriately.
- */
-int eventfd_recv_start(int efd) {
-        eventfd_t e = EVENTFD_INIT;
-        return eventfd_recv_state(efd, &e, EVENTFD_START);
-}
-
-/*
- * Receive the EVENTFD_PARENT_SUCCEEDED state on the eventfd file
- * descriptor.
- *
- * On Success return 0. On error, -1 will be returned, and errno will
- * be set appropriately.
- */
-int eventfd_parent_succeeded(int efd) {
-        eventfd_t e = EVENTFD_INIT;
-        return eventfd_recv_state(efd, &e, EVENTFD_PARENT_SUCCEEDED);
-}
-
-/*
- * Receive the EVENTFD_CHILD_SUCCEEDED state on the eventfd file
- * descriptor.
- *
- * On Success return 0. On error, -1 will be returned, and errno will
- * be set appropriately.
- */
-int eventfd_child_succeeded(int efd) {
-        eventfd_t e = EVENTFD_INIT;
-        return eventfd_recv_state(efd, &e, EVENTFD_CHILD_SUCCEEDED);
-}
diff --git a/src/shared/eventfd-util.h b/src/shared/eventfd-util.h
deleted file mode 100644 (file)
index 0120f04..0000000
+++ /dev/null
@@ -1,43 +0,0 @@
-/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
-
-#pragma once
-
-/***
-  This file is part of systemd.
-
-  Copyright 2014 Djalal Harouni
-
-  systemd is free software; you can redistribute it and/or modify it
-  under the terms of the GNU Lesser General Public License as published by
-  the Free Software Foundation; either version 2.1 of the License, or
-  (at your option) any later version.
-
-  systemd is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-  Lesser General Public License for more details.
-
-  You should have received a copy of the GNU Lesser General Public License
-  along with systemd; If not, see <http://www.gnu.org/licenses/>.
-***/
-
-#include <sys/types.h>
-#include <sys/eventfd.h>
-
-enum {
-        EVENTFD_INIT,
-        EVENTFD_START,
-        EVENTFD_PARENT_SUCCEEDED,
-        EVENTFD_PARENT_FAILED,
-        EVENTFD_CHILD_SUCCEEDED,
-        EVENTFD_CHILD_FAILED,
-};
-
-pid_t clone_with_eventfd(int flags, int eventfds[2]);
-
-int eventfd_send_state(int efd, eventfd_t s);
-int eventfd_recv_state(int efd, eventfd_t *e, eventfd_t s);
-
-int eventfd_recv_start(int efd);
-int eventfd_parent_succeeded(int efd);
-int eventfd_child_succeeded(int efd);