chiark / gitweb /
nspawn: make nspawn robust to container failure
[elogind.git] / src / nspawn / nspawn.c
index 2d627db9c7fc364af4f9d2c9ab233fcfd1d4b807..a1d77244f8c38edab93f1139cff95a3af6355f1a 100644 (file)
@@ -84,6 +84,7 @@
 #include "def.h"
 #include "rtnl-util.h"
 #include "udev-util.h"
+#include "eventfd-util.h"
 #include "blkid-util.h"
 #include "gpt.h"
 #include "siphash24.h"
 #include "seccomp-util.h"
 #endif
 
+typedef enum ContainerStatus { /* Outcome stored by wait_for_container() when it returns 0. */
+        CONTAINER_TERMINATED, /* Exited with status 0, or was killed by SIGINT (shut down). */
+        CONTAINER_REBOOTED /* Was killed by SIGHUP; the loop in main() then runs again. */
+} ContainerStatus;
+
 typedef enum LinkJournal {
         LINK_NO,
         LINK_AUTO,
@@ -657,7 +663,7 @@ static int mount_binds(const char *dest, char **l, unsigned long flags) {
                                 return r;
                         }
                 } else {
-                        log_error("Failed to bind mount %s: %s", *x, strerror(errno));
+                        log_error("Failed to bind mount %s: %m", *x);
                         return -errno;
                 }
                 /* Create the mount point, but be conservative -- refuse to create block
@@ -769,6 +775,15 @@ static int setup_resolv_conf(const char *dest) {
         return 0;
 }
 
+static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
+        /* Render id into the caller's buffer s as dash-separated lowercase
+         * hex in 8-4-4-4-12 digit groups, and return s. snprintf() is capped
+         * at 37 bytes: 36 characters plus the terminating NUL.
+         * NOTE(review): assumes SD_ID128_FORMAT_VAL(id) expands to the 16
+         * byte values of id in order -- confirm against its definition. */
+        snprintf(s, 37,
+                 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+                 SD_ID128_FORMAT_VAL(id));
+
+        return s;
+}
+
 static int setup_boot_id(const char *dest) {
         _cleanup_free_ char *from = NULL, *to = NULL;
         sd_id128_t rnd = {};
@@ -794,10 +809,7 @@ static int setup_boot_id(const char *dest) {
                 return r;
         }
 
-        snprintf(as_uuid, sizeof(as_uuid),
-                 "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
-                 SD_ID128_FORMAT_VAL(rnd));
-        char_array_0(as_uuid);
+        id128_format_as_uuid(rnd, as_uuid);
 
         r = write_string_file(from, as_uuid);
         if (r < 0) {
@@ -1137,10 +1149,8 @@ static int setup_journal(const char *directory) {
         } else if (access(p, F_OK) < 0)
                 return 0;
 
-        if (dir_is_empty(q) == 0) {
-                log_error("%s not empty.", q);
-                return -ENOTEMPTY;
-        }
+        if (dir_is_empty(q) == 0)
+                log_warning("%s is not empty, proceeding anyway.", q);
 
         r = mkdir_p(q, 0755);
         if (r < 0) {
@@ -1486,13 +1496,7 @@ static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
                 return r;
         }
 
-        r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "veth");
-        if (r < 0) {
-                log_error("Failed to append netlink kind: %s", strerror(-r));
-                return r;
-        }
-
-        r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA);
+        r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
         if (r < 0) {
                 log_error("Failed to open netlink container: %s", strerror(-r));
                 return r;
@@ -1757,13 +1761,7 @@ static int setup_macvlan(pid_t pid) {
                         return r;
                 }
 
-                r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "macvlan");
-                if (r < 0) {
-                        log_error("Failed to append netlink kind: %s", strerror(-r));
-                        return r;
-                }
-
-                r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA);
+                r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
                 if (r < 0) {
                         log_error("Failed to open netlink container: %s", strerror(-r));
                         return r;
@@ -2378,7 +2376,7 @@ static int change_uid_gid(char **_home) {
         _cleanup_fclose_ FILE *f = NULL;
         _cleanup_close_ int fd = -1;
         unsigned n_uids = 0;
-        size_t sz, l;
+        size_t sz = 0, l;
         uid_t uid;
         gid_t gid;
         pid_t pid;
@@ -2577,19 +2575,89 @@ static int change_uid_gid(char **_home) {
         return 0;
 }
 
+/*
+ * Wait for the container process "pid" to terminate and classify how
+ * it ended.
+ *
+ * Returns 0 if the container exited with status 0, has been shut down
+ * (killed by SIGINT) or is being rebooted (killed by SIGHUP). In these
+ * cases "CONTAINER_TERMINATED" or "CONTAINER_REBOOTED" is saved in the
+ * container argument.
+ *
+ * Returns a negative value on failure: the result of
+ * wait_for_terminate() if that call fails, otherwise -1 when the
+ * container exited with a non-zero status, was killed by any other
+ * signal, ended with si_code CLD_DUMPED, or reported an unrecognized
+ * si_code. The container argument is not written on failure.
+ */
+static int wait_for_container(pid_t pid, ContainerStatus *container) {
+        int r;
+        siginfo_t status;
+
+        r = wait_for_terminate(pid, &status);
+        if (r < 0)
+                return r;
+
+        switch (status.si_code) {
+        case CLD_EXITED:
+                r = status.si_status;
+                if (r == 0) {
+                        if (!arg_quiet)
+                                log_debug("Container %s exited successfully.",
+                                          arg_machine);
+
+                        *container = CONTAINER_TERMINATED;
+                } else {
+                        log_error("Container %s failed with error code %i.",
+                                  arg_machine, status.si_status);
+                        r = -1; /* the exit code is only logged, not passed on */
+                }
+                break;
+
+        case CLD_KILLED:
+                if (status.si_status == SIGINT) {
+                        if (!arg_quiet)
+                                log_info("Container %s has been shut down.",
+                                         arg_machine);
+
+                        *container = CONTAINER_TERMINATED;
+                        r = 0;
+                        break;
+                } else if (status.si_status == SIGHUP) {
+                        if (!arg_quiet)
+                                log_info("Container %s is being rebooted.",
+                                         arg_machine);
+
+                        *container = CONTAINER_REBOOTED;
+                        r = 0;
+                        break;
+                }
+                /* CLD_KILLED by any other signal: fall through and report
+                 * it the same way as CLD_DUMPED */
+
+        case CLD_DUMPED:
+                log_error("Container %s terminated by signal %s.",
+                          arg_machine, signal_to_string(status.si_status));
+                r = -1;
+                break;
+
+        default:
+                log_error("Container %s failed due to unknown reason.",
+                          arg_machine);
+                r = -1;
+                break;
+        }
+
+        return r;
+}
+
+static void nop_handler(int sig) {} /* Deliberately empty; main() installs it as the SIGCHLD handler via sigaction(). */
+
 int main(int argc, char *argv[]) {
 
         _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
         _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
-        _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
+        _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
         _cleanup_fdset_free_ FDSet *fds = NULL;
         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
         const char *console = NULL;
         char veth_name[IFNAMSIZ];
         bool secondary = false;
+        sigset_t mask, mask_chld;
         pid_t pid = 0;
-        sigset_t mask;
 
         log_parse_environment();
         log_open();
@@ -2751,36 +2819,44 @@ int main(int argc, char *argv[]) {
         sd_notify(0, "READY=1");
 
         assert_se(sigemptyset(&mask) == 0);
+        assert_se(sigemptyset(&mask_chld) == 0);
+        sigaddset(&mask_chld, SIGCHLD);
         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
 
         for (;;) {
-                int parent_ready_fd = -1, child_ready_fd = -1;
-                siginfo_t status;
-                eventfd_t x;
-
-                parent_ready_fd = eventfd(0, EFD_CLOEXEC);
-                if (parent_ready_fd < 0) {
-                        log_error("Failed to create event fd: %m");
+                ContainerStatus container_status;
+                int eventfds[2] = { -1, -1 };
+                struct sigaction sa = {
+                        .sa_handler = nop_handler,
+                        .sa_flags = SA_NOCLDSTOP,
+                };
+
+                /* Child can be killed before execv(), so handle SIGCHLD
+                 * in order to interrupt parent's blocking calls and
+                 * give it a chance to call wait() and terminate. */
+                r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
+                if (r < 0) {
+                        log_error("Failed to change the signal mask: %m");
                         goto finish;
                 }
 
-                child_ready_fd = eventfd(0, EFD_CLOEXEC);
-                if (child_ready_fd < 0) {
-                        log_error("Failed to create event fd: %m");
+                r = sigaction(SIGCHLD, &sa, NULL);
+                if (r < 0) {
+                        log_error("Failed to install SIGCHLD handler: %m");
                         goto finish;
                 }
 
-                pid = syscall(__NR_clone,
-                              SIGCHLD|CLONE_NEWNS|
-                              (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
-                              (arg_private_network ? CLONE_NEWNET : 0), NULL);
+                pid = clone_with_eventfd(SIGCHLD|CLONE_NEWNS|
+                                         (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
+                                         (arg_private_network ? CLONE_NEWNET : 0), eventfds);
                 if (pid < 0) {
                         if (errno == EINVAL)
                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
                         else
                                 log_error("clone() failed: %m");
 
+                        r = pid;
                         goto finish;
                 }
 
@@ -2921,8 +2997,11 @@ int main(int argc, char *argv[]) {
                         /* Tell the parent that we are ready, and that
                          * it can cgroupify us to that we lack access
                          * to certain devices and resources. */
-                        eventfd_write(child_ready_fd, 1);
-                        child_ready_fd = safe_close(child_ready_fd);
+                        r = eventfd_send_state(eventfds[1],
+                                               EVENTFD_CHILD_SUCCEEDED);
+                        eventfds[1] = safe_close(eventfds[1]);
+                        if (r < 0)
+                                goto child_fail;
 
                         if (chdir(arg_directory) < 0) {
                                 log_error("chdir(%s) failed: %m", arg_directory);
@@ -2966,7 +3045,9 @@ int main(int argc, char *argv[]) {
                         }
 
                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
-                                if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
+                                char as_uuid[37];
+
+                                if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
                                         log_oom();
                                         goto child_fail;
                                 }
@@ -3022,8 +3103,10 @@ int main(int argc, char *argv[]) {
                                 env_use = (char**) envp;
 
                         /* Wait until the parent is ready with the setup, too... */
-                        eventfd_read(parent_ready_fd, &x);
-                        parent_ready_fd = safe_close(parent_ready_fd);
+                        r = eventfd_parent_succeeded(eventfds[0]);
+                        eventfds[0] = safe_close(eventfds[0]);
+                        if (r < 0)
+                                goto child_fail;
 
                         if (arg_boot) {
                                 char **a;
@@ -3054,17 +3137,27 @@ int main(int argc, char *argv[]) {
                         log_error("execv() failed: %m");
 
                 child_fail:
+                        /* Tell the parent that the setup failed, so he
+                         * can clean up resources and terminate. */
+                        if (eventfds[1] != -1)
+                                eventfd_send_state(eventfds[1],
+                                                   EVENTFD_CHILD_FAILED);
                         _exit(EXIT_FAILURE);
                 }
 
                 fdset_free(fds);
                 fds = NULL;
 
-                /* Wait until the child reported that it is ready with
-                 * all it needs to do with priviliges. After we got
-                 * the notification we can make the process join its
-                 * cgroup which might limit what it can do */
-                eventfd_read(child_ready_fd, &x);
+                /* Wait for the child event:
+                 * If EVENTFD_CHILD_FAILED, the child will terminate soon.
+                 * If EVENTFD_CHILD_SUCCEEDED, the child is reporting that
+                 * it is ready with all it needs to do with privileges.
+                 * After we got the notification we can make the process
+                 * join its cgroup which might limit what it can do */
+                r = eventfd_child_succeeded(eventfds[1]);
+                eventfds[1] = safe_close(eventfds[1]);
+                if (r < 0)
+                        goto check_container_status;
 
                 r = register_machine(pid);
                 if (r < 0)
@@ -3086,10 +3179,25 @@ int main(int argc, char *argv[]) {
                 if (r < 0)
                         goto finish;
 
+                /* Block SIGCHLD here, before notifying child.
+                 * process_pty() will handle it with the other signals. */
+                r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
+                if (r < 0)
+                        goto finish;
+
+                /* Reset signal to default */
+                r = default_signals(SIGCHLD, -1);
+                if (r < 0)
+                        goto finish;
+
                 /* Notify the child that the parent is ready with all
-                 * its setup, and thtat the child can now hand over
+                 * its setup, and that the child can now hand over
                  * control to the code to run inside the container. */
-                eventfd_write(parent_ready_fd, 1);
+                r = eventfd_send_state(eventfds[0],
+                                       EVENTFD_PARENT_SUCCEEDED);
+                eventfds[0] = safe_close(eventfds[0]);
+                if (r < 0)
+                        goto finish;
 
                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
                 if (k < 0) {
@@ -3103,51 +3211,20 @@ int main(int argc, char *argv[]) {
                 /* Kill if it is not dead yet anyway */
                 terminate_machine(pid);
 
+check_container_status:
                 /* Redundant, but better safe than sorry */
                 kill(pid, SIGKILL);
 
-                k = wait_for_terminate(pid, &status);
+                r = wait_for_container(pid, &container_status);
                 pid = 0;
 
-                if (k < 0) {
+                if (r < 0) {
                         r = EXIT_FAILURE;
                         break;
-                }
-
-                if (status.si_code == CLD_EXITED) {
-                        r = status.si_status;
-                        if (status.si_status != 0) {
-                                log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
-                                break;
-                        }
-
-                        if (!arg_quiet)
-                                log_debug("Container %s exited successfully.", arg_machine);
-                        break;
-                } else if (status.si_code == CLD_KILLED &&
-                           status.si_status == SIGINT) {
-
-                        if (!arg_quiet)
-                                log_info("Container %s has been shut down.", arg_machine);
-                        r = 0;
+                } else if (container_status == CONTAINER_TERMINATED)
                         break;
-                } else if (status.si_code == CLD_KILLED &&
-                           status.si_status == SIGHUP) {
 
-                        if (!arg_quiet)
-                                log_info("Container %s is being rebooted.", arg_machine);
-                        continue;
-                } else if (status.si_code == CLD_KILLED ||
-                           status.si_code == CLD_DUMPED) {
-
-                        log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
-                        r = EXIT_FAILURE;
-                        break;
-                } else {
-                        log_error("Container %s failed due to unknown reason.", arg_machine);
-                        r = EXIT_FAILURE;
-                        break;
-                }
+                /* CONTAINER_REBOOTED, loop again */
         }
 
 finish: