From: Lennart Poettering Date: Wed, 5 Sep 2012 23:23:41 +0000 (-0700) Subject: nspawn: handle poweroff/reboot nicely in containers X-Git-Tag: v190~164 X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~ianmdlvl/git?p=elogind.git;a=commitdiff_plain;h=d87be9b0af81a6e07d4fb3028e45c4409100dc26 nspawn: handle poweroff/reboot nicely in containers --- diff --git a/TODO b/TODO index e683eeaeb..c7f789b9b 100644 --- a/TODO +++ b/TODO @@ -49,15 +49,16 @@ Bugfixes: Features: +* Query Paul Moore about relabelling socket fds while they are open + * log fewer journal internal messages to the kernel kmsg * move keymaps to /usr/lib/... rather than /usr/lib/udev/... * journald: check whether it is OK if the client can still modify delivered journal entries -* json: use yajl -* json: don't add wrapping array, just put entries on one line each -* json: add -o json-pretty in addition to -o json, make the latter output one line per entry +* json: use jensson + * json: properly serialize multiple fields with the same name per entry * journalctl: make -l the default diff --git a/man/systemd-nspawn.xml b/man/systemd-nspawn.xml index 9f8b8e2ae..1f7d74e27 100644 --- a/man/systemd-nspawn.xml +++ b/man/systemd-nspawn.xml @@ -232,7 +232,7 @@ CAP_SETUID, CAP_SYS_ADMIN, CAP_SYS_CHROOT, CAP_SYS_NICE, CAP_SYS_PTRACE, CAP_SYS_TTY_CONFIG, - CAP_SYS_RESOURCE. + CAP_SYS_RESOURCE, CAP_SYS_BOOT. diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 7b1b5eab8..7f084ef2d 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -92,7 +92,8 @@ static uint64_t arg_retain = (1ULL << CAP_SYS_NICE) | (1ULL << CAP_SYS_PTRACE) | (1ULL << CAP_SYS_TTY_CONFIG) | - (1ULL << CAP_SYS_RESOURCE); + (1ULL << CAP_SYS_RESOURCE) | + (1ULL << CAP_SYS_BOOT); static int help(void) { @@ -1167,11 +1168,6 @@ int main(int argc, char *argv[]) { cfmakeraw(&raw_attr); raw_attr.c_lflag &= ~ECHO; - if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) { - log_error("Failed to set terminal attributes: %m"); - goto finish; - } - if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) { log_error("Failed to create kmsg socket pair"); goto finish; @@ -1181,232 +1177,271 @@ int main(int argc, char *argv[]) { sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1); assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0); - pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL); - if (pid < 0) { - if (errno == EINVAL) - log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m"); - else - log_error("clone() failed: %m"); + for (;;) { + siginfo_t status; - goto finish; - } + if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) { + log_error("Failed to set terminal attributes: %m"); + goto finish; + } - if (pid == 0) { - /* child */ - - const char *home = NULL; - uid_t uid = (uid_t) -1; - gid_t gid = (gid_t) -1; - const char *envp[] = { - "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", - "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */ - NULL, /* TERM */ - NULL, /* HOME */ - NULL, /* USER */ - NULL, /* LOGNAME */ - NULL, /* container_uuid */ - NULL - }; - - envp[2] = strv_find_prefix(environ, "TERM="); + pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL); + if (pid < 0) { + if (errno == EINVAL) + log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m"); + else + log_error("clone() failed: %m"); - close_nointr_nofail(master); + goto finish; + } - close_nointr(STDIN_FILENO); - close_nointr(STDOUT_FILENO); - close_nointr(STDERR_FILENO); + if (pid == 0) { + /* child */ - close_all_fds(&kmsg_socket_pair[1], 1); + const char *home = NULL; + uid_t uid = (uid_t) -1; + gid_t gid = (gid_t) -1; + const char *envp[] = { + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */ + NULL, /* TERM */ + NULL, /* HOME */ + NULL, /* USER */ + NULL, /* LOGNAME */ + NULL, /* container_uuid */ + NULL + }; - reset_all_signal_handlers(); + envp[2] = strv_find_prefix(environ, "TERM="); - assert_se(sigemptyset(&mask) == 0); - assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0); + close_nointr_nofail(master); - if (open_terminal(console, O_RDWR) != STDIN_FILENO || - dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO || - dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) - goto child_fail; + close_nointr(STDIN_FILENO); + close_nointr(STDOUT_FILENO); + close_nointr(STDERR_FILENO); - if (setsid() < 0) { - log_error("setsid() failed: %m"); - goto child_fail; - } + close_all_fds(&kmsg_socket_pair[1], 1); - if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) { - log_error("PR_SET_PDEATHSIG failed: %m"); - goto child_fail; - } + reset_all_signal_handlers(); - /* Mark everything as slave, so that we still - * receive mounts from the real root, but don't - * propagate mounts to the real root. */ - if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) { - log_error("MS_SLAVE|MS_REC failed: %m"); - goto child_fail; - } + assert_se(sigemptyset(&mask) == 0); + assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0); - /* Turn directory into bind mount */ - if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) { - log_error("Failed to make bind mount."); - goto child_fail; - } + if (open_terminal(console, O_RDWR) != STDIN_FILENO || + dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO || + dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) + goto child_fail; - if (arg_read_only) - if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) { - log_error("Failed to make read-only."); + if (setsid() < 0) { + log_error("setsid() failed: %m"); goto child_fail; } - if (mount_all(arg_directory) < 0) - goto child_fail; - - if (copy_devnodes(arg_directory) < 0) - goto child_fail; - - dev_setup(arg_directory); - - if (setup_dev_console(arg_directory, console) < 0) - goto child_fail; - - if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0) - goto child_fail; - - close_nointr_nofail(kmsg_socket_pair[1]); + if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) { + log_error("PR_SET_PDEATHSIG failed: %m"); + goto child_fail; + } - if (setup_boot_id(arg_directory) < 0) - goto child_fail; + /* Mark everything as slave, so that we still + * receive mounts from the real root, but don't + * propagate mounts to the real root. */ + if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) { + log_error("MS_SLAVE|MS_REC failed: %m"); + goto child_fail; + } - if (setup_timezone(arg_directory) < 0) - goto child_fail; + /* Turn directory into bind mount */ + if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) { + log_error("Failed to make bind mount."); + goto child_fail; + } - if (setup_resolv_conf(arg_directory) < 0) - goto child_fail; + if (arg_read_only) + if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) { + log_error("Failed to make read-only."); + goto child_fail; + } - if (setup_journal(arg_directory) < 0) - goto child_fail; + if (mount_all(arg_directory) < 0) + goto child_fail; - if (chdir(arg_directory) < 0) { - log_error("chdir(%s) failed: %m", arg_directory); - goto child_fail; - } + if (copy_devnodes(arg_directory) < 0) + goto child_fail; - if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) { - log_error("mount(MS_MOVE) failed: %m"); - goto child_fail; - } + dev_setup(arg_directory); - if (chroot(".") < 0) { - log_error("chroot() failed: %m"); - goto child_fail; - } + if (setup_dev_console(arg_directory, console) < 0) + goto child_fail; - if (chdir("/") < 0) { - log_error("chdir() failed: %m"); - goto child_fail; - } + if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0) + goto child_fail; - umask(0022); + close_nointr_nofail(kmsg_socket_pair[1]); - loopback_setup(); + if (setup_boot_id(arg_directory) < 0) + goto child_fail; - if (drop_capabilities() < 0) { - log_error("drop_capabilities() failed: %m"); - goto child_fail; - } + if (setup_timezone(arg_directory) < 0) + goto child_fail; - if (arg_user) { + if (setup_resolv_conf(arg_directory) < 0) + goto child_fail; - if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) { - log_error("get_user_creds() failed: %m"); + if (setup_journal(arg_directory) < 0) goto child_fail; - } - if (mkdir_parents_label(home, 0775) < 0) { - log_error("mkdir_parents_label() failed: %m"); + if (chdir(arg_directory) < 0) { + log_error("chdir(%s) failed: %m", arg_directory); goto child_fail; } - if (mkdir_safe_label(home, 0775, uid, gid) < 0) { - log_error("mkdir_safe_label() failed: %m"); + if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) { + log_error("mount(MS_MOVE) failed: %m"); goto child_fail; } - if (initgroups((const char*)arg_user, gid) < 0) { - log_error("initgroups() failed: %m"); + if (chroot(".") < 0) { + log_error("chroot() failed: %m"); goto child_fail; } - if (setresgid(gid, gid, gid) < 0) { - log_error("setregid() failed: %m"); + if (chdir("/") < 0) { + log_error("chdir() failed: %m"); goto child_fail; } - if (setresuid(uid, uid, uid) < 0) { - log_error("setreuid() failed: %m"); + umask(0022); + + loopback_setup(); + + if (drop_capabilities() < 0) { + log_error("drop_capabilities() failed: %m"); goto child_fail; } - } - if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) || - (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) || - (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) { - log_oom(); - goto child_fail; - } + if (arg_user) { + + if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) { + log_error("get_user_creds() failed: %m"); + goto child_fail; + } + + if (mkdir_parents_label(home, 0775) < 0) { + log_error("mkdir_parents_label() failed: %m"); + goto child_fail; + } + + if (mkdir_safe_label(home, 0775, uid, gid) < 0) { + log_error("mkdir_safe_label() failed: %m"); + goto child_fail; + } + + if (initgroups((const char*)arg_user, gid) < 0) { + log_error("initgroups() failed: %m"); + goto child_fail; + } - if (arg_uuid) { - if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) { + if (setresgid(gid, gid, gid) < 0) { + log_error("setregid() failed: %m"); + goto child_fail; + } + + if (setresuid(uid, uid, uid) < 0) { + log_error("setreuid() failed: %m"); + goto child_fail; + } + } + + if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) || + (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) || + (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) { log_oom(); goto child_fail; } - } - setup_hostname(); + if (arg_uuid) { + if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) { + log_oom(); + goto child_fail; + } + } + + setup_hostname(); + + if (arg_boot) { + char **a; + size_t l; - if (arg_boot) { - char **a; - size_t l; + /* Automatically search for the init system */ - /* Automatically search for the init system */ + l = 1 + argc - optind; + a = newa(char*, l + 1); + memcpy(a + 1, argv + optind, l * sizeof(char*)); - l = 1 + argc - optind; - a = newa(char*, l + 1); - memcpy(a + 1, argv + optind, l * sizeof(char*)); + a[0] = (char*) "/usr/lib/systemd/systemd"; + execve(a[0], a, (char**) envp); - a[0] = (char*) "/usr/lib/systemd/systemd"; - execve(a[0], a, (char**) envp); + a[0] = (char*) "/lib/systemd/systemd"; + execve(a[0], a, (char**) envp); - a[0] = (char*) "/lib/systemd/systemd"; - execve(a[0], a, (char**) envp); + a[0] = (char*) "/sbin/init"; + execve(a[0], a, (char**) envp); + } else if (argc > optind) + execvpe(argv[optind], argv + optind, (char**) envp); + else { + chdir(home ? home : "/root"); + execle("/bin/bash", "-bash", NULL, (char**) envp); + } + + log_error("execv() failed: %m"); - a[0] = (char*) "/sbin/init"; - execve(a[0], a, (char**) envp); - } else if (argc > optind) - execvpe(argv[optind], argv + optind, (char**) envp); - else { - chdir(home ? home : "/root"); - execle("/bin/bash", "-bash", NULL, (char**) envp); + child_fail: + _exit(EXIT_FAILURE); } - log_error("execv() failed: %m"); + if (process_pty(master, &mask) < 0) + goto finish; - child_fail: - _exit(EXIT_FAILURE); - } - if (process_pty(master, &mask) < 0) - goto finish; + if (saved_attr_valid) + tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr); - if (saved_attr_valid) { - tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr); - saved_attr_valid = false; - } + r = wait_for_terminate(pid, &status); + if (r < 0) { + r = EXIT_FAILURE; + break; + } - r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid); + if (status.si_code == CLD_EXITED) { + if (status.si_status != 0) { + log_error("Container failed with error code %i.", status.si_status); + r = status.si_status; + break; + } + + log_debug("Container exited successfully."); + break; + } else if (status.si_code == CLD_KILLED && + status.si_status == SIGINT) { + log_info("Container has been shut down."); + r = 0; + break; + } else if (status.si_code == CLD_KILLED && + status.si_status == SIGHUP) { + log_info("Container is being rebooted."); + continue; + } else if (status.si_code == CLD_KILLED || + status.si_code == CLD_DUMPED) { - if (r < 0) - r = EXIT_FAILURE; + log_error("Container terminated by signal %s.", signal_to_string(status.si_status)); + r = EXIT_FAILURE; + break; + } else { + log_error("Container failed due to unknown reason."); + r = EXIT_FAILURE; + break; + } + } finish: if (saved_attr_valid) diff --git a/src/shared/util.c b/src/shared/util.c index 95b577be0..4f5cb26e3 100644 --- a/src/shared/util.c +++ b/src/shared/util.c @@ -4011,7 +4011,8 @@ int wait_for_terminate_and_warn(const char *name, pid_t pid) { assert(name); assert(pid > 1); - if ((r = wait_for_terminate(pid, &status)) < 0) { + r = wait_for_terminate(pid, &status); + if (r < 0) { log_warning("Failed to wait for %s: %s", name, strerror(-r)); return r; } @@ -4034,7 +4035,6 @@ int wait_for_terminate_and_warn(const char *name, pid_t pid) { log_warning("%s failed due to unknown reason.", name); return -EPROTO; - } _noreturn_ void freeze(void) {