chiark / gitweb /
nspawn: add --version
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "macro.h"
50 #include "audit.h"
51 #include "missing.h"
52 #include "cgroup-util.h"
53 #include "strv.h"
54 #include "path-util.h"
55 #include "loopback-setup.h"
56 #include "sd-id128.h"
57 #include "dev-setup.h"
58 #include "fdset.h"
59 #include "build.h"
60
/* How the journal directory of the container is tied to the host's
 * /var/log/journal/<machine-id>; selected with --link-journal= or -j. */
typedef enum LinkJournal {
        LINK_NO,        /* "no": leave the journal directories alone */
        LINK_AUTO,      /* "auto": the default mode */
        LINK_HOST,      /* "host": journal lives on the host, bind mounted into the guest */
        LINK_GUEST,     /* "guest": journal lives in the guest, symlinked from the host */
} LinkJournal;
67
68 static char *arg_directory = NULL;
69 static char *arg_user = NULL;
70 static char **arg_controllers = NULL;
71 static char *arg_uuid = NULL;
72 static bool arg_private_network = false;
73 static bool arg_read_only = false;
74 static bool arg_boot = false;
75 static LinkJournal arg_link_journal = LINK_AUTO;
76 static uint64_t arg_retain =
77         (1ULL << CAP_CHOWN) |
78         (1ULL << CAP_DAC_OVERRIDE) |
79         (1ULL << CAP_DAC_READ_SEARCH) |
80         (1ULL << CAP_FOWNER) |
81         (1ULL << CAP_FSETID) |
82         (1ULL << CAP_IPC_OWNER) |
83         (1ULL << CAP_KILL) |
84         (1ULL << CAP_LEASE) |
85         (1ULL << CAP_LINUX_IMMUTABLE) |
86         (1ULL << CAP_NET_BIND_SERVICE) |
87         (1ULL << CAP_NET_BROADCAST) |
88         (1ULL << CAP_NET_RAW) |
89         (1ULL << CAP_SETGID) |
90         (1ULL << CAP_SETFCAP) |
91         (1ULL << CAP_SETPCAP) |
92         (1ULL << CAP_SETUID) |
93         (1ULL << CAP_SYS_ADMIN) |
94         (1ULL << CAP_SYS_CHROOT) |
95         (1ULL << CAP_SYS_NICE) |
96         (1ULL << CAP_SYS_PTRACE) |
97         (1ULL << CAP_SYS_TTY_CONFIG) |
98         (1ULL << CAP_SYS_RESOURCE) |
99         (1ULL << CAP_SYS_BOOT);
100
/* Prints the usage summary to stdout, prefixed with the short name the
 * program was invoked as. Always returns 0.
 *
 * The texts have to stay in sync with parse_argv(): -j selects the
 * "guest" journal link mode there, and --version has no short form, so
 * it is indented like the other long-only options. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help               Show this help\n"
               "     --version            Print version string\n"
               "  -D --directory=NAME     Root directory for the container\n"
               "  -b --boot               Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER          Run the command under specified user or uid\n"
               "  -C --controllers=LIST   Put the container in specified comma-separated cgroup hierarchies\n"
               "     --uuid=UUID          Set a specific machine UUID for the container\n"
               "     --private-network    Disable network in container\n"
               "     --read-only          Mount the root directory read-only\n"
               "     --capability=CAP     In addition to the default, retain specified capability\n"
               "     --link-journal=MODE  Link up guest journal, one of no, auto, guest, host\n"
               "  -j                      Equivalent to --link-journal=guest\n",
               program_invocation_short_name);

        return 0;
}
121
122 static int parse_argv(int argc, char *argv[]) {
123
124         enum {
125                 ARG_VERSION = 0x100,
126                 ARG_PRIVATE_NETWORK,
127                 ARG_UUID,
128                 ARG_READ_ONLY,
129                 ARG_CAPABILITY,
130                 ARG_LINK_JOURNAL
131         };
132
133         static const struct option options[] = {
134                 { "help",            no_argument,       NULL, 'h'                 },
135                 { "version",         no_argument,       NULL, ARG_VERSION         },
136                 { "directory",       required_argument, NULL, 'D'                 },
137                 { "user",            required_argument, NULL, 'u'                 },
138                 { "controllers",     required_argument, NULL, 'C'                 },
139                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
140                 { "boot",            no_argument,       NULL, 'b'                 },
141                 { "uuid",            required_argument, NULL, ARG_UUID            },
142                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
143                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
144                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
145                 { NULL,              0,                 NULL, 0                   }
146         };
147
148         int c;
149
150         assert(argc >= 0);
151         assert(argv);
152
153         while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
154
155                 switch (c) {
156
157                 case 'h':
158                         help();
159                         return 0;
160
161                 case ARG_VERSION:
162                         puts(PACKAGE_STRING);
163                         puts(SYSTEMD_FEATURES);
164                         return 0;
165
166                 case 'D':
167                         free(arg_directory);
168                         arg_directory = canonicalize_file_name(optarg);
169                         if (!arg_directory) {
170                                 log_error("Failed to canonicalize root directory.");
171                                 return -ENOMEM;
172                         }
173
174                         break;
175
176                 case 'u':
177                         free(arg_user);
178                         if (!(arg_user = strdup(optarg))) {
179                                 log_error("Failed to duplicate user name.");
180                                 return -ENOMEM;
181                         }
182
183                         break;
184
185                 case 'C':
186                         strv_free(arg_controllers);
187                         arg_controllers = strv_split(optarg, ",");
188                         if (!arg_controllers) {
189                                 log_error("Failed to split controllers list.");
190                                 return -ENOMEM;
191                         }
192                         strv_uniq(arg_controllers);
193
194                         break;
195
196                 case ARG_PRIVATE_NETWORK:
197                         arg_private_network = true;
198                         break;
199
200                 case 'b':
201                         arg_boot = true;
202                         break;
203
204                 case ARG_UUID:
205                         arg_uuid = optarg;
206                         break;
207
208                 case ARG_READ_ONLY:
209                         arg_read_only = true;
210                         break;
211
212                 case ARG_CAPABILITY: {
213                         char *state, *word;
214                         size_t length;
215
216                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
217                                 cap_value_t cap;
218                                 char *t;
219
220                                 t = strndup(word, length);
221                                 if (!t)
222                                         return log_oom();
223
224                                 if (cap_from_name(t, &cap) < 0) {
225                                         log_error("Failed to parse capability %s.", t);
226                                         free(t);
227                                         return -EINVAL;
228                                 }
229
230                                 free(t);
231                                 arg_retain |= 1ULL << (uint64_t) cap;
232                         }
233
234                         break;
235                 }
236
237                 case 'j':
238                         arg_link_journal = LINK_GUEST;
239                         break;
240
241                 case ARG_LINK_JOURNAL:
242                         if (streq(optarg, "auto"))
243                                 arg_link_journal = LINK_AUTO;
244                         else if (streq(optarg, "no"))
245                                 arg_link_journal = LINK_NO;
246                         else if (streq(optarg, "guest"))
247                                 arg_link_journal = LINK_GUEST;
248                         else if (streq(optarg, "host"))
249                                 arg_link_journal = LINK_HOST;
250                         else {
251                                 log_error("Failed to parse link journal mode %s", optarg);
252                                 return -EINVAL;
253                         }
254
255                         break;
256
257                 case '?':
258                         return -EINVAL;
259
260                 default:
261                         log_error("Unknown option code %c", c);
262                         return -EINVAL;
263                 }
264         }
265
266         return 1;
267 }
268
269 static int mount_all(const char *dest) {
270
271         typedef struct MountPoint {
272                 const char *what;
273                 const char *where;
274                 const char *type;
275                 const char *options;
276                 unsigned long flags;
277                 bool fatal;
278         } MountPoint;
279
280         static const MountPoint mount_table[] = {
281                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
282                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
283                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
284                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
285                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
286                 { "/dev/pts",  "/dev/pts",  NULL,    NULL,       MS_BIND,                      true  },
287                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
288                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
289 #ifdef HAVE_SELINUX
290                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
291                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
292 #endif
293         };
294
295         unsigned k;
296         int r = 0;
297
298         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
299                 char _cleanup_free_ *where = NULL;
300                 int t;
301
302                 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
303                         log_oom();
304
305                         if (r == 0)
306                                 r = -ENOMEM;
307
308                         break;
309                 }
310
311                 t = path_is_mount_point(where, true);
312                 if (t < 0) {
313                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
314
315                         if (r == 0)
316                                 r = t;
317
318                         continue;
319                 }
320
321                 /* Skip this entry if it is not a remount. */
322                 if (mount_table[k].what && t > 0)
323                         continue;
324
325                 mkdir_p_label(where, 0755);
326
327                 if (mount(mount_table[k].what,
328                           where,
329                           mount_table[k].type,
330                           mount_table[k].flags,
331                           mount_table[k].options) < 0 &&
332                     mount_table[k].fatal) {
333
334                         log_error("mount(%s) failed: %m", where);
335
336                         if (r == 0)
337                                 r = -errno;
338                 }
339         }
340
341         return r;
342 }
343
344 static int setup_timezone(const char *dest) {
345         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
346         char *z, *y;
347         int r;
348
349         assert(dest);
350
351         /* Fix the timezone, if possible */
352         r = readlink_malloc("/etc/localtime", &p);
353         if (r < 0) {
354                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
355                 return 0;
356         }
357
358         z = path_startswith(p, "../usr/share/zoneinfo/");
359         if (!z)
360                 z = path_startswith(p, "/usr/share/zoneinfo/");
361         if (!z) {
362                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
363                 return 0;
364         }
365
366         where = strappend(dest, "/etc/localtime");
367         if (!where)
368                 return log_oom();
369
370         r = readlink_malloc(where, &q);
371         if (r >= 0) {
372                 y = path_startswith(q, "../usr/share/zoneinfo/");
373                 if (!y)
374                         y = path_startswith(q, "/usr/share/zoneinfo/");
375
376
377                 /* Already pointing to the right place? Then do nothing .. */
378                 if (y && streq(y, z))
379                         return 0;
380         }
381
382         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
383         if (!check)
384                 return log_oom();
385
386         if (access(check, F_OK) < 0) {
387                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
388                 return 0;
389         }
390
391         what = strappend("../usr/share/zoneinfo/", z);
392         if (!what)
393                 return log_oom();
394
395         unlink(where);
396         if (symlink(what, where) < 0) {
397                 log_error("Failed to correct timezone of container: %m");
398                 return 0;
399         }
400
401         return 0;
402 }
403
404 static int setup_resolv_conf(const char *dest) {
405         char *where;
406
407         assert(dest);
408
409         if (arg_private_network)
410                 return 0;
411
412         /* Fix resolv.conf, if possible */
413         where = strappend(dest, "/etc/resolv.conf");
414         if (!where)
415                 return log_oom();
416
417         /* We don't really care for the results of this really. If it
418          * fails, it fails, but meh... */
419         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
420                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
421
422         free(where);
423
424         return 0;
425 }
426
427 static int setup_boot_id(const char *dest) {
428         char _cleanup_free_ *from = NULL, *to = NULL;
429         sd_id128_t rnd;
430         char as_uuid[37];
431         int r;
432
433         assert(dest);
434
435         /* Generate a new randomized boot ID, so that each boot-up of
436          * the container gets a new one */
437
438         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
439         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
440         if (!from || !to)
441                 return log_oom();
442
443         r = sd_id128_randomize(&rnd);
444         if (r < 0) {
445                 log_error("Failed to generate random boot id: %s", strerror(-r));
446                 return r;
447         }
448
449         snprintf(as_uuid, sizeof(as_uuid),
450                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
451                  SD_ID128_FORMAT_VAL(rnd));
452         char_array_0(as_uuid);
453
454         r = write_one_line_file(from, as_uuid);
455         if (r < 0) {
456                 log_error("Failed to write boot id: %s", strerror(-r));
457                 return r;
458         }
459
460         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
461                 log_error("Failed to bind mount boot id: %m");
462                 r = -errno;
463         } else
464                 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
465
466         unlink(from);
467         return r;
468 }
469
470 static int copy_devnodes(const char *dest) {
471
472         static const char devnodes[] =
473                 "null\0"
474                 "zero\0"
475                 "full\0"
476                 "random\0"
477                 "urandom\0"
478                 "tty\0"
479                 "ptmx\0";
480
481         const char *d;
482         int r = 0;
483         mode_t _cleanup_umask_ u;
484
485         assert(dest);
486
487         u = umask(0000);
488
489         NULSTR_FOREACH(d, devnodes) {
490                 struct stat st;
491                 char _cleanup_free_ *from = NULL, *to = NULL;
492
493                 asprintf(&from, "/dev/%s", d);
494                 asprintf(&to, "%s/dev/%s", dest, d);
495
496                 if (!from || !to) {
497                         log_oom();
498
499                         if (r == 0)
500                                 r = -ENOMEM;
501
502                         break;
503                 }
504
505                 if (stat(from, &st) < 0) {
506
507                         if (errno != ENOENT) {
508                                 log_error("Failed to stat %s: %m", from);
509                                 if (r == 0)
510                                         r = -errno;
511                         }
512
513                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
514
515                         log_error("%s is not a char or block device, cannot copy", from);
516                         if (r == 0)
517                                 r = -EIO;
518
519                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
520
521                         log_error("mknod(%s) failed: %m", dest);
522                         if (r == 0)
523                                 r = -errno;
524                 }
525         }
526
527         return r;
528 }
529
530 static int setup_dev_console(const char *dest, const char *console) {
531         struct stat st;
532         char _cleanup_free_ *to = NULL;
533         int r;
534         mode_t _cleanup_umask_ u;
535
536         assert(dest);
537         assert(console);
538
539         u = umask(0000);
540
541         if (stat(console, &st) < 0) {
542                 log_error("Failed to stat %s: %m", console);
543                 return -errno;
544
545         } else if (!S_ISCHR(st.st_mode)) {
546                 log_error("/dev/console is not a char device");
547                 return -EIO;
548         }
549
550         r = chmod_and_chown(console, 0600, 0, 0);
551         if (r < 0) {
552                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
553                 return r;
554         }
555
556         if (asprintf(&to, "%s/dev/console", dest) < 0)
557                 return log_oom();
558
559         /* We need to bind mount the right tty to /dev/console since
560          * ptys can only exist on pts file systems. To have something
561          * to bind mount things on we create a device node first, that
562          * has the right major/minor (note that the major minor
563          * doesn't actually matter here, since we mount it over
564          * anyway). */
565
566         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
567                 log_error("mknod() for /dev/console failed: %m");
568                 return -errno;
569         }
570
571         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
572                 log_error("Bind mount for /dev/console failed: %m");
573                 return -errno;
574         }
575
576         return 0;
577 }
578
579 static int setup_kmsg(const char *dest, int kmsg_socket) {
580         char _cleanup_free_ *from = NULL, *to = NULL;
581         int r, fd, k;
582         mode_t _cleanup_umask_ u;
583         union {
584                 struct cmsghdr cmsghdr;
585                 uint8_t buf[CMSG_SPACE(sizeof(int))];
586         } control;
587         struct msghdr mh;
588         struct cmsghdr *cmsg;
589
590         assert(dest);
591         assert(kmsg_socket >= 0);
592
593         u = umask(0000);
594
595         /* We create the kmsg FIFO as /dev/kmsg, but immediately
596          * delete it after bind mounting it to /proc/kmsg. While FIFOs
597          * on the reading side behave very similar to /proc/kmsg,
598          * their writing side behaves differently from /dev/kmsg in
599          * that writing blocks when nothing is reading. In order to
600          * avoid any problems with containers deadlocking due to this
601          * we simply make /dev/kmsg unavailable to the container. */
602         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
603             asprintf(&to, "%s/proc/kmsg", dest) < 0)
604                 return log_oom();
605
606         if (mkfifo(from, 0600) < 0) {
607                 log_error("mkfifo() for /dev/kmsg failed: %m");
608                 return -errno;
609         }
610
611         r = chmod_and_chown(from, 0600, 0, 0);
612         if (r < 0) {
613                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
614                 return r;
615         }
616
617         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
618                 log_error("Bind mount for /proc/kmsg failed: %m");
619                 return -errno;
620         }
621
622         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
623         if (fd < 0) {
624                 log_error("Failed to open fifo: %m");
625                 return -errno;
626         }
627
628         zero(mh);
629         zero(control);
630
631         mh.msg_control = &control;
632         mh.msg_controllen = sizeof(control);
633
634         cmsg = CMSG_FIRSTHDR(&mh);
635         cmsg->cmsg_level = SOL_SOCKET;
636         cmsg->cmsg_type = SCM_RIGHTS;
637         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
638         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
639
640         mh.msg_controllen = cmsg->cmsg_len;
641
642         /* Store away the fd in the socket, so that it stays open as
643          * long as we run the child */
644         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
645         close_nointr_nofail(fd);
646
647         if (k < 0) {
648                 log_error("Failed to send FIFO fd: %m");
649                 return -errno;
650         }
651
652         /* And now make the FIFO unavailable as /dev/kmsg... */
653         unlink(from);
654         return 0;
655 }
656
657 static int setup_hostname(void) {
658         char *hn;
659         int r = 0;
660
661         hn = path_get_file_name(arg_directory);
662         if (hn) {
663                 hn = strdup(hn);
664                 if (!hn)
665                         return -ENOMEM;
666
667                 hostname_cleanup(hn);
668
669                 if (!isempty(hn))
670                         if (sethostname(hn, strlen(hn)) < 0)
671                                 r = -errno;
672
673                 free(hn);
674         }
675
676         return r;
677 }
678
679 static int setup_journal(const char *directory) {
680         sd_id128_t machine_id;
681         char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
682         char *id;
683         int r;
684
685         if (arg_link_journal == LINK_NO)
686                 return 0;
687
688         p = strappend(directory, "/etc/machine-id");
689         if (!p)
690                 return log_oom();
691
692         r = read_one_line_file(p, &b);
693         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
694                 return 0;
695         else if (r < 0) {
696                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
697                 return r;
698         }
699
700         id = strstrip(b);
701         if (isempty(id) && arg_link_journal == LINK_AUTO)
702                 return 0;
703
704         /* Verify validity */
705         r = sd_id128_from_string(id, &machine_id);
706         if (r < 0) {
707                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
708                 return r;
709         }
710
711         free(p);
712         p = strappend("/var/log/journal/", id);
713         q = strjoin(directory, "/var/log/journal/", id, NULL);
714         if (!p || !q)
715                 return log_oom();
716
717         if (path_is_mount_point(p, false) > 0) {
718                 if (arg_link_journal != LINK_AUTO) {
719                         log_error("%s: already a mount point, refusing to use for journal", p);
720                         return -EEXIST;
721                 }
722
723                 return 0;
724         }
725
726         if (path_is_mount_point(q, false) > 0) {
727                 if (arg_link_journal != LINK_AUTO) {
728                         log_error("%s: already a mount point, refusing to use for journal", q);
729                         return -EEXIST;
730                 }
731
732                 return 0;
733         }
734
735         r = readlink_and_make_absolute(p, &d);
736         if (r >= 0) {
737                 if ((arg_link_journal == LINK_GUEST ||
738                      arg_link_journal == LINK_AUTO) &&
739                     path_equal(d, q)) {
740
741                         r = mkdir_p(q, 0755);
742                         if (r < 0)
743                                 log_warning("failed to create directory %s: %m", q);
744                         return 0;
745                 }
746
747                 if (unlink(p) < 0) {
748                         log_error("Failed to remove symlink %s: %m", p);
749                         return -errno;
750                 }
751         } else if (r == -EINVAL) {
752
753                 if (arg_link_journal == LINK_GUEST &&
754                     rmdir(p) < 0) {
755
756                         if (errno == ENOTDIR) {
757                                 log_error("%s already exists and is neither a symlink nor a directory", p);
758                                 return r;
759                         } else {
760                                 log_error("Failed to remove %s: %m", p);
761                                 return -errno;
762                         }
763                 }
764         } else if (r != -ENOENT) {
765                 log_error("readlink(%s) failed: %m", p);
766                 return r;
767         }
768
769         if (arg_link_journal == LINK_GUEST) {
770
771                 if (symlink(q, p) < 0) {
772                         log_error("Failed to symlink %s to %s: %m", q, p);
773                         return -errno;
774                 }
775
776                 r = mkdir_p(q, 0755);
777                 if (r < 0)
778                         log_warning("failed to create directory %s: %m", q);
779                 return 0;
780         }
781
782         if (arg_link_journal == LINK_HOST) {
783                 r = mkdir_p(p, 0755);
784                 if (r < 0) {
785                         log_error("Failed to create %s: %m", p);
786                         return r;
787                 }
788
789         } else if (access(p, F_OK) < 0)
790                 return 0;
791
792         if (dir_is_empty(q) == 0) {
793                 log_error("%s not empty.", q);
794                 return -ENOTEMPTY;
795         }
796
797         r = mkdir_p(q, 0755);
798         if (r < 0) {
799                 log_error("Failed to create %s: %m", q);
800                 return r;
801         }
802
803         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
804                 log_error("Failed to bind mount journal from host into guest: %m");
805                 return -errno;
806         }
807
808         return 0;
809 }
810
811 static int drop_capabilities(void) {
812         return capability_bounding_set_drop(~arg_retain, false);
813 }
814
/* Checks whether "path" looks like the root of an OS tree; the existence
 * of <path>/bin/sh is used as the flag for that.
 *
 * Returns 1 if it exists, 0 if it cannot be accessed, and -ENOMEM if the
 * path to check cannot be allocated. */
static int is_os_tree(const char *path) {
        char *shell = NULL;
        int found;

        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found;
}
828
829 static int process_pty(int master, pid_t pid, sigset_t *mask) {
830
831         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
832         size_t in_buffer_full = 0, out_buffer_full = 0;
833         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
834         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
835         int ep = -1, signal_fd = -1, r;
836         bool tried_orderly_shutdown = false;
837
838         assert(master >= 0);
839         assert(pid > 0);
840         assert(mask);
841
842         fd_nonblock(STDIN_FILENO, 1);
843         fd_nonblock(STDOUT_FILENO, 1);
844         fd_nonblock(master, 1);
845
846         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
847         if (signal_fd < 0) {
848                 log_error("signalfd(): %m");
849                 r = -errno;
850                 goto finish;
851         }
852
853         ep = epoll_create1(EPOLL_CLOEXEC);
854         if (ep < 0) {
855                 log_error("Failed to create epoll: %m");
856                 r = -errno;
857                 goto finish;
858         }
859
860         /* We read from STDIN only if this is actually a TTY,
861          * otherwise we assume non-interactivity. */
862         if (isatty(STDIN_FILENO)) {
863                 zero(stdin_ev);
864                 stdin_ev.events = EPOLLIN|EPOLLET;
865                 stdin_ev.data.fd = STDIN_FILENO;
866
867                 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
868                         log_error("Failed to register STDIN in epoll: %m");
869                         r = -errno;
870                         goto finish;
871                 }
872         }
873
874         zero(stdout_ev);
875         stdout_ev.events = EPOLLOUT|EPOLLET;
876         stdout_ev.data.fd = STDOUT_FILENO;
877
878         zero(master_ev);
879         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
880         master_ev.data.fd = master;
881
882         zero(signal_ev);
883         signal_ev.events = EPOLLIN;
884         signal_ev.data.fd = signal_fd;
885
886         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
887             epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
888             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
889                 log_error("Failed to register fds in epoll: %m");
890                 r = -errno;
891                 goto finish;
892         }
893
894         for (;;) {
895                 struct epoll_event ev[16];
896                 ssize_t k;
897                 int i, nfds;
898
899                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
900                 if (nfds < 0) {
901
902                         if (errno == EINTR || errno == EAGAIN)
903                                 continue;
904
905                         log_error("epoll_wait(): %m");
906                         r = -errno;
907                         goto finish;
908                 }
909
910                 assert(nfds >= 1);
911
912                 for (i = 0; i < nfds; i++) {
913                         if (ev[i].data.fd == STDIN_FILENO) {
914
915                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
916                                         stdin_readable = true;
917
918                         } else if (ev[i].data.fd == STDOUT_FILENO) {
919
920                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
921                                         stdout_writable = true;
922
923                         } else if (ev[i].data.fd == master) {
924
925                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
926                                         master_readable = true;
927
928                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
929                                         master_writable = true;
930
931                         } else if (ev[i].data.fd == signal_fd) {
932                                 struct signalfd_siginfo sfsi;
933                                 ssize_t n;
934
935                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
936                                 if (n != sizeof(sfsi)) {
937
938                                         if (n >= 0) {
939                                                 log_error("Failed to read from signalfd: invalid block size");
940                                                 r = -EIO;
941                                                 goto finish;
942                                         }
943
944                                         if (errno != EINTR && errno != EAGAIN) {
945                                                 log_error("Failed to read from signalfd: %m");
946                                                 r = -errno;
947                                                 goto finish;
948                                         }
949                                 } else {
950
951                                         if (sfsi.ssi_signo == SIGWINCH) {
952                                                 struct winsize ws;
953
954                                                 /* The window size changed, let's forward that. */
955                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
956                                                         ioctl(master, TIOCSWINSZ, &ws);
957                                         } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
958
959                                                 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
960
961                                                 /* This only works for systemd... */
962                                                 tried_orderly_shutdown = true;
963                                                 kill(pid, SIGRTMIN+3);
964
965                                         } else {
966                                                 r = 0;
967                                                 goto finish;
968                                         }
969                                 }
970                         }
971                 }
972
973                 while ((stdin_readable && in_buffer_full <= 0) ||
974                        (master_writable && in_buffer_full > 0) ||
975                        (master_readable && out_buffer_full <= 0) ||
976                        (stdout_writable && out_buffer_full > 0)) {
977
978                         if (stdin_readable && in_buffer_full < LINE_MAX) {
979
980                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
981                                 if (k < 0) {
982
983                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
984                                                 stdin_readable = false;
985                                         else {
986                                                 log_error("read(): %m");
987                                                 r = -errno;
988                                                 goto finish;
989                                         }
990                                 } else
991                                         in_buffer_full += (size_t) k;
992                         }
993
994                         if (master_writable && in_buffer_full > 0) {
995
996                                 k = write(master, in_buffer, in_buffer_full);
997                                 if (k < 0) {
998
999                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1000                                                 master_writable = false;
1001                                         else {
1002                                                 log_error("write(): %m");
1003                                                 r = -errno;
1004                                                 goto finish;
1005                                         }
1006
1007                                 } else {
1008                                         assert(in_buffer_full >= (size_t) k);
1009                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1010                                         in_buffer_full -= k;
1011                                 }
1012                         }
1013
1014                         if (master_readable && out_buffer_full < LINE_MAX) {
1015
1016                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1017                                 if (k < 0) {
1018
1019                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1020                                                 master_readable = false;
1021                                         else {
1022                                                 log_error("read(): %m");
1023                                                 r = -errno;
1024                                                 goto finish;
1025                                         }
1026                                 }  else
1027                                         out_buffer_full += (size_t) k;
1028                         }
1029
1030                         if (stdout_writable && out_buffer_full > 0) {
1031
1032                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1033                                 if (k < 0) {
1034
1035                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1036                                                 stdout_writable = false;
1037                                         else {
1038                                                 log_error("write(): %m");
1039                                                 r = -errno;
1040                                                 goto finish;
1041                                         }
1042
1043                                 } else {
1044                                         assert(out_buffer_full >= (size_t) k);
1045                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1046                                         out_buffer_full -= k;
1047                                 }
1048                         }
1049                 }
1050         }
1051
1052 finish:
1053         if (ep >= 0)
1054                 close_nointr_nofail(ep);
1055
1056         if (signal_fd >= 0)
1057                 close_nointr_nofail(signal_fd);
1058
1059         return r;
1060 }
1061
1062 int main(int argc, char *argv[]) {
1063         pid_t pid = 0;
1064         int r = EXIT_FAILURE, k;
1065         char *oldcg = NULL, *newcg = NULL;
1066         char **controller = NULL;
1067         int master = -1, n_fd_passed;
1068         const char *console = NULL;
1069         struct termios saved_attr, raw_attr;
1070         sigset_t mask;
1071         bool saved_attr_valid = false;
1072         struct winsize ws;
1073         int kmsg_socket_pair[2] = { -1, -1 };
1074         FDSet *fds = NULL;
1075
1076         log_parse_environment();
1077         log_open();
1078
1079         r = parse_argv(argc, argv);
1080         if (r <= 0)
1081                 goto finish;
1082
1083         if (arg_directory) {
1084                 char *p;
1085
1086                 p = path_make_absolute_cwd(arg_directory);
1087                 free(arg_directory);
1088                 arg_directory = p;
1089         } else
1090                 arg_directory = get_current_dir_name();
1091
1092         if (!arg_directory) {
1093                 log_error("Failed to determine path");
1094                 goto finish;
1095         }
1096
1097         path_kill_slashes(arg_directory);
1098
1099         if (geteuid() != 0) {
1100                 log_error("Need to be root.");
1101                 goto finish;
1102         }
1103
1104         if (sd_booted() <= 0) {
1105                 log_error("Not running on a systemd system.");
1106                 goto finish;
1107         }
1108
1109         if (path_equal(arg_directory, "/")) {
1110                 log_error("Spawning container on root directory not supported.");
1111                 goto finish;
1112         }
1113
1114         if (is_os_tree(arg_directory) <= 0) {
1115                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1116                 goto finish;
1117         }
1118
1119         log_close();
1120         n_fd_passed = sd_listen_fds(false);
1121         if (n_fd_passed > 0) {
1122                 k = fdset_new_listen_fds(&fds, false);
1123                 if (k < 0) {
1124                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1125                         goto finish;
1126                 }
1127         }
1128         fdset_close_others(fds);
1129         log_open();
1130
1131         k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1132         if (k < 0) {
1133                 log_error("Failed to determine current cgroup: %s", strerror(-k));
1134                 goto finish;
1135         }
1136
1137         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1138                 log_error("Failed to allocate cgroup path.");
1139                 goto finish;
1140         }
1141
1142         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1143         if (k < 0)  {
1144                 log_error("Failed to create cgroup: %s", strerror(-k));
1145                 goto finish;
1146         }
1147
1148         STRV_FOREACH(controller, arg_controllers) {
1149                 k = cg_create_and_attach(*controller, newcg, 0);
1150                 if (k < 0)
1151                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1152         }
1153
1154         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1155         if (master < 0) {
1156                 log_error("Failed to acquire pseudo tty: %m");
1157                 goto finish;
1158         }
1159
1160         console = ptsname(master);
1161         if (!console) {
1162                 log_error("Failed to determine tty name: %m");
1163                 goto finish;
1164         }
1165
1166         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1167
1168         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1169                 ioctl(master, TIOCSWINSZ, &ws);
1170
1171         if (unlockpt(master) < 0) {
1172                 log_error("Failed to unlock tty: %m");
1173                 goto finish;
1174         }
1175
1176         if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1177                 saved_attr_valid = true;
1178
1179                 raw_attr = saved_attr;
1180                 cfmakeraw(&raw_attr);
1181                 raw_attr.c_lflag &= ~ECHO;
1182         }
1183
1184         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1185                 log_error("Failed to create kmsg socket pair");
1186                 goto finish;
1187         }
1188
1189         assert_se(sigemptyset(&mask) == 0);
1190         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1191         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1192
1193         for (;;) {
1194                 siginfo_t status;
1195
1196                 if (saved_attr_valid) {
1197                         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1198                                 log_error("Failed to set terminal attributes: %m");
1199                                 goto finish;
1200                         }
1201                 }
1202
1203                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1204                 if (pid < 0) {
1205                         if (errno == EINVAL)
1206                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1207                         else
1208                                 log_error("clone() failed: %m");
1209
1210                         goto finish;
1211                 }
1212
1213                 if (pid == 0) {
1214                         /* child */
1215
1216                         const char *home = NULL;
1217                         uid_t uid = (uid_t) -1;
1218                         gid_t gid = (gid_t) -1;
1219                         unsigned n_env = 0;
1220                         const char *envp[] = {
1221                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1222                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1223                                 NULL, /* TERM */
1224                                 NULL, /* HOME */
1225                                 NULL, /* USER */
1226                                 NULL, /* LOGNAME */
1227                                 NULL, /* container_uuid */
1228                                 NULL, /* LISTEN_FDS */
1229                                 NULL, /* LISTEN_PID */
1230                                 NULL
1231                         };
1232
1233                         envp[2] = strv_find_prefix(environ, "TERM=");
1234                         n_env = 3;
1235
1236                         close_nointr_nofail(master);
1237                         master = -1;
1238
1239                         close_nointr(STDIN_FILENO);
1240                         close_nointr(STDOUT_FILENO);
1241                         close_nointr(STDERR_FILENO);
1242
1243                         close_nointr_nofail(kmsg_socket_pair[0]);
1244                         kmsg_socket_pair[0] = -1;
1245
1246                         reset_all_signal_handlers();
1247
1248                         assert_se(sigemptyset(&mask) == 0);
1249                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1250
1251                         k = open_terminal(console, O_RDWR);
1252                         if (k != STDIN_FILENO) {
1253                                 if (k >= 0) {
1254                                         close_nointr_nofail(k);
1255                                         k = -EINVAL;
1256                                 }
1257
1258                                 log_error("Failed to open console: %s", strerror(-k));
1259                                 goto child_fail;
1260                         }
1261
1262                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1263                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1264                                 log_error("Failed to duplicate console: %m");
1265                                 goto child_fail;
1266                         }
1267
1268                         if (setsid() < 0) {
1269                                 log_error("setsid() failed: %m");
1270                                 goto child_fail;
1271                         }
1272
1273                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1274                                 log_error("PR_SET_PDEATHSIG failed: %m");
1275                                 goto child_fail;
1276                         }
1277
1278                         /* Mark everything as slave, so that we still
1279                          * receive mounts from the real root, but don't
1280                          * propagate mounts to the real root. */
1281                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1282                                 log_error("MS_SLAVE|MS_REC failed: %m");
1283                                 goto child_fail;
1284                         }
1285
1286                         /* Turn directory into bind mount */
1287                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1288                                 log_error("Failed to make bind mount.");
1289                                 goto child_fail;
1290                         }
1291
1292                         if (arg_read_only)
1293                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1294                                         log_error("Failed to make read-only.");
1295                                         goto child_fail;
1296                                 }
1297
1298                         if (mount_all(arg_directory) < 0)
1299                                 goto child_fail;
1300
1301                         if (copy_devnodes(arg_directory) < 0)
1302                                 goto child_fail;
1303
1304                         dev_setup(arg_directory);
1305
1306                         if (setup_dev_console(arg_directory, console) < 0)
1307                                 goto child_fail;
1308
1309                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1310                                 goto child_fail;
1311
1312                         close_nointr_nofail(kmsg_socket_pair[1]);
1313                         kmsg_socket_pair[1] = -1;
1314
1315                         if (setup_boot_id(arg_directory) < 0)
1316                                 goto child_fail;
1317
1318                         if (setup_timezone(arg_directory) < 0)
1319                                 goto child_fail;
1320
1321                         if (setup_resolv_conf(arg_directory) < 0)
1322                                 goto child_fail;
1323
1324                         if (setup_journal(arg_directory) < 0)
1325                                 goto child_fail;
1326
1327                         if (chdir(arg_directory) < 0) {
1328                                 log_error("chdir(%s) failed: %m", arg_directory);
1329                                 goto child_fail;
1330                         }
1331
1332                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1333                                 log_error("mount(MS_MOVE) failed: %m");
1334                                 goto child_fail;
1335                         }
1336
1337                         if (chroot(".") < 0) {
1338                                 log_error("chroot() failed: %m");
1339                                 goto child_fail;
1340                         }
1341
1342                         if (chdir("/") < 0) {
1343                                 log_error("chdir() failed: %m");
1344                                 goto child_fail;
1345                         }
1346
1347                         umask(0022);
1348
1349                         loopback_setup();
1350
1351                         if (drop_capabilities() < 0) {
1352                                 log_error("drop_capabilities() failed: %m");
1353                                 goto child_fail;
1354                         }
1355
1356                         if (arg_user) {
1357
1358                                 /* Note that this resolves user names
1359                                  * inside the container, and hence
1360                                  * accesses the NSS modules from the
1361                                  * container and not the host. This is
1362                                  * a bit weird... */
1363
1364                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1365                                         log_error("get_user_creds() failed: %m");
1366                                         goto child_fail;
1367                                 }
1368
1369                                 if (mkdir_parents_label(home, 0775) < 0) {
1370                                         log_error("mkdir_parents_label() failed: %m");
1371                                         goto child_fail;
1372                                 }
1373
1374                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1375                                         log_error("mkdir_safe_label() failed: %m");
1376                                         goto child_fail;
1377                                 }
1378
1379                                 if (initgroups((const char*)arg_user, gid) < 0) {
1380                                         log_error("initgroups() failed: %m");
1381                                         goto child_fail;
1382                                 }
1383
1384                                 if (setresgid(gid, gid, gid) < 0) {
1385                                         log_error("setregid() failed: %m");
1386                                         goto child_fail;
1387                                 }
1388
1389                                 if (setresuid(uid, uid, uid) < 0) {
1390                                         log_error("setreuid() failed: %m");
1391                                         goto child_fail;
1392                                 }
1393                         } else {
1394                                 /* Reset everything fully to 0, just in case */
1395
1396                                 if (setgroups(0, NULL) < 0) {
1397                                         log_error("setgroups() failed: %m");
1398                                         goto child_fail;
1399                                 }
1400
1401                                 if (setresgid(0, 0, 0) < 0) {
1402                                         log_error("setregid() failed: %m");
1403                                         goto child_fail;
1404                                 }
1405
1406                                 if (setresuid(0, 0, 0) < 0) {
1407                                         log_error("setreuid() failed: %m");
1408                                         goto child_fail;
1409                                 }
1410                         }
1411
1412                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1413                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1414                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1415                                 log_oom();
1416                                 goto child_fail;
1417                         }
1418
1419                         if (arg_uuid) {
1420                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1421                                         log_oom();
1422                                         goto child_fail;
1423                                 }
1424                         }
1425
1426                         if (fdset_size(fds) > 0) {
1427                                 k = fdset_cloexec(fds, false);
1428                                 if (k < 0) {
1429                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1430                                         goto child_fail;
1431                                 }
1432
1433                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1434                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) getpid()) < 0)) {
1435                                         log_oom();
1436                                         goto child_fail;
1437                                 }
1438                         }
1439
1440                         setup_hostname();
1441
1442                         if (arg_boot) {
1443                                 char **a;
1444                                 size_t l;
1445
1446                                 /* Automatically search for the init system */
1447
1448                                 l = 1 + argc - optind;
1449                                 a = newa(char*, l + 1);
1450                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1451
1452                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1453                                 execve(a[0], a, (char**) envp);
1454
1455                                 a[0] = (char*) "/lib/systemd/systemd";
1456                                 execve(a[0], a, (char**) envp);
1457
1458                                 a[0] = (char*) "/sbin/init";
1459                                 execve(a[0], a, (char**) envp);
1460                         } else if (argc > optind)
1461                                 execvpe(argv[optind], argv + optind, (char**) envp);
1462                         else {
1463                                 chdir(home ? home : "/root");
1464                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1465                         }
1466
1467                         log_error("execv() failed: %m");
1468
1469                 child_fail:
1470                         _exit(EXIT_FAILURE);
1471                 }
1472
1473                 fdset_free(fds);
1474                 fds = NULL;
1475
1476                 if (process_pty(master, pid, &mask) < 0)
1477                         goto finish;
1478
1479                 if (saved_attr_valid)
1480                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1481
1482                 r = wait_for_terminate(pid, &status);
1483                 if (r < 0) {
1484                         r = EXIT_FAILURE;
1485                         break;
1486                 }
1487
1488                 if (status.si_code == CLD_EXITED) {
1489                         if (status.si_status != 0) {
1490                                 log_error("Container failed with error code %i.", status.si_status);
1491                                 r = status.si_status;
1492                                 break;
1493                         }
1494
1495                         log_debug("Container exited successfully.");
1496                         break;
1497                 } else if (status.si_code == CLD_KILLED &&
1498                            status.si_status == SIGINT) {
1499                         log_info("Container has been shut down.");
1500                         r = 0;
1501                         break;
1502                 } else if (status.si_code == CLD_KILLED &&
1503                            status.si_status == SIGHUP) {
1504                         log_info("Container is being rebooted.");
1505                         continue;
1506                 } else if (status.si_code == CLD_KILLED ||
1507                            status.si_code == CLD_DUMPED) {
1508
1509                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1510                         r = EXIT_FAILURE;
1511                         break;
1512                 } else {
1513                         log_error("Container failed due to unknown reason.");
1514                         r = EXIT_FAILURE;
1515                         break;
1516                 }
1517         }
1518
1519 finish:
1520         if (saved_attr_valid)
1521                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1522
1523         if (master >= 0)
1524                 close_nointr_nofail(master);
1525
1526         close_pipe(kmsg_socket_pair);
1527
1528         if (oldcg)
1529                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1530
1531         if (newcg)
1532                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1533
1534         free(arg_directory);
1535         strv_free(arg_controllers);
1536         free(oldcg);
1537         free(newcg);
1538
1539         fdset_free(fds);
1540
1541         return r;
1542 }