chiark / gitweb /
nspawn: assume stdout is always writable if it does not support epoll
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "macro.h"
50 #include "audit.h"
51 #include "missing.h"
52 #include "cgroup-util.h"
53 #include "strv.h"
54 #include "path-util.h"
55 #include "loopback-setup.h"
56 #include "sd-id128.h"
57 #include "dev-setup.h"
58 #include "fdset.h"
59 #include "build.h"
60
/* How the journal directory of the container is connected to the host,
 * see setup_journal() for the actual handling. */
typedef enum LinkJournal {
        LINK_NO,        /* leave the journal alone */
        LINK_AUTO,      /* link up only if the prerequisites happen to exist */
        LINK_HOST,      /* journal lives on the host and is bind mounted into the guest */
        LINK_GUEST,     /* journal lives in the guest, host gets a symlink to it */
} LinkJournal;

/* Settings filled in by parse_argv() */
static char *arg_directory = NULL;              /* -D, canonicalized root directory */
static char *arg_user = NULL;                   /* -u */
static char **arg_controllers = NULL;           /* -C, split at commas */
static char *arg_uuid = NULL;                   /* --uuid, points into argv[] */
static bool arg_private_network = false;        /* --private-network */
static bool arg_read_only = false;              /* --read-only */
static bool arg_boot = false;                   /* -b */
static LinkJournal arg_link_journal = LINK_AUTO;

/* Bit mask of the capabilities that are retained for the container by
 * default; --capability= may add further bits. */
#define NSPAWN_CAP_BIT(cap) (1ULL << (cap))
static uint64_t arg_retain =
        NSPAWN_CAP_BIT(CAP_CHOWN) |
        NSPAWN_CAP_BIT(CAP_DAC_OVERRIDE) |
        NSPAWN_CAP_BIT(CAP_DAC_READ_SEARCH) |
        NSPAWN_CAP_BIT(CAP_FOWNER) |
        NSPAWN_CAP_BIT(CAP_FSETID) |
        NSPAWN_CAP_BIT(CAP_IPC_OWNER) |
        NSPAWN_CAP_BIT(CAP_KILL) |
        NSPAWN_CAP_BIT(CAP_LEASE) |
        NSPAWN_CAP_BIT(CAP_LINUX_IMMUTABLE) |
        NSPAWN_CAP_BIT(CAP_NET_BIND_SERVICE) |
        NSPAWN_CAP_BIT(CAP_NET_BROADCAST) |
        NSPAWN_CAP_BIT(CAP_NET_RAW) |
        NSPAWN_CAP_BIT(CAP_SETGID) |
        NSPAWN_CAP_BIT(CAP_SETFCAP) |
        NSPAWN_CAP_BIT(CAP_SETPCAP) |
        NSPAWN_CAP_BIT(CAP_SETUID) |
        NSPAWN_CAP_BIT(CAP_SYS_ADMIN) |
        NSPAWN_CAP_BIT(CAP_SYS_CHROOT) |
        NSPAWN_CAP_BIT(CAP_SYS_NICE) |
        NSPAWN_CAP_BIT(CAP_SYS_PTRACE) |
        NSPAWN_CAP_BIT(CAP_SYS_TTY_CONFIG) |
        NSPAWN_CAP_BIT(CAP_SYS_RESOURCE) |
        NSPAWN_CAP_BIT(CAP_SYS_BOOT) |
        NSPAWN_CAP_BIT(CAP_AUDIT_WRITE) |
        NSPAWN_CAP_BIT(CAP_AUDIT_CONTROL);
#undef NSPAWN_CAP_BIT
102
/* Prints the usage summary to stdout. Always returns 0. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help               Show this help\n"
               "  --version               Print version string\n"
               "  -D --directory=NAME     Root directory for the container\n"
               "  -b --boot               Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER          Run the command under specified user or uid\n"
               "  -C --controllers=LIST   Put the container in specified comma-separated cgroup hierarchies\n"
               "     --uuid=UUID          Set a specific machine UUID for the container\n"
               "     --private-network    Disable network in container\n"
               "     --read-only          Mount the root directory read-only\n"
               "     --capability=CAP     In addition to the default, retain specified capability\n"
               "     --link-journal=MODE  Link up guest journal, one of no, auto, guest, host\n"
               /* parse_argv() maps -j to LINK_GUEST, so that is what we document */
               "  -j                      Equivalent to --link-journal=guest\n",
               program_invocation_short_name);

        return 0;
}
123
124 static int parse_argv(int argc, char *argv[]) {
125
126         enum {
127                 ARG_VERSION = 0x100,
128                 ARG_PRIVATE_NETWORK,
129                 ARG_UUID,
130                 ARG_READ_ONLY,
131                 ARG_CAPABILITY,
132                 ARG_LINK_JOURNAL
133         };
134
135         static const struct option options[] = {
136                 { "help",            no_argument,       NULL, 'h'                 },
137                 { "version",         no_argument,       NULL, ARG_VERSION         },
138                 { "directory",       required_argument, NULL, 'D'                 },
139                 { "user",            required_argument, NULL, 'u'                 },
140                 { "controllers",     required_argument, NULL, 'C'                 },
141                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
142                 { "boot",            no_argument,       NULL, 'b'                 },
143                 { "uuid",            required_argument, NULL, ARG_UUID            },
144                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
145                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
146                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
147                 { NULL,              0,                 NULL, 0                   }
148         };
149
150         int c;
151
152         assert(argc >= 0);
153         assert(argv);
154
155         while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
156
157                 switch (c) {
158
159                 case 'h':
160                         help();
161                         return 0;
162
163                 case ARG_VERSION:
164                         puts(PACKAGE_STRING);
165                         puts(SYSTEMD_FEATURES);
166                         return 0;
167
168                 case 'D':
169                         free(arg_directory);
170                         arg_directory = canonicalize_file_name(optarg);
171                         if (!arg_directory) {
172                                 log_error("Failed to canonicalize root directory.");
173                                 return -ENOMEM;
174                         }
175
176                         break;
177
178                 case 'u':
179                         free(arg_user);
180                         if (!(arg_user = strdup(optarg))) {
181                                 log_error("Failed to duplicate user name.");
182                                 return -ENOMEM;
183                         }
184
185                         break;
186
187                 case 'C':
188                         strv_free(arg_controllers);
189                         arg_controllers = strv_split(optarg, ",");
190                         if (!arg_controllers) {
191                                 log_error("Failed to split controllers list.");
192                                 return -ENOMEM;
193                         }
194                         strv_uniq(arg_controllers);
195
196                         break;
197
198                 case ARG_PRIVATE_NETWORK:
199                         arg_private_network = true;
200                         break;
201
202                 case 'b':
203                         arg_boot = true;
204                         break;
205
206                 case ARG_UUID:
207                         arg_uuid = optarg;
208                         break;
209
210                 case ARG_READ_ONLY:
211                         arg_read_only = true;
212                         break;
213
214                 case ARG_CAPABILITY: {
215                         char *state, *word;
216                         size_t length;
217
218                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
219                                 cap_value_t cap;
220                                 char *t;
221
222                                 t = strndup(word, length);
223                                 if (!t)
224                                         return log_oom();
225
226                                 if (cap_from_name(t, &cap) < 0) {
227                                         log_error("Failed to parse capability %s.", t);
228                                         free(t);
229                                         return -EINVAL;
230                                 }
231
232                                 free(t);
233                                 arg_retain |= 1ULL << (uint64_t) cap;
234                         }
235
236                         break;
237                 }
238
239                 case 'j':
240                         arg_link_journal = LINK_GUEST;
241                         break;
242
243                 case ARG_LINK_JOURNAL:
244                         if (streq(optarg, "auto"))
245                                 arg_link_journal = LINK_AUTO;
246                         else if (streq(optarg, "no"))
247                                 arg_link_journal = LINK_NO;
248                         else if (streq(optarg, "guest"))
249                                 arg_link_journal = LINK_GUEST;
250                         else if (streq(optarg, "host"))
251                                 arg_link_journal = LINK_HOST;
252                         else {
253                                 log_error("Failed to parse link journal mode %s", optarg);
254                                 return -EINVAL;
255                         }
256
257                         break;
258
259                 case '?':
260                         return -EINVAL;
261
262                 default:
263                         log_error("Unknown option code %c", c);
264                         return -EINVAL;
265                 }
266         }
267
268         return 1;
269 }
270
271 static int mount_all(const char *dest) {
272
273         typedef struct MountPoint {
274                 const char *what;
275                 const char *where;
276                 const char *type;
277                 const char *options;
278                 unsigned long flags;
279                 bool fatal;
280         } MountPoint;
281
282         static const MountPoint mount_table[] = {
283                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
284                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
285                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
286                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
287                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
288                 { "/dev/pts",  "/dev/pts",  NULL,    NULL,       MS_BIND,                      true  },
289                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
290                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
291 #ifdef HAVE_SELINUX
292                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
293                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
294 #endif
295         };
296
297         unsigned k;
298         int r = 0;
299
300         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
301                 char _cleanup_free_ *where = NULL;
302                 int t;
303
304                 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
305                         log_oom();
306
307                         if (r == 0)
308                                 r = -ENOMEM;
309
310                         break;
311                 }
312
313                 t = path_is_mount_point(where, true);
314                 if (t < 0) {
315                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
316
317                         if (r == 0)
318                                 r = t;
319
320                         continue;
321                 }
322
323                 /* Skip this entry if it is not a remount. */
324                 if (mount_table[k].what && t > 0)
325                         continue;
326
327                 mkdir_p_label(where, 0755);
328
329                 if (mount(mount_table[k].what,
330                           where,
331                           mount_table[k].type,
332                           mount_table[k].flags,
333                           mount_table[k].options) < 0 &&
334                     mount_table[k].fatal) {
335
336                         log_error("mount(%s) failed: %m", where);
337
338                         if (r == 0)
339                                 r = -errno;
340                 }
341         }
342
343         return r;
344 }
345
346 static int setup_timezone(const char *dest) {
347         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
348         char *z, *y;
349         int r;
350
351         assert(dest);
352
353         /* Fix the timezone, if possible */
354         r = readlink_malloc("/etc/localtime", &p);
355         if (r < 0) {
356                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
357                 return 0;
358         }
359
360         z = path_startswith(p, "../usr/share/zoneinfo/");
361         if (!z)
362                 z = path_startswith(p, "/usr/share/zoneinfo/");
363         if (!z) {
364                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
365                 return 0;
366         }
367
368         where = strappend(dest, "/etc/localtime");
369         if (!where)
370                 return log_oom();
371
372         r = readlink_malloc(where, &q);
373         if (r >= 0) {
374                 y = path_startswith(q, "../usr/share/zoneinfo/");
375                 if (!y)
376                         y = path_startswith(q, "/usr/share/zoneinfo/");
377
378
379                 /* Already pointing to the right place? Then do nothing .. */
380                 if (y && streq(y, z))
381                         return 0;
382         }
383
384         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
385         if (!check)
386                 return log_oom();
387
388         if (access(check, F_OK) < 0) {
389                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
390                 return 0;
391         }
392
393         what = strappend("../usr/share/zoneinfo/", z);
394         if (!what)
395                 return log_oom();
396
397         unlink(where);
398         if (symlink(what, where) < 0) {
399                 log_error("Failed to correct timezone of container: %m");
400                 return 0;
401         }
402
403         return 0;
404 }
405
406 static int setup_resolv_conf(const char *dest) {
407         char *where;
408
409         assert(dest);
410
411         if (arg_private_network)
412                 return 0;
413
414         /* Fix resolv.conf, if possible */
415         where = strappend(dest, "/etc/resolv.conf");
416         if (!where)
417                 return log_oom();
418
419         /* We don't really care for the results of this really. If it
420          * fails, it fails, but meh... */
421         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
422                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
423
424         free(where);
425
426         return 0;
427 }
428
429 static int setup_boot_id(const char *dest) {
430         char _cleanup_free_ *from = NULL, *to = NULL;
431         sd_id128_t rnd;
432         char as_uuid[37];
433         int r;
434
435         assert(dest);
436
437         /* Generate a new randomized boot ID, so that each boot-up of
438          * the container gets a new one */
439
440         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
441         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
442         if (!from || !to)
443                 return log_oom();
444
445         r = sd_id128_randomize(&rnd);
446         if (r < 0) {
447                 log_error("Failed to generate random boot id: %s", strerror(-r));
448                 return r;
449         }
450
451         snprintf(as_uuid, sizeof(as_uuid),
452                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
453                  SD_ID128_FORMAT_VAL(rnd));
454         char_array_0(as_uuid);
455
456         r = write_one_line_file(from, as_uuid);
457         if (r < 0) {
458                 log_error("Failed to write boot id: %s", strerror(-r));
459                 return r;
460         }
461
462         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
463                 log_error("Failed to bind mount boot id: %m");
464                 r = -errno;
465         } else
466                 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
467
468         unlink(from);
469         return r;
470 }
471
472 static int copy_devnodes(const char *dest) {
473
474         static const char devnodes[] =
475                 "null\0"
476                 "zero\0"
477                 "full\0"
478                 "random\0"
479                 "urandom\0"
480                 "tty\0"
481                 "ptmx\0";
482
483         const char *d;
484         int r = 0;
485         mode_t _cleanup_umask_ u;
486
487         assert(dest);
488
489         u = umask(0000);
490
491         NULSTR_FOREACH(d, devnodes) {
492                 struct stat st;
493                 char _cleanup_free_ *from = NULL, *to = NULL;
494
495                 asprintf(&from, "/dev/%s", d);
496                 asprintf(&to, "%s/dev/%s", dest, d);
497
498                 if (!from || !to) {
499                         log_oom();
500
501                         if (r == 0)
502                                 r = -ENOMEM;
503
504                         break;
505                 }
506
507                 if (stat(from, &st) < 0) {
508
509                         if (errno != ENOENT) {
510                                 log_error("Failed to stat %s: %m", from);
511                                 if (r == 0)
512                                         r = -errno;
513                         }
514
515                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
516
517                         log_error("%s is not a char or block device, cannot copy", from);
518                         if (r == 0)
519                                 r = -EIO;
520
521                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
522
523                         log_error("mknod(%s) failed: %m", dest);
524                         if (r == 0)
525                                 r = -errno;
526                 }
527         }
528
529         return r;
530 }
531
532 static int setup_dev_console(const char *dest, const char *console) {
533         struct stat st;
534         char _cleanup_free_ *to = NULL;
535         int r;
536         mode_t _cleanup_umask_ u;
537
538         assert(dest);
539         assert(console);
540
541         u = umask(0000);
542
543         if (stat(console, &st) < 0) {
544                 log_error("Failed to stat %s: %m", console);
545                 return -errno;
546
547         } else if (!S_ISCHR(st.st_mode)) {
548                 log_error("/dev/console is not a char device");
549                 return -EIO;
550         }
551
552         r = chmod_and_chown(console, 0600, 0, 0);
553         if (r < 0) {
554                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
555                 return r;
556         }
557
558         if (asprintf(&to, "%s/dev/console", dest) < 0)
559                 return log_oom();
560
561         /* We need to bind mount the right tty to /dev/console since
562          * ptys can only exist on pts file systems. To have something
563          * to bind mount things on we create a device node first, that
564          * has the right major/minor (note that the major minor
565          * doesn't actually matter here, since we mount it over
566          * anyway). */
567
568         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
569                 log_error("mknod() for /dev/console failed: %m");
570                 return -errno;
571         }
572
573         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
574                 log_error("Bind mount for /dev/console failed: %m");
575                 return -errno;
576         }
577
578         return 0;
579 }
580
581 static int setup_kmsg(const char *dest, int kmsg_socket) {
582         char _cleanup_free_ *from = NULL, *to = NULL;
583         int r, fd, k;
584         mode_t _cleanup_umask_ u;
585         union {
586                 struct cmsghdr cmsghdr;
587                 uint8_t buf[CMSG_SPACE(sizeof(int))];
588         } control;
589         struct msghdr mh;
590         struct cmsghdr *cmsg;
591
592         assert(dest);
593         assert(kmsg_socket >= 0);
594
595         u = umask(0000);
596
597         /* We create the kmsg FIFO as /dev/kmsg, but immediately
598          * delete it after bind mounting it to /proc/kmsg. While FIFOs
599          * on the reading side behave very similar to /proc/kmsg,
600          * their writing side behaves differently from /dev/kmsg in
601          * that writing blocks when nothing is reading. In order to
602          * avoid any problems with containers deadlocking due to this
603          * we simply make /dev/kmsg unavailable to the container. */
604         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
605             asprintf(&to, "%s/proc/kmsg", dest) < 0)
606                 return log_oom();
607
608         if (mkfifo(from, 0600) < 0) {
609                 log_error("mkfifo() for /dev/kmsg failed: %m");
610                 return -errno;
611         }
612
613         r = chmod_and_chown(from, 0600, 0, 0);
614         if (r < 0) {
615                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
616                 return r;
617         }
618
619         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
620                 log_error("Bind mount for /proc/kmsg failed: %m");
621                 return -errno;
622         }
623
624         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
625         if (fd < 0) {
626                 log_error("Failed to open fifo: %m");
627                 return -errno;
628         }
629
630         zero(mh);
631         zero(control);
632
633         mh.msg_control = &control;
634         mh.msg_controllen = sizeof(control);
635
636         cmsg = CMSG_FIRSTHDR(&mh);
637         cmsg->cmsg_level = SOL_SOCKET;
638         cmsg->cmsg_type = SCM_RIGHTS;
639         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
640         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
641
642         mh.msg_controllen = cmsg->cmsg_len;
643
644         /* Store away the fd in the socket, so that it stays open as
645          * long as we run the child */
646         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
647         close_nointr_nofail(fd);
648
649         if (k < 0) {
650                 log_error("Failed to send FIFO fd: %m");
651                 return -errno;
652         }
653
654         /* And now make the FIFO unavailable as /dev/kmsg... */
655         unlink(from);
656         return 0;
657 }
658
659 static int setup_hostname(void) {
660         char *hn;
661         int r = 0;
662
663         hn = path_get_file_name(arg_directory);
664         if (hn) {
665                 hn = strdup(hn);
666                 if (!hn)
667                         return -ENOMEM;
668
669                 hostname_cleanup(hn);
670
671                 if (!isempty(hn))
672                         if (sethostname(hn, strlen(hn)) < 0)
673                                 r = -errno;
674
675                 free(hn);
676         }
677
678         return r;
679 }
680
681 static int setup_journal(const char *directory) {
682         sd_id128_t machine_id;
683         char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
684         char *id;
685         int r;
686
687         if (arg_link_journal == LINK_NO)
688                 return 0;
689
690         p = strappend(directory, "/etc/machine-id");
691         if (!p)
692                 return log_oom();
693
694         r = read_one_line_file(p, &b);
695         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
696                 return 0;
697         else if (r < 0) {
698                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
699                 return r;
700         }
701
702         id = strstrip(b);
703         if (isempty(id) && arg_link_journal == LINK_AUTO)
704                 return 0;
705
706         /* Verify validity */
707         r = sd_id128_from_string(id, &machine_id);
708         if (r < 0) {
709                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
710                 return r;
711         }
712
713         free(p);
714         p = strappend("/var/log/journal/", id);
715         q = strjoin(directory, "/var/log/journal/", id, NULL);
716         if (!p || !q)
717                 return log_oom();
718
719         if (path_is_mount_point(p, false) > 0) {
720                 if (arg_link_journal != LINK_AUTO) {
721                         log_error("%s: already a mount point, refusing to use for journal", p);
722                         return -EEXIST;
723                 }
724
725                 return 0;
726         }
727
728         if (path_is_mount_point(q, false) > 0) {
729                 if (arg_link_journal != LINK_AUTO) {
730                         log_error("%s: already a mount point, refusing to use for journal", q);
731                         return -EEXIST;
732                 }
733
734                 return 0;
735         }
736
737         r = readlink_and_make_absolute(p, &d);
738         if (r >= 0) {
739                 if ((arg_link_journal == LINK_GUEST ||
740                      arg_link_journal == LINK_AUTO) &&
741                     path_equal(d, q)) {
742
743                         r = mkdir_p(q, 0755);
744                         if (r < 0)
745                                 log_warning("failed to create directory %s: %m", q);
746                         return 0;
747                 }
748
749                 if (unlink(p) < 0) {
750                         log_error("Failed to remove symlink %s: %m", p);
751                         return -errno;
752                 }
753         } else if (r == -EINVAL) {
754
755                 if (arg_link_journal == LINK_GUEST &&
756                     rmdir(p) < 0) {
757
758                         if (errno == ENOTDIR) {
759                                 log_error("%s already exists and is neither a symlink nor a directory", p);
760                                 return r;
761                         } else {
762                                 log_error("Failed to remove %s: %m", p);
763                                 return -errno;
764                         }
765                 }
766         } else if (r != -ENOENT) {
767                 log_error("readlink(%s) failed: %m", p);
768                 return r;
769         }
770
771         if (arg_link_journal == LINK_GUEST) {
772
773                 if (symlink(q, p) < 0) {
774                         log_error("Failed to symlink %s to %s: %m", q, p);
775                         return -errno;
776                 }
777
778                 r = mkdir_p(q, 0755);
779                 if (r < 0)
780                         log_warning("failed to create directory %s: %m", q);
781                 return 0;
782         }
783
784         if (arg_link_journal == LINK_HOST) {
785                 r = mkdir_p(p, 0755);
786                 if (r < 0) {
787                         log_error("Failed to create %s: %m", p);
788                         return r;
789                 }
790
791         } else if (access(p, F_OK) < 0)
792                 return 0;
793
794         if (dir_is_empty(q) == 0) {
795                 log_error("%s not empty.", q);
796                 return -ENOTEMPTY;
797         }
798
799         r = mkdir_p(q, 0755);
800         if (r < 0) {
801                 log_error("Failed to create %s: %m", q);
802                 return r;
803         }
804
805         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
806                 log_error("Failed to bind mount journal from host into guest: %m");
807                 return -errno;
808         }
809
810         return 0;
811 }
812
813 static int drop_capabilities(void) {
814         return capability_bounding_set_drop(~arg_retain, false);
815 }
816
/* Checks whether "path" looks like the root of an OS tree. The existence
 * of <path>/bin/sh is used as the indicator.
 *
 * Returns 1 if it exists, 0 if it cannot be accessed, and -ENOMEM if the
 * path to probe could not be allocated. */
static int is_os_tree(const char *path) {
        char *marker;
        int found;

        if (asprintf(&marker, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(marker, F_OK) >= 0;
        free(marker);

        return found ? 1 : 0;
}
830
831 static int process_pty(int master, pid_t pid, sigset_t *mask) {
832
833         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
834         size_t in_buffer_full = 0, out_buffer_full = 0;
835         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
836         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
837         int ep = -1, signal_fd = -1, r;
838         bool tried_orderly_shutdown = false;
839
840         assert(master >= 0);
841         assert(pid > 0);
842         assert(mask);
843
844         fd_nonblock(STDIN_FILENO, 1);
845         fd_nonblock(STDOUT_FILENO, 1);
846         fd_nonblock(master, 1);
847
848         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
849         if (signal_fd < 0) {
850                 log_error("signalfd(): %m");
851                 r = -errno;
852                 goto finish;
853         }
854
855         ep = epoll_create1(EPOLL_CLOEXEC);
856         if (ep < 0) {
857                 log_error("Failed to create epoll: %m");
858                 r = -errno;
859                 goto finish;
860         }
861
862         /* We read from STDIN only if this is actually a TTY,
863          * otherwise we assume non-interactivity. */
864         if (isatty(STDIN_FILENO)) {
865                 zero(stdin_ev);
866                 stdin_ev.events = EPOLLIN|EPOLLET;
867                 stdin_ev.data.fd = STDIN_FILENO;
868
869                 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
870                         log_error("Failed to register STDIN in epoll: %m");
871                         r = -errno;
872                         goto finish;
873                 }
874         }
875
876         zero(stdout_ev);
877         stdout_ev.events = EPOLLOUT|EPOLLET;
878         stdout_ev.data.fd = STDOUT_FILENO;
879
880         zero(master_ev);
881         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
882         master_ev.data.fd = master;
883
884         zero(signal_ev);
885         signal_ev.events = EPOLLIN;
886         signal_ev.data.fd = signal_fd;
887
888         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
889                 if (errno != EPERM) {
890                         log_error("Failed to register stdout in epoll: %m");
891                         r = -errno;
892                         goto finish;
893                 }
894                 /* stdout without epoll support. Likely redirected to regular file. */
895                 stdout_writable = true;
896         }
897
898         if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
899             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
900                 log_error("Failed to register fds in epoll: %m");
901                 r = -errno;
902                 goto finish;
903         }
904
905         for (;;) {
906                 struct epoll_event ev[16];
907                 ssize_t k;
908                 int i, nfds;
909
910                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
911                 if (nfds < 0) {
912
913                         if (errno == EINTR || errno == EAGAIN)
914                                 continue;
915
916                         log_error("epoll_wait(): %m");
917                         r = -errno;
918                         goto finish;
919                 }
920
921                 assert(nfds >= 1);
922
923                 for (i = 0; i < nfds; i++) {
924                         if (ev[i].data.fd == STDIN_FILENO) {
925
926                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
927                                         stdin_readable = true;
928
929                         } else if (ev[i].data.fd == STDOUT_FILENO) {
930
931                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
932                                         stdout_writable = true;
933
934                         } else if (ev[i].data.fd == master) {
935
936                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
937                                         master_readable = true;
938
939                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
940                                         master_writable = true;
941
942                         } else if (ev[i].data.fd == signal_fd) {
943                                 struct signalfd_siginfo sfsi;
944                                 ssize_t n;
945
946                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
947                                 if (n != sizeof(sfsi)) {
948
949                                         if (n >= 0) {
950                                                 log_error("Failed to read from signalfd: invalid block size");
951                                                 r = -EIO;
952                                                 goto finish;
953                                         }
954
955                                         if (errno != EINTR && errno != EAGAIN) {
956                                                 log_error("Failed to read from signalfd: %m");
957                                                 r = -errno;
958                                                 goto finish;
959                                         }
960                                 } else {
961
962                                         if (sfsi.ssi_signo == SIGWINCH) {
963                                                 struct winsize ws;
964
965                                                 /* The window size changed, let's forward that. */
966                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
967                                                         ioctl(master, TIOCSWINSZ, &ws);
968                                         } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
969
970                                                 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
971
972                                                 /* This only works for systemd... */
973                                                 tried_orderly_shutdown = true;
974                                                 kill(pid, SIGRTMIN+3);
975
976                                         } else {
977                                                 r = 0;
978                                                 goto finish;
979                                         }
980                                 }
981                         }
982                 }
983
984                 while ((stdin_readable && in_buffer_full <= 0) ||
985                        (master_writable && in_buffer_full > 0) ||
986                        (master_readable && out_buffer_full <= 0) ||
987                        (stdout_writable && out_buffer_full > 0)) {
988
989                         if (stdin_readable && in_buffer_full < LINE_MAX) {
990
991                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
992                                 if (k < 0) {
993
994                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
995                                                 stdin_readable = false;
996                                         else {
997                                                 log_error("read(): %m");
998                                                 r = -errno;
999                                                 goto finish;
1000                                         }
1001                                 } else
1002                                         in_buffer_full += (size_t) k;
1003                         }
1004
1005                         if (master_writable && in_buffer_full > 0) {
1006
1007                                 k = write(master, in_buffer, in_buffer_full);
1008                                 if (k < 0) {
1009
1010                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1011                                                 master_writable = false;
1012                                         else {
1013                                                 log_error("write(): %m");
1014                                                 r = -errno;
1015                                                 goto finish;
1016                                         }
1017
1018                                 } else {
1019                                         assert(in_buffer_full >= (size_t) k);
1020                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1021                                         in_buffer_full -= k;
1022                                 }
1023                         }
1024
1025                         if (master_readable && out_buffer_full < LINE_MAX) {
1026
1027                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1028                                 if (k < 0) {
1029
1030                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1031                                                 master_readable = false;
1032                                         else {
1033                                                 log_error("read(): %m");
1034                                                 r = -errno;
1035                                                 goto finish;
1036                                         }
1037                                 }  else
1038                                         out_buffer_full += (size_t) k;
1039                         }
1040
1041                         if (stdout_writable && out_buffer_full > 0) {
1042
1043                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1044                                 if (k < 0) {
1045
1046                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1047                                                 stdout_writable = false;
1048                                         else {
1049                                                 log_error("write(): %m");
1050                                                 r = -errno;
1051                                                 goto finish;
1052                                         }
1053
1054                                 } else {
1055                                         assert(out_buffer_full >= (size_t) k);
1056                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1057                                         out_buffer_full -= k;
1058                                 }
1059                         }
1060                 }
1061         }
1062
1063 finish:
1064         if (ep >= 0)
1065                 close_nointr_nofail(ep);
1066
1067         if (signal_fd >= 0)
1068                 close_nointr_nofail(signal_fd);
1069
1070         return r;
1071 }
1072
1073 int main(int argc, char *argv[]) {
1074         pid_t pid = 0;
1075         int r = EXIT_FAILURE, k;
1076         char *oldcg = NULL, *newcg = NULL;
1077         char **controller = NULL;
1078         int master = -1, n_fd_passed;
1079         const char *console = NULL;
1080         struct termios saved_attr, raw_attr;
1081         sigset_t mask;
1082         bool saved_attr_valid = false;
1083         struct winsize ws;
1084         int kmsg_socket_pair[2] = { -1, -1 };
1085         FDSet *fds = NULL;
1086
1087         log_parse_environment();
1088         log_open();
1089
1090         r = parse_argv(argc, argv);
1091         if (r <= 0)
1092                 goto finish;
1093
1094         if (arg_directory) {
1095                 char *p;
1096
1097                 p = path_make_absolute_cwd(arg_directory);
1098                 free(arg_directory);
1099                 arg_directory = p;
1100         } else
1101                 arg_directory = get_current_dir_name();
1102
1103         if (!arg_directory) {
1104                 log_error("Failed to determine path");
1105                 goto finish;
1106         }
1107
1108         path_kill_slashes(arg_directory);
1109
1110         if (geteuid() != 0) {
1111                 log_error("Need to be root.");
1112                 goto finish;
1113         }
1114
1115         if (sd_booted() <= 0) {
1116                 log_error("Not running on a systemd system.");
1117                 goto finish;
1118         }
1119
1120         if (path_equal(arg_directory, "/")) {
1121                 log_error("Spawning container on root directory not supported.");
1122                 goto finish;
1123         }
1124
1125         if (is_os_tree(arg_directory) <= 0) {
1126                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1127                 goto finish;
1128         }
1129
1130         log_close();
1131         n_fd_passed = sd_listen_fds(false);
1132         if (n_fd_passed > 0) {
1133                 k = fdset_new_listen_fds(&fds, false);
1134                 if (k < 0) {
1135                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1136                         goto finish;
1137                 }
1138         }
1139         fdset_close_others(fds);
1140         log_open();
1141
1142         k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1143         if (k < 0) {
1144                 log_error("Failed to determine current cgroup: %s", strerror(-k));
1145                 goto finish;
1146         }
1147
1148         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1149                 log_error("Failed to allocate cgroup path.");
1150                 goto finish;
1151         }
1152
1153         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1154         if (k < 0)  {
1155                 log_error("Failed to create cgroup: %s", strerror(-k));
1156                 goto finish;
1157         }
1158
1159         STRV_FOREACH(controller, arg_controllers) {
1160                 k = cg_create_and_attach(*controller, newcg, 0);
1161                 if (k < 0)
1162                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1163         }
1164
1165         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1166         if (master < 0) {
1167                 log_error("Failed to acquire pseudo tty: %m");
1168                 goto finish;
1169         }
1170
1171         console = ptsname(master);
1172         if (!console) {
1173                 log_error("Failed to determine tty name: %m");
1174                 goto finish;
1175         }
1176
1177         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1178
1179         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1180                 ioctl(master, TIOCSWINSZ, &ws);
1181
1182         if (unlockpt(master) < 0) {
1183                 log_error("Failed to unlock tty: %m");
1184                 goto finish;
1185         }
1186
1187         if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1188                 saved_attr_valid = true;
1189
1190                 raw_attr = saved_attr;
1191                 cfmakeraw(&raw_attr);
1192                 raw_attr.c_lflag &= ~ECHO;
1193         }
1194
1195         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1196                 log_error("Failed to create kmsg socket pair");
1197                 goto finish;
1198         }
1199
1200         assert_se(sigemptyset(&mask) == 0);
1201         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1202         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1203
1204         for (;;) {
1205                 siginfo_t status;
1206
1207                 if (saved_attr_valid) {
1208                         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1209                                 log_error("Failed to set terminal attributes: %m");
1210                                 goto finish;
1211                         }
1212                 }
1213
1214                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1215                 if (pid < 0) {
1216                         if (errno == EINVAL)
1217                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1218                         else
1219                                 log_error("clone() failed: %m");
1220
1221                         goto finish;
1222                 }
1223
1224                 if (pid == 0) {
1225                         /* child */
1226
1227                         const char *home = NULL;
1228                         uid_t uid = (uid_t) -1;
1229                         gid_t gid = (gid_t) -1;
1230                         unsigned n_env = 0;
1231                         const char *envp[] = {
1232                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1233                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1234                                 NULL, /* TERM */
1235                                 NULL, /* HOME */
1236                                 NULL, /* USER */
1237                                 NULL, /* LOGNAME */
1238                                 NULL, /* container_uuid */
1239                                 NULL, /* LISTEN_FDS */
1240                                 NULL, /* LISTEN_PID */
1241                                 NULL
1242                         };
1243
1244                         envp[2] = strv_find_prefix(environ, "TERM=");
1245                         n_env = 3;
1246
1247                         close_nointr_nofail(master);
1248                         master = -1;
1249
1250                         close_nointr(STDIN_FILENO);
1251                         close_nointr(STDOUT_FILENO);
1252                         close_nointr(STDERR_FILENO);
1253
1254                         close_nointr_nofail(kmsg_socket_pair[0]);
1255                         kmsg_socket_pair[0] = -1;
1256
1257                         reset_all_signal_handlers();
1258
1259                         assert_se(sigemptyset(&mask) == 0);
1260                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1261
1262                         k = open_terminal(console, O_RDWR);
1263                         if (k != STDIN_FILENO) {
1264                                 if (k >= 0) {
1265                                         close_nointr_nofail(k);
1266                                         k = -EINVAL;
1267                                 }
1268
1269                                 log_error("Failed to open console: %s", strerror(-k));
1270                                 goto child_fail;
1271                         }
1272
1273                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1274                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1275                                 log_error("Failed to duplicate console: %m");
1276                                 goto child_fail;
1277                         }
1278
1279                         if (setsid() < 0) {
1280                                 log_error("setsid() failed: %m");
1281                                 goto child_fail;
1282                         }
1283
1284                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1285                                 log_error("PR_SET_PDEATHSIG failed: %m");
1286                                 goto child_fail;
1287                         }
1288
1289                         /* Mark everything as slave, so that we still
1290                          * receive mounts from the real root, but don't
1291                          * propagate mounts to the real root. */
1292                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1293                                 log_error("MS_SLAVE|MS_REC failed: %m");
1294                                 goto child_fail;
1295                         }
1296
1297                         /* Turn directory into bind mount */
1298                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1299                                 log_error("Failed to make bind mount.");
1300                                 goto child_fail;
1301                         }
1302
1303                         if (arg_read_only)
1304                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1305                                         log_error("Failed to make read-only.");
1306                                         goto child_fail;
1307                                 }
1308
1309                         if (mount_all(arg_directory) < 0)
1310                                 goto child_fail;
1311
1312                         if (copy_devnodes(arg_directory) < 0)
1313                                 goto child_fail;
1314
1315                         dev_setup(arg_directory);
1316
1317                         if (setup_dev_console(arg_directory, console) < 0)
1318                                 goto child_fail;
1319
1320                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1321                                 goto child_fail;
1322
1323                         close_nointr_nofail(kmsg_socket_pair[1]);
1324                         kmsg_socket_pair[1] = -1;
1325
1326                         if (setup_boot_id(arg_directory) < 0)
1327                                 goto child_fail;
1328
1329                         if (setup_timezone(arg_directory) < 0)
1330                                 goto child_fail;
1331
1332                         if (setup_resolv_conf(arg_directory) < 0)
1333                                 goto child_fail;
1334
1335                         if (setup_journal(arg_directory) < 0)
1336                                 goto child_fail;
1337
1338                         if (chdir(arg_directory) < 0) {
1339                                 log_error("chdir(%s) failed: %m", arg_directory);
1340                                 goto child_fail;
1341                         }
1342
1343                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1344                                 log_error("mount(MS_MOVE) failed: %m");
1345                                 goto child_fail;
1346                         }
1347
1348                         if (chroot(".") < 0) {
1349                                 log_error("chroot() failed: %m");
1350                                 goto child_fail;
1351                         }
1352
1353                         if (chdir("/") < 0) {
1354                                 log_error("chdir() failed: %m");
1355                                 goto child_fail;
1356                         }
1357
1358                         umask(0022);
1359
1360                         loopback_setup();
1361
1362                         if (drop_capabilities() < 0) {
1363                                 log_error("drop_capabilities() failed: %m");
1364                                 goto child_fail;
1365                         }
1366
1367                         if (arg_user) {
1368
1369                                 /* Note that this resolves user names
1370                                  * inside the container, and hence
1371                                  * accesses the NSS modules from the
1372                                  * container and not the host. This is
1373                                  * a bit weird... */
1374
1375                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1376                                         log_error("get_user_creds() failed: %m");
1377                                         goto child_fail;
1378                                 }
1379
1380                                 if (mkdir_parents_label(home, 0775) < 0) {
1381                                         log_error("mkdir_parents_label() failed: %m");
1382                                         goto child_fail;
1383                                 }
1384
1385                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1386                                         log_error("mkdir_safe_label() failed: %m");
1387                                         goto child_fail;
1388                                 }
1389
1390                                 if (initgroups((const char*)arg_user, gid) < 0) {
1391                                         log_error("initgroups() failed: %m");
1392                                         goto child_fail;
1393                                 }
1394
1395                                 if (setresgid(gid, gid, gid) < 0) {
1396                                         log_error("setregid() failed: %m");
1397                                         goto child_fail;
1398                                 }
1399
1400                                 if (setresuid(uid, uid, uid) < 0) {
1401                                         log_error("setreuid() failed: %m");
1402                                         goto child_fail;
1403                                 }
1404                         } else {
1405                                 /* Reset everything fully to 0, just in case */
1406
1407                                 if (setgroups(0, NULL) < 0) {
1408                                         log_error("setgroups() failed: %m");
1409                                         goto child_fail;
1410                                 }
1411
1412                                 if (setresgid(0, 0, 0) < 0) {
1413                                         log_error("setregid() failed: %m");
1414                                         goto child_fail;
1415                                 }
1416
1417                                 if (setresuid(0, 0, 0) < 0) {
1418                                         log_error("setreuid() failed: %m");
1419                                         goto child_fail;
1420                                 }
1421                         }
1422
1423                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1424                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1425                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1426                                 log_oom();
1427                                 goto child_fail;
1428                         }
1429
1430                         if (arg_uuid) {
1431                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1432                                         log_oom();
1433                                         goto child_fail;
1434                                 }
1435                         }
1436
1437                         if (fdset_size(fds) > 0) {
1438                                 k = fdset_cloexec(fds, false);
1439                                 if (k < 0) {
1440                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1441                                         goto child_fail;
1442                                 }
1443
1444                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1445                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) getpid()) < 0)) {
1446                                         log_oom();
1447                                         goto child_fail;
1448                                 }
1449                         }
1450
1451                         setup_hostname();
1452
1453                         if (arg_boot) {
1454                                 char **a;
1455                                 size_t l;
1456
1457                                 /* Automatically search for the init system */
1458
1459                                 l = 1 + argc - optind;
1460                                 a = newa(char*, l + 1);
1461                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1462
1463                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1464                                 execve(a[0], a, (char**) envp);
1465
1466                                 a[0] = (char*) "/lib/systemd/systemd";
1467                                 execve(a[0], a, (char**) envp);
1468
1469                                 a[0] = (char*) "/sbin/init";
1470                                 execve(a[0], a, (char**) envp);
1471                         } else if (argc > optind)
1472                                 execvpe(argv[optind], argv + optind, (char**) envp);
1473                         else {
1474                                 chdir(home ? home : "/root");
1475                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1476                         }
1477
1478                         log_error("execv() failed: %m");
1479
1480                 child_fail:
1481                         _exit(EXIT_FAILURE);
1482                 }
1483
1484                 fdset_free(fds);
1485                 fds = NULL;
1486
1487                 if (process_pty(master, pid, &mask) < 0)
1488                         goto finish;
1489
1490                 if (saved_attr_valid)
1491                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1492
1493                 r = wait_for_terminate(pid, &status);
1494                 if (r < 0) {
1495                         r = EXIT_FAILURE;
1496                         break;
1497                 }
1498
1499                 if (status.si_code == CLD_EXITED) {
1500                         if (status.si_status != 0) {
1501                                 log_error("Container failed with error code %i.", status.si_status);
1502                                 r = status.si_status;
1503                                 break;
1504                         }
1505
1506                         log_debug("Container exited successfully.");
1507                         break;
1508                 } else if (status.si_code == CLD_KILLED &&
1509                            status.si_status == SIGINT) {
1510                         log_info("Container has been shut down.");
1511                         r = 0;
1512                         break;
1513                 } else if (status.si_code == CLD_KILLED &&
1514                            status.si_status == SIGHUP) {
1515                         log_info("Container is being rebooted.");
1516                         continue;
1517                 } else if (status.si_code == CLD_KILLED ||
1518                            status.si_code == CLD_DUMPED) {
1519
1520                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1521                         r = EXIT_FAILURE;
1522                         break;
1523                 } else {
1524                         log_error("Container failed due to unknown reason.");
1525                         r = EXIT_FAILURE;
1526                         break;
1527                 }
1528         }
1529
1530 finish:
1531         if (saved_attr_valid)
1532                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1533
1534         if (master >= 0)
1535                 close_nointr_nofail(master);
1536
1537         close_pipe(kmsg_socket_pair);
1538
1539         if (oldcg)
1540                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1541
1542         if (newcg)
1543                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1544
1545         free(arg_directory);
1546         strv_free(arg_controllers);
1547         free(oldcg);
1548         free(newcg);
1549
1550         fdset_free(fds);
1551
1552         return r;
1553 }