chiark / gitweb /
honor SELinux labels, when creating and writing config files
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "macro.h"
50 #include "audit.h"
51 #include "missing.h"
52 #include "cgroup-util.h"
53 #include "strv.h"
54 #include "path-util.h"
55 #include "loopback-setup.h"
56 #include "sd-id128.h"
57 #include "dev-setup.h"
58 #include "fdset.h"
59 #include "build.h"
60 #include "fileio.h"
61
/* How the journal directory of the container relates to the one of
 * the host, as selected with --link-journal= (see setup_journal()). */
enum LinkJournal {
        LINK_NO,     /* do not touch the journal directories at all */
        LINK_AUTO,   /* default: bind mount the host directory into the
                      * guest only if it already exists on the host */
        LINK_HOST,   /* create the directory on the host and bind mount
                      * it into the guest */
        LINK_GUEST   /* make the host path a symlink to the directory
                      * inside the guest */
};

typedef enum LinkJournal LinkJournal;
68
69 static char *arg_directory = NULL;
70 static char *arg_user = NULL;
71 static char **arg_controllers = NULL;
72 static char *arg_uuid = NULL;
73 static bool arg_private_network = false;
74 static bool arg_read_only = false;
75 static bool arg_boot = false;
76 static LinkJournal arg_link_journal = LINK_AUTO;
77 static uint64_t arg_retain =
78         (1ULL << CAP_CHOWN) |
79         (1ULL << CAP_DAC_OVERRIDE) |
80         (1ULL << CAP_DAC_READ_SEARCH) |
81         (1ULL << CAP_FOWNER) |
82         (1ULL << CAP_FSETID) |
83         (1ULL << CAP_IPC_OWNER) |
84         (1ULL << CAP_KILL) |
85         (1ULL << CAP_LEASE) |
86         (1ULL << CAP_LINUX_IMMUTABLE) |
87         (1ULL << CAP_NET_BIND_SERVICE) |
88         (1ULL << CAP_NET_BROADCAST) |
89         (1ULL << CAP_NET_RAW) |
90         (1ULL << CAP_SETGID) |
91         (1ULL << CAP_SETFCAP) |
92         (1ULL << CAP_SETPCAP) |
93         (1ULL << CAP_SETUID) |
94         (1ULL << CAP_SYS_ADMIN) |
95         (1ULL << CAP_SYS_CHROOT) |
96         (1ULL << CAP_SYS_NICE) |
97         (1ULL << CAP_SYS_PTRACE) |
98         (1ULL << CAP_SYS_TTY_CONFIG) |
99         (1ULL << CAP_SYS_RESOURCE) |
100         (1ULL << CAP_SYS_BOOT) |
101         (1ULL << CAP_AUDIT_WRITE) |
102         (1ULL << CAP_AUDIT_CONTROL);
103
/* Prints the command line synopsis to stdout. Always returns 0.
 *
 * Note that -j is documented as "guest" here because that is what
 * parse_argv() actually selects for it (LINK_GUEST). */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help               Show this help\n"
               "  --version               Print version string\n"
               "  -D --directory=NAME     Root directory for the container\n"
               "  -b --boot               Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER          Run the command under specified user or uid\n"
               "  -C --controllers=LIST   Put the container in specified comma-separated cgroup hierarchies\n"
               "     --uuid=UUID          Set a specific machine UUID for the container\n"
               "     --private-network    Disable network in container\n"
               "     --read-only          Mount the root directory read-only\n"
               "     --capability=CAP     In addition to the default, retain specified capability\n"
               "     --link-journal=MODE  Link up guest journal, one of no, auto, guest, host\n"
               "  -j                      Equivalent to --link-journal=guest\n",
               program_invocation_short_name);

        return 0;
}
124
125 static int parse_argv(int argc, char *argv[]) {
126
127         enum {
128                 ARG_VERSION = 0x100,
129                 ARG_PRIVATE_NETWORK,
130                 ARG_UUID,
131                 ARG_READ_ONLY,
132                 ARG_CAPABILITY,
133                 ARG_LINK_JOURNAL
134         };
135
136         static const struct option options[] = {
137                 { "help",            no_argument,       NULL, 'h'                 },
138                 { "version",         no_argument,       NULL, ARG_VERSION         },
139                 { "directory",       required_argument, NULL, 'D'                 },
140                 { "user",            required_argument, NULL, 'u'                 },
141                 { "controllers",     required_argument, NULL, 'C'                 },
142                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
143                 { "boot",            no_argument,       NULL, 'b'                 },
144                 { "uuid",            required_argument, NULL, ARG_UUID            },
145                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
146                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
147                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
148                 { NULL,              0,                 NULL, 0                   }
149         };
150
151         int c;
152
153         assert(argc >= 0);
154         assert(argv);
155
156         while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
157
158                 switch (c) {
159
160                 case 'h':
161                         help();
162                         return 0;
163
164                 case ARG_VERSION:
165                         puts(PACKAGE_STRING);
166                         puts(SYSTEMD_FEATURES);
167                         return 0;
168
169                 case 'D':
170                         free(arg_directory);
171                         arg_directory = canonicalize_file_name(optarg);
172                         if (!arg_directory) {
173                                 log_error("Failed to canonicalize root directory.");
174                                 return -ENOMEM;
175                         }
176
177                         break;
178
179                 case 'u':
180                         free(arg_user);
181                         if (!(arg_user = strdup(optarg))) {
182                                 log_error("Failed to duplicate user name.");
183                                 return -ENOMEM;
184                         }
185
186                         break;
187
188                 case 'C':
189                         strv_free(arg_controllers);
190                         arg_controllers = strv_split(optarg, ",");
191                         if (!arg_controllers) {
192                                 log_error("Failed to split controllers list.");
193                                 return -ENOMEM;
194                         }
195                         strv_uniq(arg_controllers);
196
197                         break;
198
199                 case ARG_PRIVATE_NETWORK:
200                         arg_private_network = true;
201                         break;
202
203                 case 'b':
204                         arg_boot = true;
205                         break;
206
207                 case ARG_UUID:
208                         arg_uuid = optarg;
209                         break;
210
211                 case ARG_READ_ONLY:
212                         arg_read_only = true;
213                         break;
214
215                 case ARG_CAPABILITY: {
216                         char *state, *word;
217                         size_t length;
218
219                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
220                                 cap_value_t cap;
221                                 char *t;
222
223                                 t = strndup(word, length);
224                                 if (!t)
225                                         return log_oom();
226
227                                 if (cap_from_name(t, &cap) < 0) {
228                                         log_error("Failed to parse capability %s.", t);
229                                         free(t);
230                                         return -EINVAL;
231                                 }
232
233                                 free(t);
234                                 arg_retain |= 1ULL << (uint64_t) cap;
235                         }
236
237                         break;
238                 }
239
240                 case 'j':
241                         arg_link_journal = LINK_GUEST;
242                         break;
243
244                 case ARG_LINK_JOURNAL:
245                         if (streq(optarg, "auto"))
246                                 arg_link_journal = LINK_AUTO;
247                         else if (streq(optarg, "no"))
248                                 arg_link_journal = LINK_NO;
249                         else if (streq(optarg, "guest"))
250                                 arg_link_journal = LINK_GUEST;
251                         else if (streq(optarg, "host"))
252                                 arg_link_journal = LINK_HOST;
253                         else {
254                                 log_error("Failed to parse link journal mode %s", optarg);
255                                 return -EINVAL;
256                         }
257
258                         break;
259
260                 case '?':
261                         return -EINVAL;
262
263                 default:
264                         log_error("Unknown option code %c", c);
265                         return -EINVAL;
266                 }
267         }
268
269         return 1;
270 }
271
272 static int mount_all(const char *dest) {
273
274         typedef struct MountPoint {
275                 const char *what;
276                 const char *where;
277                 const char *type;
278                 const char *options;
279                 unsigned long flags;
280                 bool fatal;
281         } MountPoint;
282
283         static const MountPoint mount_table[] = {
284                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
285                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
286                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
287                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
288                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
289                 { "/dev/pts",  "/dev/pts",  NULL,    NULL,       MS_BIND,                      true  },
290                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
291                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
292 #ifdef HAVE_SELINUX
293                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
294                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
295 #endif
296         };
297
298         unsigned k;
299         int r = 0;
300
301         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
302                 char _cleanup_free_ *where = NULL;
303                 int t;
304
305                 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
306                         log_oom();
307
308                         if (r == 0)
309                                 r = -ENOMEM;
310
311                         break;
312                 }
313
314                 t = path_is_mount_point(where, true);
315                 if (t < 0) {
316                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
317
318                         if (r == 0)
319                                 r = t;
320
321                         continue;
322                 }
323
324                 /* Skip this entry if it is not a remount. */
325                 if (mount_table[k].what && t > 0)
326                         continue;
327
328                 mkdir_p_label(where, 0755);
329
330                 if (mount(mount_table[k].what,
331                           where,
332                           mount_table[k].type,
333                           mount_table[k].flags,
334                           mount_table[k].options) < 0 &&
335                     mount_table[k].fatal) {
336
337                         log_error("mount(%s) failed: %m", where);
338
339                         if (r == 0)
340                                 r = -errno;
341                 }
342         }
343
344         return r;
345 }
346
347 static int setup_timezone(const char *dest) {
348         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
349         char *z, *y;
350         int r;
351
352         assert(dest);
353
354         /* Fix the timezone, if possible */
355         r = readlink_malloc("/etc/localtime", &p);
356         if (r < 0) {
357                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
358                 return 0;
359         }
360
361         z = path_startswith(p, "../usr/share/zoneinfo/");
362         if (!z)
363                 z = path_startswith(p, "/usr/share/zoneinfo/");
364         if (!z) {
365                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
366                 return 0;
367         }
368
369         where = strappend(dest, "/etc/localtime");
370         if (!where)
371                 return log_oom();
372
373         r = readlink_malloc(where, &q);
374         if (r >= 0) {
375                 y = path_startswith(q, "../usr/share/zoneinfo/");
376                 if (!y)
377                         y = path_startswith(q, "/usr/share/zoneinfo/");
378
379
380                 /* Already pointing to the right place? Then do nothing .. */
381                 if (y && streq(y, z))
382                         return 0;
383         }
384
385         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
386         if (!check)
387                 return log_oom();
388
389         if (access(check, F_OK) < 0) {
390                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
391                 return 0;
392         }
393
394         what = strappend("../usr/share/zoneinfo/", z);
395         if (!what)
396                 return log_oom();
397
398         unlink(where);
399         if (symlink(what, where) < 0) {
400                 log_error("Failed to correct timezone of container: %m");
401                 return 0;
402         }
403
404         return 0;
405 }
406
407 static int setup_resolv_conf(const char *dest) {
408         char *where;
409
410         assert(dest);
411
412         if (arg_private_network)
413                 return 0;
414
415         /* Fix resolv.conf, if possible */
416         where = strappend(dest, "/etc/resolv.conf");
417         if (!where)
418                 return log_oom();
419
420         /* We don't really care for the results of this really. If it
421          * fails, it fails, but meh... */
422         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
423                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
424
425         free(where);
426
427         return 0;
428 }
429
430 static int setup_boot_id(const char *dest) {
431         char _cleanup_free_ *from = NULL, *to = NULL;
432         sd_id128_t rnd;
433         char as_uuid[37];
434         int r;
435
436         assert(dest);
437
438         /* Generate a new randomized boot ID, so that each boot-up of
439          * the container gets a new one */
440
441         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
442         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
443         if (!from || !to)
444                 return log_oom();
445
446         r = sd_id128_randomize(&rnd);
447         if (r < 0) {
448                 log_error("Failed to generate random boot id: %s", strerror(-r));
449                 return r;
450         }
451
452         snprintf(as_uuid, sizeof(as_uuid),
453                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
454                  SD_ID128_FORMAT_VAL(rnd));
455         char_array_0(as_uuid);
456
457         r = write_one_line_file(from, as_uuid);
458         if (r < 0) {
459                 log_error("Failed to write boot id: %s", strerror(-r));
460                 return r;
461         }
462
463         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
464                 log_error("Failed to bind mount boot id: %m");
465                 r = -errno;
466         } else
467                 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
468
469         unlink(from);
470         return r;
471 }
472
473 static int copy_devnodes(const char *dest) {
474
475         static const char devnodes[] =
476                 "null\0"
477                 "zero\0"
478                 "full\0"
479                 "random\0"
480                 "urandom\0"
481                 "tty\0"
482                 "ptmx\0";
483
484         const char *d;
485         int r = 0;
486         mode_t _cleanup_umask_ u;
487
488         assert(dest);
489
490         u = umask(0000);
491
492         NULSTR_FOREACH(d, devnodes) {
493                 struct stat st;
494                 char _cleanup_free_ *from = NULL, *to = NULL;
495
496                 asprintf(&from, "/dev/%s", d);
497                 asprintf(&to, "%s/dev/%s", dest, d);
498
499                 if (!from || !to) {
500                         log_oom();
501
502                         if (r == 0)
503                                 r = -ENOMEM;
504
505                         break;
506                 }
507
508                 if (stat(from, &st) < 0) {
509
510                         if (errno != ENOENT) {
511                                 log_error("Failed to stat %s: %m", from);
512                                 if (r == 0)
513                                         r = -errno;
514                         }
515
516                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
517
518                         log_error("%s is not a char or block device, cannot copy", from);
519                         if (r == 0)
520                                 r = -EIO;
521
522                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
523
524                         log_error("mknod(%s) failed: %m", dest);
525                         if (r == 0)
526                                 r = -errno;
527                 }
528         }
529
530         return r;
531 }
532
533 static int setup_dev_console(const char *dest, const char *console) {
534         struct stat st;
535         char _cleanup_free_ *to = NULL;
536         int r;
537         mode_t _cleanup_umask_ u;
538
539         assert(dest);
540         assert(console);
541
542         u = umask(0000);
543
544         if (stat(console, &st) < 0) {
545                 log_error("Failed to stat %s: %m", console);
546                 return -errno;
547
548         } else if (!S_ISCHR(st.st_mode)) {
549                 log_error("/dev/console is not a char device");
550                 return -EIO;
551         }
552
553         r = chmod_and_chown(console, 0600, 0, 0);
554         if (r < 0) {
555                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
556                 return r;
557         }
558
559         if (asprintf(&to, "%s/dev/console", dest) < 0)
560                 return log_oom();
561
562         /* We need to bind mount the right tty to /dev/console since
563          * ptys can only exist on pts file systems. To have something
564          * to bind mount things on we create a device node first, that
565          * has the right major/minor (note that the major minor
566          * doesn't actually matter here, since we mount it over
567          * anyway). */
568
569         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
570                 log_error("mknod() for /dev/console failed: %m");
571                 return -errno;
572         }
573
574         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
575                 log_error("Bind mount for /dev/console failed: %m");
576                 return -errno;
577         }
578
579         return 0;
580 }
581
582 static int setup_kmsg(const char *dest, int kmsg_socket) {
583         char _cleanup_free_ *from = NULL, *to = NULL;
584         int r, fd, k;
585         mode_t _cleanup_umask_ u;
586         union {
587                 struct cmsghdr cmsghdr;
588                 uint8_t buf[CMSG_SPACE(sizeof(int))];
589         } control;
590         struct msghdr mh;
591         struct cmsghdr *cmsg;
592
593         assert(dest);
594         assert(kmsg_socket >= 0);
595
596         u = umask(0000);
597
598         /* We create the kmsg FIFO as /dev/kmsg, but immediately
599          * delete it after bind mounting it to /proc/kmsg. While FIFOs
600          * on the reading side behave very similar to /proc/kmsg,
601          * their writing side behaves differently from /dev/kmsg in
602          * that writing blocks when nothing is reading. In order to
603          * avoid any problems with containers deadlocking due to this
604          * we simply make /dev/kmsg unavailable to the container. */
605         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
606             asprintf(&to, "%s/proc/kmsg", dest) < 0)
607                 return log_oom();
608
609         if (mkfifo(from, 0600) < 0) {
610                 log_error("mkfifo() for /dev/kmsg failed: %m");
611                 return -errno;
612         }
613
614         r = chmod_and_chown(from, 0600, 0, 0);
615         if (r < 0) {
616                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
617                 return r;
618         }
619
620         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
621                 log_error("Bind mount for /proc/kmsg failed: %m");
622                 return -errno;
623         }
624
625         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
626         if (fd < 0) {
627                 log_error("Failed to open fifo: %m");
628                 return -errno;
629         }
630
631         zero(mh);
632         zero(control);
633
634         mh.msg_control = &control;
635         mh.msg_controllen = sizeof(control);
636
637         cmsg = CMSG_FIRSTHDR(&mh);
638         cmsg->cmsg_level = SOL_SOCKET;
639         cmsg->cmsg_type = SCM_RIGHTS;
640         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
641         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
642
643         mh.msg_controllen = cmsg->cmsg_len;
644
645         /* Store away the fd in the socket, so that it stays open as
646          * long as we run the child */
647         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
648         close_nointr_nofail(fd);
649
650         if (k < 0) {
651                 log_error("Failed to send FIFO fd: %m");
652                 return -errno;
653         }
654
655         /* And now make the FIFO unavailable as /dev/kmsg... */
656         unlink(from);
657         return 0;
658 }
659
660 static int setup_hostname(void) {
661         char *hn;
662         int r = 0;
663
664         hn = path_get_file_name(arg_directory);
665         if (hn) {
666                 hn = strdup(hn);
667                 if (!hn)
668                         return -ENOMEM;
669
670                 hostname_cleanup(hn);
671
672                 if (!isempty(hn))
673                         if (sethostname(hn, strlen(hn)) < 0)
674                                 r = -errno;
675
676                 free(hn);
677         }
678
679         return r;
680 }
681
682 static int setup_journal(const char *directory) {
683         sd_id128_t machine_id;
684         char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
685         char *id;
686         int r;
687
688         if (arg_link_journal == LINK_NO)
689                 return 0;
690
691         p = strappend(directory, "/etc/machine-id");
692         if (!p)
693                 return log_oom();
694
695         r = read_one_line_file(p, &b);
696         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
697                 return 0;
698         else if (r < 0) {
699                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
700                 return r;
701         }
702
703         id = strstrip(b);
704         if (isempty(id) && arg_link_journal == LINK_AUTO)
705                 return 0;
706
707         /* Verify validity */
708         r = sd_id128_from_string(id, &machine_id);
709         if (r < 0) {
710                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
711                 return r;
712         }
713
714         free(p);
715         p = strappend("/var/log/journal/", id);
716         q = strjoin(directory, "/var/log/journal/", id, NULL);
717         if (!p || !q)
718                 return log_oom();
719
720         if (path_is_mount_point(p, false) > 0) {
721                 if (arg_link_journal != LINK_AUTO) {
722                         log_error("%s: already a mount point, refusing to use for journal", p);
723                         return -EEXIST;
724                 }
725
726                 return 0;
727         }
728
729         if (path_is_mount_point(q, false) > 0) {
730                 if (arg_link_journal != LINK_AUTO) {
731                         log_error("%s: already a mount point, refusing to use for journal", q);
732                         return -EEXIST;
733                 }
734
735                 return 0;
736         }
737
738         r = readlink_and_make_absolute(p, &d);
739         if (r >= 0) {
740                 if ((arg_link_journal == LINK_GUEST ||
741                      arg_link_journal == LINK_AUTO) &&
742                     path_equal(d, q)) {
743
744                         r = mkdir_p(q, 0755);
745                         if (r < 0)
746                                 log_warning("failed to create directory %s: %m", q);
747                         return 0;
748                 }
749
750                 if (unlink(p) < 0) {
751                         log_error("Failed to remove symlink %s: %m", p);
752                         return -errno;
753                 }
754         } else if (r == -EINVAL) {
755
756                 if (arg_link_journal == LINK_GUEST &&
757                     rmdir(p) < 0) {
758
759                         if (errno == ENOTDIR) {
760                                 log_error("%s already exists and is neither a symlink nor a directory", p);
761                                 return r;
762                         } else {
763                                 log_error("Failed to remove %s: %m", p);
764                                 return -errno;
765                         }
766                 }
767         } else if (r != -ENOENT) {
768                 log_error("readlink(%s) failed: %m", p);
769                 return r;
770         }
771
772         if (arg_link_journal == LINK_GUEST) {
773
774                 if (symlink(q, p) < 0) {
775                         log_error("Failed to symlink %s to %s: %m", q, p);
776                         return -errno;
777                 }
778
779                 r = mkdir_p(q, 0755);
780                 if (r < 0)
781                         log_warning("failed to create directory %s: %m", q);
782                 return 0;
783         }
784
785         if (arg_link_journal == LINK_HOST) {
786                 r = mkdir_p(p, 0755);
787                 if (r < 0) {
788                         log_error("Failed to create %s: %m", p);
789                         return r;
790                 }
791
792         } else if (access(p, F_OK) < 0)
793                 return 0;
794
795         if (dir_is_empty(q) == 0) {
796                 log_error("%s not empty.", q);
797                 return -ENOTEMPTY;
798         }
799
800         r = mkdir_p(q, 0755);
801         if (r < 0) {
802                 log_error("Failed to create %s: %m", q);
803                 return r;
804         }
805
806         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
807                 log_error("Failed to bind mount journal from host into guest: %m");
808                 return -errno;
809         }
810
811         return 0;
812 }
813
814 static int drop_capabilities(void) {
815         return capability_bounding_set_drop(~arg_retain, false);
816 }
817
/* Checks whether "path" looks like the root directory of an OS
 * installation. We use /bin/sh as flag file if something is an OS.
 *
 * Returns 1 if <path>/bin/sh exists, 0 if it does not (or cannot be
 * checked), and -ENOMEM if memory runs out. */
static int is_os_tree(const char *path) {
        char *shell;
        int found;

        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found;
}
831
832 static int process_pty(int master, pid_t pid, sigset_t *mask) {
833
834         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
835         size_t in_buffer_full = 0, out_buffer_full = 0;
836         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
837         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
838         int ep = -1, signal_fd = -1, r;
839         bool tried_orderly_shutdown = false;
840
841         assert(master >= 0);
842         assert(pid > 0);
843         assert(mask);
844
845         fd_nonblock(STDIN_FILENO, 1);
846         fd_nonblock(STDOUT_FILENO, 1);
847         fd_nonblock(master, 1);
848
849         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
850         if (signal_fd < 0) {
851                 log_error("signalfd(): %m");
852                 r = -errno;
853                 goto finish;
854         }
855
856         ep = epoll_create1(EPOLL_CLOEXEC);
857         if (ep < 0) {
858                 log_error("Failed to create epoll: %m");
859                 r = -errno;
860                 goto finish;
861         }
862
863         /* We read from STDIN only if this is actually a TTY,
864          * otherwise we assume non-interactivity. */
865         if (isatty(STDIN_FILENO)) {
866                 zero(stdin_ev);
867                 stdin_ev.events = EPOLLIN|EPOLLET;
868                 stdin_ev.data.fd = STDIN_FILENO;
869
870                 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
871                         log_error("Failed to register STDIN in epoll: %m");
872                         r = -errno;
873                         goto finish;
874                 }
875         }
876
877         zero(stdout_ev);
878         stdout_ev.events = EPOLLOUT|EPOLLET;
879         stdout_ev.data.fd = STDOUT_FILENO;
880
881         zero(master_ev);
882         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
883         master_ev.data.fd = master;
884
885         zero(signal_ev);
886         signal_ev.events = EPOLLIN;
887         signal_ev.data.fd = signal_fd;
888
889         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
890                 if (errno != EPERM) {
891                         log_error("Failed to register stdout in epoll: %m");
892                         r = -errno;
893                         goto finish;
894                 }
895                 /* stdout without epoll support. Likely redirected to regular file. */
896                 stdout_writable = true;
897         }
898
899         if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
900             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
901                 log_error("Failed to register fds in epoll: %m");
902                 r = -errno;
903                 goto finish;
904         }
905
906         for (;;) {
907                 struct epoll_event ev[16];
908                 ssize_t k;
909                 int i, nfds;
910
911                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
912                 if (nfds < 0) {
913
914                         if (errno == EINTR || errno == EAGAIN)
915                                 continue;
916
917                         log_error("epoll_wait(): %m");
918                         r = -errno;
919                         goto finish;
920                 }
921
922                 assert(nfds >= 1);
923
924                 for (i = 0; i < nfds; i++) {
925                         if (ev[i].data.fd == STDIN_FILENO) {
926
927                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
928                                         stdin_readable = true;
929
930                         } else if (ev[i].data.fd == STDOUT_FILENO) {
931
932                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
933                                         stdout_writable = true;
934
935                         } else if (ev[i].data.fd == master) {
936
937                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
938                                         master_readable = true;
939
940                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
941                                         master_writable = true;
942
943                         } else if (ev[i].data.fd == signal_fd) {
944                                 struct signalfd_siginfo sfsi;
945                                 ssize_t n;
946
947                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
948                                 if (n != sizeof(sfsi)) {
949
950                                         if (n >= 0) {
951                                                 log_error("Failed to read from signalfd: invalid block size");
952                                                 r = -EIO;
953                                                 goto finish;
954                                         }
955
956                                         if (errno != EINTR && errno != EAGAIN) {
957                                                 log_error("Failed to read from signalfd: %m");
958                                                 r = -errno;
959                                                 goto finish;
960                                         }
961                                 } else {
962
963                                         if (sfsi.ssi_signo == SIGWINCH) {
964                                                 struct winsize ws;
965
966                                                 /* The window size changed, let's forward that. */
967                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
968                                                         ioctl(master, TIOCSWINSZ, &ws);
969                                         } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
970
971                                                 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
972
973                                                 /* This only works for systemd... */
974                                                 tried_orderly_shutdown = true;
975                                                 kill(pid, SIGRTMIN+3);
976
977                                         } else {
978                                                 r = 0;
979                                                 goto finish;
980                                         }
981                                 }
982                         }
983                 }
984
985                 while ((stdin_readable && in_buffer_full <= 0) ||
986                        (master_writable && in_buffer_full > 0) ||
987                        (master_readable && out_buffer_full <= 0) ||
988                        (stdout_writable && out_buffer_full > 0)) {
989
990                         if (stdin_readable && in_buffer_full < LINE_MAX) {
991
992                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
993                                 if (k < 0) {
994
995                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
996                                                 stdin_readable = false;
997                                         else {
998                                                 log_error("read(): %m");
999                                                 r = -errno;
1000                                                 goto finish;
1001                                         }
1002                                 } else
1003                                         in_buffer_full += (size_t) k;
1004                         }
1005
1006                         if (master_writable && in_buffer_full > 0) {
1007
1008                                 k = write(master, in_buffer, in_buffer_full);
1009                                 if (k < 0) {
1010
1011                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1012                                                 master_writable = false;
1013                                         else {
1014                                                 log_error("write(): %m");
1015                                                 r = -errno;
1016                                                 goto finish;
1017                                         }
1018
1019                                 } else {
1020                                         assert(in_buffer_full >= (size_t) k);
1021                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1022                                         in_buffer_full -= k;
1023                                 }
1024                         }
1025
1026                         if (master_readable && out_buffer_full < LINE_MAX) {
1027
1028                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1029                                 if (k < 0) {
1030
1031                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1032                                                 master_readable = false;
1033                                         else {
1034                                                 log_error("read(): %m");
1035                                                 r = -errno;
1036                                                 goto finish;
1037                                         }
1038                                 }  else
1039                                         out_buffer_full += (size_t) k;
1040                         }
1041
1042                         if (stdout_writable && out_buffer_full > 0) {
1043
1044                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1045                                 if (k < 0) {
1046
1047                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1048                                                 stdout_writable = false;
1049                                         else {
1050                                                 log_error("write(): %m");
1051                                                 r = -errno;
1052                                                 goto finish;
1053                                         }
1054
1055                                 } else {
1056                                         assert(out_buffer_full >= (size_t) k);
1057                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1058                                         out_buffer_full -= k;
1059                                 }
1060                         }
1061                 }
1062         }
1063
1064 finish:
1065         if (ep >= 0)
1066                 close_nointr_nofail(ep);
1067
1068         if (signal_fd >= 0)
1069                 close_nointr_nofail(signal_fd);
1070
1071         return r;
1072 }
1073
1074 int main(int argc, char *argv[]) {
1075         pid_t pid = 0;
1076         int r = EXIT_FAILURE, k;
1077         char *oldcg = NULL, *newcg = NULL;
1078         char **controller = NULL;
1079         int master = -1, n_fd_passed;
1080         const char *console = NULL;
1081         struct termios saved_attr, raw_attr;
1082         sigset_t mask;
1083         bool saved_attr_valid = false;
1084         struct winsize ws;
1085         int kmsg_socket_pair[2] = { -1, -1 };
1086         FDSet *fds = NULL;
1087
1088         log_parse_environment();
1089         log_open();
1090
1091         r = parse_argv(argc, argv);
1092         if (r <= 0)
1093                 goto finish;
1094
1095         if (arg_directory) {
1096                 char *p;
1097
1098                 p = path_make_absolute_cwd(arg_directory);
1099                 free(arg_directory);
1100                 arg_directory = p;
1101         } else
1102                 arg_directory = get_current_dir_name();
1103
1104         if (!arg_directory) {
1105                 log_error("Failed to determine path");
1106                 goto finish;
1107         }
1108
1109         path_kill_slashes(arg_directory);
1110
1111         if (geteuid() != 0) {
1112                 log_error("Need to be root.");
1113                 goto finish;
1114         }
1115
1116         if (sd_booted() <= 0) {
1117                 log_error("Not running on a systemd system.");
1118                 goto finish;
1119         }
1120
1121         if (path_equal(arg_directory, "/")) {
1122                 log_error("Spawning container on root directory not supported.");
1123                 goto finish;
1124         }
1125
1126         if (is_os_tree(arg_directory) <= 0) {
1127                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1128                 goto finish;
1129         }
1130
1131         log_close();
1132         n_fd_passed = sd_listen_fds(false);
1133         if (n_fd_passed > 0) {
1134                 k = fdset_new_listen_fds(&fds, false);
1135                 if (k < 0) {
1136                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1137                         goto finish;
1138                 }
1139         }
1140         fdset_close_others(fds);
1141         log_open();
1142
1143         k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1144         if (k < 0) {
1145                 log_error("Failed to determine current cgroup: %s", strerror(-k));
1146                 goto finish;
1147         }
1148
1149         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1150                 log_error("Failed to allocate cgroup path.");
1151                 goto finish;
1152         }
1153
1154         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1155         if (k < 0)  {
1156                 log_error("Failed to create cgroup: %s", strerror(-k));
1157                 goto finish;
1158         }
1159
1160         STRV_FOREACH(controller, arg_controllers) {
1161                 k = cg_create_and_attach(*controller, newcg, 0);
1162                 if (k < 0)
1163                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1164         }
1165
1166         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1167         if (master < 0) {
1168                 log_error("Failed to acquire pseudo tty: %m");
1169                 goto finish;
1170         }
1171
1172         console = ptsname(master);
1173         if (!console) {
1174                 log_error("Failed to determine tty name: %m");
1175                 goto finish;
1176         }
1177
1178         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1179
1180         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1181                 ioctl(master, TIOCSWINSZ, &ws);
1182
1183         if (unlockpt(master) < 0) {
1184                 log_error("Failed to unlock tty: %m");
1185                 goto finish;
1186         }
1187
1188         if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1189                 saved_attr_valid = true;
1190
1191                 raw_attr = saved_attr;
1192                 cfmakeraw(&raw_attr);
1193                 raw_attr.c_lflag &= ~ECHO;
1194         }
1195
1196         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1197                 log_error("Failed to create kmsg socket pair");
1198                 goto finish;
1199         }
1200
1201         assert_se(sigemptyset(&mask) == 0);
1202         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1203         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1204
1205         for (;;) {
1206                 siginfo_t status;
1207
1208                 if (saved_attr_valid) {
1209                         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1210                                 log_error("Failed to set terminal attributes: %m");
1211                                 goto finish;
1212                         }
1213                 }
1214
1215                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1216                 if (pid < 0) {
1217                         if (errno == EINVAL)
1218                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1219                         else
1220                                 log_error("clone() failed: %m");
1221
1222                         goto finish;
1223                 }
1224
1225                 if (pid == 0) {
1226                         /* child */
1227
1228                         const char *home = NULL;
1229                         uid_t uid = (uid_t) -1;
1230                         gid_t gid = (gid_t) -1;
1231                         unsigned n_env = 0;
1232                         const char *envp[] = {
1233                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1234                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1235                                 NULL, /* TERM */
1236                                 NULL, /* HOME */
1237                                 NULL, /* USER */
1238                                 NULL, /* LOGNAME */
1239                                 NULL, /* container_uuid */
1240                                 NULL, /* LISTEN_FDS */
1241                                 NULL, /* LISTEN_PID */
1242                                 NULL
1243                         };
1244
1245                         envp[2] = strv_find_prefix(environ, "TERM=");
1246                         n_env = 3;
1247
1248                         close_nointr_nofail(master);
1249                         master = -1;
1250
1251                         close_nointr(STDIN_FILENO);
1252                         close_nointr(STDOUT_FILENO);
1253                         close_nointr(STDERR_FILENO);
1254
1255                         close_nointr_nofail(kmsg_socket_pair[0]);
1256                         kmsg_socket_pair[0] = -1;
1257
1258                         reset_all_signal_handlers();
1259
1260                         assert_se(sigemptyset(&mask) == 0);
1261                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1262
1263                         k = open_terminal(console, O_RDWR);
1264                         if (k != STDIN_FILENO) {
1265                                 if (k >= 0) {
1266                                         close_nointr_nofail(k);
1267                                         k = -EINVAL;
1268                                 }
1269
1270                                 log_error("Failed to open console: %s", strerror(-k));
1271                                 goto child_fail;
1272                         }
1273
1274                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1275                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1276                                 log_error("Failed to duplicate console: %m");
1277                                 goto child_fail;
1278                         }
1279
1280                         if (setsid() < 0) {
1281                                 log_error("setsid() failed: %m");
1282                                 goto child_fail;
1283                         }
1284
1285                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1286                                 log_error("PR_SET_PDEATHSIG failed: %m");
1287                                 goto child_fail;
1288                         }
1289
1290                         /* Mark everything as slave, so that we still
1291                          * receive mounts from the real root, but don't
1292                          * propagate mounts to the real root. */
1293                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1294                                 log_error("MS_SLAVE|MS_REC failed: %m");
1295                                 goto child_fail;
1296                         }
1297
1298                         /* Turn directory into bind mount */
1299                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1300                                 log_error("Failed to make bind mount.");
1301                                 goto child_fail;
1302                         }
1303
1304                         if (arg_read_only)
1305                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1306                                         log_error("Failed to make read-only.");
1307                                         goto child_fail;
1308                                 }
1309
1310                         if (mount_all(arg_directory) < 0)
1311                                 goto child_fail;
1312
1313                         if (copy_devnodes(arg_directory) < 0)
1314                                 goto child_fail;
1315
1316                         dev_setup(arg_directory);
1317
1318                         if (setup_dev_console(arg_directory, console) < 0)
1319                                 goto child_fail;
1320
1321                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1322                                 goto child_fail;
1323
1324                         close_nointr_nofail(kmsg_socket_pair[1]);
1325                         kmsg_socket_pair[1] = -1;
1326
1327                         if (setup_boot_id(arg_directory) < 0)
1328                                 goto child_fail;
1329
1330                         if (setup_timezone(arg_directory) < 0)
1331                                 goto child_fail;
1332
1333                         if (setup_resolv_conf(arg_directory) < 0)
1334                                 goto child_fail;
1335
1336                         if (setup_journal(arg_directory) < 0)
1337                                 goto child_fail;
1338
1339                         if (chdir(arg_directory) < 0) {
1340                                 log_error("chdir(%s) failed: %m", arg_directory);
1341                                 goto child_fail;
1342                         }
1343
1344                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1345                                 log_error("mount(MS_MOVE) failed: %m");
1346                                 goto child_fail;
1347                         }
1348
1349                         if (chroot(".") < 0) {
1350                                 log_error("chroot() failed: %m");
1351                                 goto child_fail;
1352                         }
1353
1354                         if (chdir("/") < 0) {
1355                                 log_error("chdir() failed: %m");
1356                                 goto child_fail;
1357                         }
1358
1359                         umask(0022);
1360
1361                         loopback_setup();
1362
1363                         if (drop_capabilities() < 0) {
1364                                 log_error("drop_capabilities() failed: %m");
1365                                 goto child_fail;
1366                         }
1367
1368                         if (arg_user) {
1369
1370                                 /* Note that this resolves user names
1371                                  * inside the container, and hence
1372                                  * accesses the NSS modules from the
1373                                  * container and not the host. This is
1374                                  * a bit weird... */
1375
1376                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1377                                         log_error("get_user_creds() failed: %m");
1378                                         goto child_fail;
1379                                 }
1380
1381                                 if (mkdir_parents_label(home, 0775) < 0) {
1382                                         log_error("mkdir_parents_label() failed: %m");
1383                                         goto child_fail;
1384                                 }
1385
1386                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1387                                         log_error("mkdir_safe_label() failed: %m");
1388                                         goto child_fail;
1389                                 }
1390
1391                                 if (initgroups((const char*)arg_user, gid) < 0) {
1392                                         log_error("initgroups() failed: %m");
1393                                         goto child_fail;
1394                                 }
1395
1396                                 if (setresgid(gid, gid, gid) < 0) {
1397                                         log_error("setregid() failed: %m");
1398                                         goto child_fail;
1399                                 }
1400
1401                                 if (setresuid(uid, uid, uid) < 0) {
1402                                         log_error("setreuid() failed: %m");
1403                                         goto child_fail;
1404                                 }
1405                         } else {
1406                                 /* Reset everything fully to 0, just in case */
1407
1408                                 if (setgroups(0, NULL) < 0) {
1409                                         log_error("setgroups() failed: %m");
1410                                         goto child_fail;
1411                                 }
1412
1413                                 if (setresgid(0, 0, 0) < 0) {
1414                                         log_error("setregid() failed: %m");
1415                                         goto child_fail;
1416                                 }
1417
1418                                 if (setresuid(0, 0, 0) < 0) {
1419                                         log_error("setreuid() failed: %m");
1420                                         goto child_fail;
1421                                 }
1422                         }
1423
1424                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1425                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1426                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1427                                 log_oom();
1428                                 goto child_fail;
1429                         }
1430
1431                         if (arg_uuid) {
1432                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1433                                         log_oom();
1434                                         goto child_fail;
1435                                 }
1436                         }
1437
1438                         if (fdset_size(fds) > 0) {
1439                                 k = fdset_cloexec(fds, false);
1440                                 if (k < 0) {
1441                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1442                                         goto child_fail;
1443                                 }
1444
1445                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1446                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) getpid()) < 0)) {
1447                                         log_oom();
1448                                         goto child_fail;
1449                                 }
1450                         }
1451
1452                         setup_hostname();
1453
1454                         if (arg_boot) {
1455                                 char **a;
1456                                 size_t l;
1457
1458                                 /* Automatically search for the init system */
1459
1460                                 l = 1 + argc - optind;
1461                                 a = newa(char*, l + 1);
1462                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1463
1464                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1465                                 execve(a[0], a, (char**) envp);
1466
1467                                 a[0] = (char*) "/lib/systemd/systemd";
1468                                 execve(a[0], a, (char**) envp);
1469
1470                                 a[0] = (char*) "/sbin/init";
1471                                 execve(a[0], a, (char**) envp);
1472                         } else if (argc > optind)
1473                                 execvpe(argv[optind], argv + optind, (char**) envp);
1474                         else {
1475                                 chdir(home ? home : "/root");
1476                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1477                         }
1478
1479                         log_error("execv() failed: %m");
1480
1481                 child_fail:
1482                         _exit(EXIT_FAILURE);
1483                 }
1484
1485                 fdset_free(fds);
1486                 fds = NULL;
1487
1488                 if (process_pty(master, pid, &mask) < 0)
1489                         goto finish;
1490
1491                 if (saved_attr_valid)
1492                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1493
1494                 r = wait_for_terminate(pid, &status);
1495                 if (r < 0) {
1496                         r = EXIT_FAILURE;
1497                         break;
1498                 }
1499
1500                 if (status.si_code == CLD_EXITED) {
1501                         if (status.si_status != 0) {
1502                                 log_error("Container failed with error code %i.", status.si_status);
1503                                 r = status.si_status;
1504                                 break;
1505                         }
1506
1507                         log_debug("Container exited successfully.");
1508                         break;
1509                 } else if (status.si_code == CLD_KILLED &&
1510                            status.si_status == SIGINT) {
1511                         log_info("Container has been shut down.");
1512                         r = 0;
1513                         break;
1514                 } else if (status.si_code == CLD_KILLED &&
1515                            status.si_status == SIGHUP) {
1516                         log_info("Container is being rebooted.");
1517                         continue;
1518                 } else if (status.si_code == CLD_KILLED ||
1519                            status.si_code == CLD_DUMPED) {
1520
1521                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1522                         r = EXIT_FAILURE;
1523                         break;
1524                 } else {
1525                         log_error("Container failed due to unknown reason.");
1526                         r = EXIT_FAILURE;
1527                         break;
1528                 }
1529         }
1530
1531 finish:
1532         if (saved_attr_valid)
1533                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1534
1535         if (master >= 0)
1536                 close_nointr_nofail(master);
1537
1538         close_pipe(kmsg_socket_pair);
1539
1540         if (oldcg)
1541                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1542
1543         if (newcg)
1544                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1545
1546         free(arg_directory);
1547         strv_free(arg_controllers);
1548         free(oldcg);
1549         free(newcg);
1550
1551         fdset_free(fds);
1552
1553         return r;
1554 }