chiark / gitweb /
job: fix merging with --ignore-dependencies
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "macro.h"
50 #include "audit.h"
51 #include "missing.h"
52 #include "cgroup-util.h"
53 #include "strv.h"
54 #include "path-util.h"
55 #include "loopback-setup.h"
56 #include "sd-id128.h"
57 #include "dev-setup.h"
58 #include "fdset.h"
59 #include "build.h"
60
/* Selects how setup_journal() ties the container's journal directory to
 * the host's; chosen with --link-journal= or -j. */
typedef enum LinkJournal {
        LINK_NO = 0,    /* setup_journal() returns right away */
        LINK_AUTO,      /* default; bind mounts only if the host directory is already there */
        LINK_HOST,      /* host directory is created and bind mounted into the guest */
        LINK_GUEST,     /* host path becomes a symlink to the guest's directory */
} LinkJournal;
67
68 static char *arg_directory = NULL;
69 static char *arg_user = NULL;
70 static char **arg_controllers = NULL;
71 static char *arg_uuid = NULL;
72 static bool arg_private_network = false;
73 static bool arg_read_only = false;
74 static bool arg_boot = false;
75 static LinkJournal arg_link_journal = LINK_AUTO;
76 static uint64_t arg_retain =
77         (1ULL << CAP_CHOWN) |
78         (1ULL << CAP_DAC_OVERRIDE) |
79         (1ULL << CAP_DAC_READ_SEARCH) |
80         (1ULL << CAP_FOWNER) |
81         (1ULL << CAP_FSETID) |
82         (1ULL << CAP_IPC_OWNER) |
83         (1ULL << CAP_KILL) |
84         (1ULL << CAP_LEASE) |
85         (1ULL << CAP_LINUX_IMMUTABLE) |
86         (1ULL << CAP_NET_BIND_SERVICE) |
87         (1ULL << CAP_NET_BROADCAST) |
88         (1ULL << CAP_NET_RAW) |
89         (1ULL << CAP_SETGID) |
90         (1ULL << CAP_SETFCAP) |
91         (1ULL << CAP_SETPCAP) |
92         (1ULL << CAP_SETUID) |
93         (1ULL << CAP_SYS_ADMIN) |
94         (1ULL << CAP_SYS_CHROOT) |
95         (1ULL << CAP_SYS_NICE) |
96         (1ULL << CAP_SYS_PTRACE) |
97         (1ULL << CAP_SYS_TTY_CONFIG) |
98         (1ULL << CAP_SYS_RESOURCE) |
99         (1ULL << CAP_SYS_BOOT) |
100         (1ULL << CAP_AUDIT_WRITE) |
101         (1ULL << CAP_AUDIT_CONTROL);
102
/* Prints the usage summary to stdout. Always returns 0. */
static int help(void) {

        /* Note: -j is documented as "guest" because parse_argv() maps it
         * to LINK_GUEST. */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help               Show this help\n"
               "  --version               Print version string\n"
               "  -D --directory=NAME     Root directory for the container\n"
               "  -b --boot               Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER          Run the command under specified user or uid\n"
               "  -C --controllers=LIST   Put the container in specified comma-separated cgroup hierarchies\n"
               "     --uuid=UUID          Set a specific machine UUID for the container\n"
               "     --private-network    Disable network in container\n"
               "     --read-only          Mount the root directory read-only\n"
               "     --capability=CAP     In addition to the default, retain specified capability\n"
               "     --link-journal=MODE  Link up guest journal, one of no, auto, guest, host\n"
               "  -j                      Equivalent to --link-journal=guest\n",
               program_invocation_short_name);

        return 0;
}
123
124 static int parse_argv(int argc, char *argv[]) {
125
126         enum {
127                 ARG_VERSION = 0x100,
128                 ARG_PRIVATE_NETWORK,
129                 ARG_UUID,
130                 ARG_READ_ONLY,
131                 ARG_CAPABILITY,
132                 ARG_LINK_JOURNAL
133         };
134
135         static const struct option options[] = {
136                 { "help",            no_argument,       NULL, 'h'                 },
137                 { "version",         no_argument,       NULL, ARG_VERSION         },
138                 { "directory",       required_argument, NULL, 'D'                 },
139                 { "user",            required_argument, NULL, 'u'                 },
140                 { "controllers",     required_argument, NULL, 'C'                 },
141                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
142                 { "boot",            no_argument,       NULL, 'b'                 },
143                 { "uuid",            required_argument, NULL, ARG_UUID            },
144                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
145                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
146                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
147                 { NULL,              0,                 NULL, 0                   }
148         };
149
150         int c;
151
152         assert(argc >= 0);
153         assert(argv);
154
155         while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
156
157                 switch (c) {
158
159                 case 'h':
160                         help();
161                         return 0;
162
163                 case ARG_VERSION:
164                         puts(PACKAGE_STRING);
165                         puts(SYSTEMD_FEATURES);
166                         return 0;
167
168                 case 'D':
169                         free(arg_directory);
170                         arg_directory = canonicalize_file_name(optarg);
171                         if (!arg_directory) {
172                                 log_error("Failed to canonicalize root directory.");
173                                 return -ENOMEM;
174                         }
175
176                         break;
177
178                 case 'u':
179                         free(arg_user);
180                         if (!(arg_user = strdup(optarg))) {
181                                 log_error("Failed to duplicate user name.");
182                                 return -ENOMEM;
183                         }
184
185                         break;
186
187                 case 'C':
188                         strv_free(arg_controllers);
189                         arg_controllers = strv_split(optarg, ",");
190                         if (!arg_controllers) {
191                                 log_error("Failed to split controllers list.");
192                                 return -ENOMEM;
193                         }
194                         strv_uniq(arg_controllers);
195
196                         break;
197
198                 case ARG_PRIVATE_NETWORK:
199                         arg_private_network = true;
200                         break;
201
202                 case 'b':
203                         arg_boot = true;
204                         break;
205
206                 case ARG_UUID:
207                         arg_uuid = optarg;
208                         break;
209
210                 case ARG_READ_ONLY:
211                         arg_read_only = true;
212                         break;
213
214                 case ARG_CAPABILITY: {
215                         char *state, *word;
216                         size_t length;
217
218                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
219                                 cap_value_t cap;
220                                 char *t;
221
222                                 t = strndup(word, length);
223                                 if (!t)
224                                         return log_oom();
225
226                                 if (cap_from_name(t, &cap) < 0) {
227                                         log_error("Failed to parse capability %s.", t);
228                                         free(t);
229                                         return -EINVAL;
230                                 }
231
232                                 free(t);
233                                 arg_retain |= 1ULL << (uint64_t) cap;
234                         }
235
236                         break;
237                 }
238
239                 case 'j':
240                         arg_link_journal = LINK_GUEST;
241                         break;
242
243                 case ARG_LINK_JOURNAL:
244                         if (streq(optarg, "auto"))
245                                 arg_link_journal = LINK_AUTO;
246                         else if (streq(optarg, "no"))
247                                 arg_link_journal = LINK_NO;
248                         else if (streq(optarg, "guest"))
249                                 arg_link_journal = LINK_GUEST;
250                         else if (streq(optarg, "host"))
251                                 arg_link_journal = LINK_HOST;
252                         else {
253                                 log_error("Failed to parse link journal mode %s", optarg);
254                                 return -EINVAL;
255                         }
256
257                         break;
258
259                 case '?':
260                         return -EINVAL;
261
262                 default:
263                         log_error("Unknown option code %c", c);
264                         return -EINVAL;
265                 }
266         }
267
268         return 1;
269 }
270
271 static int mount_all(const char *dest) {
272
273         typedef struct MountPoint {
274                 const char *what;
275                 const char *where;
276                 const char *type;
277                 const char *options;
278                 unsigned long flags;
279                 bool fatal;
280         } MountPoint;
281
282         static const MountPoint mount_table[] = {
283                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
284                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
285                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
286                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
287                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
288                 { "/dev/pts",  "/dev/pts",  NULL,    NULL,       MS_BIND,                      true  },
289                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
290                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
291 #ifdef HAVE_SELINUX
292                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
293                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
294 #endif
295         };
296
297         unsigned k;
298         int r = 0;
299
300         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
301                 char _cleanup_free_ *where = NULL;
302                 int t;
303
304                 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
305                         log_oom();
306
307                         if (r == 0)
308                                 r = -ENOMEM;
309
310                         break;
311                 }
312
313                 t = path_is_mount_point(where, true);
314                 if (t < 0) {
315                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
316
317                         if (r == 0)
318                                 r = t;
319
320                         continue;
321                 }
322
323                 /* Skip this entry if it is not a remount. */
324                 if (mount_table[k].what && t > 0)
325                         continue;
326
327                 mkdir_p_label(where, 0755);
328
329                 if (mount(mount_table[k].what,
330                           where,
331                           mount_table[k].type,
332                           mount_table[k].flags,
333                           mount_table[k].options) < 0 &&
334                     mount_table[k].fatal) {
335
336                         log_error("mount(%s) failed: %m", where);
337
338                         if (r == 0)
339                                 r = -errno;
340                 }
341         }
342
343         return r;
344 }
345
346 static int setup_timezone(const char *dest) {
347         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
348         char *z, *y;
349         int r;
350
351         assert(dest);
352
353         /* Fix the timezone, if possible */
354         r = readlink_malloc("/etc/localtime", &p);
355         if (r < 0) {
356                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
357                 return 0;
358         }
359
360         z = path_startswith(p, "../usr/share/zoneinfo/");
361         if (!z)
362                 z = path_startswith(p, "/usr/share/zoneinfo/");
363         if (!z) {
364                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
365                 return 0;
366         }
367
368         where = strappend(dest, "/etc/localtime");
369         if (!where)
370                 return log_oom();
371
372         r = readlink_malloc(where, &q);
373         if (r >= 0) {
374                 y = path_startswith(q, "../usr/share/zoneinfo/");
375                 if (!y)
376                         y = path_startswith(q, "/usr/share/zoneinfo/");
377
378
379                 /* Already pointing to the right place? Then do nothing .. */
380                 if (y && streq(y, z))
381                         return 0;
382         }
383
384         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
385         if (!check)
386                 return log_oom();
387
388         if (access(check, F_OK) < 0) {
389                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
390                 return 0;
391         }
392
393         what = strappend("../usr/share/zoneinfo/", z);
394         if (!what)
395                 return log_oom();
396
397         unlink(where);
398         if (symlink(what, where) < 0) {
399                 log_error("Failed to correct timezone of container: %m");
400                 return 0;
401         }
402
403         return 0;
404 }
405
406 static int setup_resolv_conf(const char *dest) {
407         char *where;
408
409         assert(dest);
410
411         if (arg_private_network)
412                 return 0;
413
414         /* Fix resolv.conf, if possible */
415         where = strappend(dest, "/etc/resolv.conf");
416         if (!where)
417                 return log_oom();
418
419         /* We don't really care for the results of this really. If it
420          * fails, it fails, but meh... */
421         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
422                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
423
424         free(where);
425
426         return 0;
427 }
428
429 static int setup_boot_id(const char *dest) {
430         char _cleanup_free_ *from = NULL, *to = NULL;
431         sd_id128_t rnd;
432         char as_uuid[37];
433         int r;
434
435         assert(dest);
436
437         /* Generate a new randomized boot ID, so that each boot-up of
438          * the container gets a new one */
439
440         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
441         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
442         if (!from || !to)
443                 return log_oom();
444
445         r = sd_id128_randomize(&rnd);
446         if (r < 0) {
447                 log_error("Failed to generate random boot id: %s", strerror(-r));
448                 return r;
449         }
450
451         snprintf(as_uuid, sizeof(as_uuid),
452                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
453                  SD_ID128_FORMAT_VAL(rnd));
454         char_array_0(as_uuid);
455
456         r = write_one_line_file(from, as_uuid);
457         if (r < 0) {
458                 log_error("Failed to write boot id: %s", strerror(-r));
459                 return r;
460         }
461
462         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
463                 log_error("Failed to bind mount boot id: %m");
464                 r = -errno;
465         } else
466                 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
467
468         unlink(from);
469         return r;
470 }
471
472 static int copy_devnodes(const char *dest) {
473
474         static const char devnodes[] =
475                 "null\0"
476                 "zero\0"
477                 "full\0"
478                 "random\0"
479                 "urandom\0"
480                 "tty\0"
481                 "ptmx\0";
482
483         const char *d;
484         int r = 0;
485         mode_t _cleanup_umask_ u;
486
487         assert(dest);
488
489         u = umask(0000);
490
491         NULSTR_FOREACH(d, devnodes) {
492                 struct stat st;
493                 char _cleanup_free_ *from = NULL, *to = NULL;
494
495                 asprintf(&from, "/dev/%s", d);
496                 asprintf(&to, "%s/dev/%s", dest, d);
497
498                 if (!from || !to) {
499                         log_oom();
500
501                         if (r == 0)
502                                 r = -ENOMEM;
503
504                         break;
505                 }
506
507                 if (stat(from, &st) < 0) {
508
509                         if (errno != ENOENT) {
510                                 log_error("Failed to stat %s: %m", from);
511                                 if (r == 0)
512                                         r = -errno;
513                         }
514
515                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
516
517                         log_error("%s is not a char or block device, cannot copy", from);
518                         if (r == 0)
519                                 r = -EIO;
520
521                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
522
523                         log_error("mknod(%s) failed: %m", dest);
524                         if (r == 0)
525                                 r = -errno;
526                 }
527         }
528
529         return r;
530 }
531
532 static int setup_dev_console(const char *dest, const char *console) {
533         struct stat st;
534         char _cleanup_free_ *to = NULL;
535         int r;
536         mode_t _cleanup_umask_ u;
537
538         assert(dest);
539         assert(console);
540
541         u = umask(0000);
542
543         if (stat(console, &st) < 0) {
544                 log_error("Failed to stat %s: %m", console);
545                 return -errno;
546
547         } else if (!S_ISCHR(st.st_mode)) {
548                 log_error("/dev/console is not a char device");
549                 return -EIO;
550         }
551
552         r = chmod_and_chown(console, 0600, 0, 0);
553         if (r < 0) {
554                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
555                 return r;
556         }
557
558         if (asprintf(&to, "%s/dev/console", dest) < 0)
559                 return log_oom();
560
561         /* We need to bind mount the right tty to /dev/console since
562          * ptys can only exist on pts file systems. To have something
563          * to bind mount things on we create a device node first, that
564          * has the right major/minor (note that the major minor
565          * doesn't actually matter here, since we mount it over
566          * anyway). */
567
568         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
569                 log_error("mknod() for /dev/console failed: %m");
570                 return -errno;
571         }
572
573         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
574                 log_error("Bind mount for /dev/console failed: %m");
575                 return -errno;
576         }
577
578         return 0;
579 }
580
581 static int setup_kmsg(const char *dest, int kmsg_socket) {
582         char _cleanup_free_ *from = NULL, *to = NULL;
583         int r, fd, k;
584         mode_t _cleanup_umask_ u;
585         union {
586                 struct cmsghdr cmsghdr;
587                 uint8_t buf[CMSG_SPACE(sizeof(int))];
588         } control;
589         struct msghdr mh;
590         struct cmsghdr *cmsg;
591
592         assert(dest);
593         assert(kmsg_socket >= 0);
594
595         u = umask(0000);
596
597         /* We create the kmsg FIFO as /dev/kmsg, but immediately
598          * delete it after bind mounting it to /proc/kmsg. While FIFOs
599          * on the reading side behave very similar to /proc/kmsg,
600          * their writing side behaves differently from /dev/kmsg in
601          * that writing blocks when nothing is reading. In order to
602          * avoid any problems with containers deadlocking due to this
603          * we simply make /dev/kmsg unavailable to the container. */
604         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
605             asprintf(&to, "%s/proc/kmsg", dest) < 0)
606                 return log_oom();
607
608         if (mkfifo(from, 0600) < 0) {
609                 log_error("mkfifo() for /dev/kmsg failed: %m");
610                 return -errno;
611         }
612
613         r = chmod_and_chown(from, 0600, 0, 0);
614         if (r < 0) {
615                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
616                 return r;
617         }
618
619         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
620                 log_error("Bind mount for /proc/kmsg failed: %m");
621                 return -errno;
622         }
623
624         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
625         if (fd < 0) {
626                 log_error("Failed to open fifo: %m");
627                 return -errno;
628         }
629
630         zero(mh);
631         zero(control);
632
633         mh.msg_control = &control;
634         mh.msg_controllen = sizeof(control);
635
636         cmsg = CMSG_FIRSTHDR(&mh);
637         cmsg->cmsg_level = SOL_SOCKET;
638         cmsg->cmsg_type = SCM_RIGHTS;
639         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
640         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
641
642         mh.msg_controllen = cmsg->cmsg_len;
643
644         /* Store away the fd in the socket, so that it stays open as
645          * long as we run the child */
646         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
647         close_nointr_nofail(fd);
648
649         if (k < 0) {
650                 log_error("Failed to send FIFO fd: %m");
651                 return -errno;
652         }
653
654         /* And now make the FIFO unavailable as /dev/kmsg... */
655         unlink(from);
656         return 0;
657 }
658
659 static int setup_hostname(void) {
660         char *hn;
661         int r = 0;
662
663         hn = path_get_file_name(arg_directory);
664         if (hn) {
665                 hn = strdup(hn);
666                 if (!hn)
667                         return -ENOMEM;
668
669                 hostname_cleanup(hn);
670
671                 if (!isempty(hn))
672                         if (sethostname(hn, strlen(hn)) < 0)
673                                 r = -errno;
674
675                 free(hn);
676         }
677
678         return r;
679 }
680
681 static int setup_journal(const char *directory) {
682         sd_id128_t machine_id;
683         char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
684         char *id;
685         int r;
686
687         if (arg_link_journal == LINK_NO)
688                 return 0;
689
690         p = strappend(directory, "/etc/machine-id");
691         if (!p)
692                 return log_oom();
693
694         r = read_one_line_file(p, &b);
695         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
696                 return 0;
697         else if (r < 0) {
698                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
699                 return r;
700         }
701
702         id = strstrip(b);
703         if (isempty(id) && arg_link_journal == LINK_AUTO)
704                 return 0;
705
706         /* Verify validity */
707         r = sd_id128_from_string(id, &machine_id);
708         if (r < 0) {
709                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
710                 return r;
711         }
712
713         free(p);
714         p = strappend("/var/log/journal/", id);
715         q = strjoin(directory, "/var/log/journal/", id, NULL);
716         if (!p || !q)
717                 return log_oom();
718
719         if (path_is_mount_point(p, false) > 0) {
720                 if (arg_link_journal != LINK_AUTO) {
721                         log_error("%s: already a mount point, refusing to use for journal", p);
722                         return -EEXIST;
723                 }
724
725                 return 0;
726         }
727
728         if (path_is_mount_point(q, false) > 0) {
729                 if (arg_link_journal != LINK_AUTO) {
730                         log_error("%s: already a mount point, refusing to use for journal", q);
731                         return -EEXIST;
732                 }
733
734                 return 0;
735         }
736
737         r = readlink_and_make_absolute(p, &d);
738         if (r >= 0) {
739                 if ((arg_link_journal == LINK_GUEST ||
740                      arg_link_journal == LINK_AUTO) &&
741                     path_equal(d, q)) {
742
743                         r = mkdir_p(q, 0755);
744                         if (r < 0)
745                                 log_warning("failed to create directory %s: %m", q);
746                         return 0;
747                 }
748
749                 if (unlink(p) < 0) {
750                         log_error("Failed to remove symlink %s: %m", p);
751                         return -errno;
752                 }
753         } else if (r == -EINVAL) {
754
755                 if (arg_link_journal == LINK_GUEST &&
756                     rmdir(p) < 0) {
757
758                         if (errno == ENOTDIR) {
759                                 log_error("%s already exists and is neither a symlink nor a directory", p);
760                                 return r;
761                         } else {
762                                 log_error("Failed to remove %s: %m", p);
763                                 return -errno;
764                         }
765                 }
766         } else if (r != -ENOENT) {
767                 log_error("readlink(%s) failed: %m", p);
768                 return r;
769         }
770
771         if (arg_link_journal == LINK_GUEST) {
772
773                 if (symlink(q, p) < 0) {
774                         log_error("Failed to symlink %s to %s: %m", q, p);
775                         return -errno;
776                 }
777
778                 r = mkdir_p(q, 0755);
779                 if (r < 0)
780                         log_warning("failed to create directory %s: %m", q);
781                 return 0;
782         }
783
784         if (arg_link_journal == LINK_HOST) {
785                 r = mkdir_p(p, 0755);
786                 if (r < 0) {
787                         log_error("Failed to create %s: %m", p);
788                         return r;
789                 }
790
791         } else if (access(p, F_OK) < 0)
792                 return 0;
793
794         if (dir_is_empty(q) == 0) {
795                 log_error("%s not empty.", q);
796                 return -ENOTEMPTY;
797         }
798
799         r = mkdir_p(q, 0755);
800         if (r < 0) {
801                 log_error("Failed to create %s: %m", q);
802                 return r;
803         }
804
805         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
806                 log_error("Failed to bind mount journal from host into guest: %m");
807                 return -errno;
808         }
809
810         return 0;
811 }
812
813 static int drop_capabilities(void) {
814         return capability_bounding_set_drop(~arg_retain, false);
815 }
816
/* Checks whether "path" looks like the root of an OS installation, which
 * is taken to be the case if <path>/bin/sh exists.
 *
 * Returns 1 if it does, 0 if not, and -ENOMEM if the path to probe could
 * not be allocated. */
static int is_os_tree(const char *path) {
        char *probe;
        int exists;

        /* We use /bin/sh as flag file if something is an OS */
        if (asprintf(&probe, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        exists = access(probe, F_OK) >= 0;
        free(probe);

        return exists;
}
830
831 static int process_pty(int master, pid_t pid, sigset_t *mask) {
832
833         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
834         size_t in_buffer_full = 0, out_buffer_full = 0;
835         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
836         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
837         int ep = -1, signal_fd = -1, r;
838         bool tried_orderly_shutdown = false;
839
840         assert(master >= 0);
841         assert(pid > 0);
842         assert(mask);
843
844         fd_nonblock(STDIN_FILENO, 1);
845         fd_nonblock(STDOUT_FILENO, 1);
846         fd_nonblock(master, 1);
847
848         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
849         if (signal_fd < 0) {
850                 log_error("signalfd(): %m");
851                 r = -errno;
852                 goto finish;
853         }
854
855         ep = epoll_create1(EPOLL_CLOEXEC);
856         if (ep < 0) {
857                 log_error("Failed to create epoll: %m");
858                 r = -errno;
859                 goto finish;
860         }
861
862         /* We read from STDIN only if this is actually a TTY,
863          * otherwise we assume non-interactivity. */
864         if (isatty(STDIN_FILENO)) {
865                 zero(stdin_ev);
866                 stdin_ev.events = EPOLLIN|EPOLLET;
867                 stdin_ev.data.fd = STDIN_FILENO;
868
869                 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
870                         log_error("Failed to register STDIN in epoll: %m");
871                         r = -errno;
872                         goto finish;
873                 }
874         }
875
876         zero(stdout_ev);
877         stdout_ev.events = EPOLLOUT|EPOLLET;
878         stdout_ev.data.fd = STDOUT_FILENO;
879
880         zero(master_ev);
881         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
882         master_ev.data.fd = master;
883
884         zero(signal_ev);
885         signal_ev.events = EPOLLIN;
886         signal_ev.data.fd = signal_fd;
887
888         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
889             epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
890             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
891                 log_error("Failed to register fds in epoll: %m");
892                 r = -errno;
893                 goto finish;
894         }
895
896         for (;;) {
897                 struct epoll_event ev[16];
898                 ssize_t k;
899                 int i, nfds;
900
901                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
902                 if (nfds < 0) {
903
904                         if (errno == EINTR || errno == EAGAIN)
905                                 continue;
906
907                         log_error("epoll_wait(): %m");
908                         r = -errno;
909                         goto finish;
910                 }
911
912                 assert(nfds >= 1);
913
914                 for (i = 0; i < nfds; i++) {
915                         if (ev[i].data.fd == STDIN_FILENO) {
916
917                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
918                                         stdin_readable = true;
919
920                         } else if (ev[i].data.fd == STDOUT_FILENO) {
921
922                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
923                                         stdout_writable = true;
924
925                         } else if (ev[i].data.fd == master) {
926
927                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
928                                         master_readable = true;
929
930                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
931                                         master_writable = true;
932
933                         } else if (ev[i].data.fd == signal_fd) {
934                                 struct signalfd_siginfo sfsi;
935                                 ssize_t n;
936
937                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
938                                 if (n != sizeof(sfsi)) {
939
940                                         if (n >= 0) {
941                                                 log_error("Failed to read from signalfd: invalid block size");
942                                                 r = -EIO;
943                                                 goto finish;
944                                         }
945
946                                         if (errno != EINTR && errno != EAGAIN) {
947                                                 log_error("Failed to read from signalfd: %m");
948                                                 r = -errno;
949                                                 goto finish;
950                                         }
951                                 } else {
952
953                                         if (sfsi.ssi_signo == SIGWINCH) {
954                                                 struct winsize ws;
955
956                                                 /* The window size changed, let's forward that. */
957                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
958                                                         ioctl(master, TIOCSWINSZ, &ws);
959                                         } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
960
961                                                 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
962
963                                                 /* This only works for systemd... */
964                                                 tried_orderly_shutdown = true;
965                                                 kill(pid, SIGRTMIN+3);
966
967                                         } else {
968                                                 r = 0;
969                                                 goto finish;
970                                         }
971                                 }
972                         }
973                 }
974
975                 while ((stdin_readable && in_buffer_full <= 0) ||
976                        (master_writable && in_buffer_full > 0) ||
977                        (master_readable && out_buffer_full <= 0) ||
978                        (stdout_writable && out_buffer_full > 0)) {
979
980                         if (stdin_readable && in_buffer_full < LINE_MAX) {
981
982                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
983                                 if (k < 0) {
984
985                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
986                                                 stdin_readable = false;
987                                         else {
988                                                 log_error("read(): %m");
989                                                 r = -errno;
990                                                 goto finish;
991                                         }
992                                 } else
993                                         in_buffer_full += (size_t) k;
994                         }
995
996                         if (master_writable && in_buffer_full > 0) {
997
998                                 k = write(master, in_buffer, in_buffer_full);
999                                 if (k < 0) {
1000
1001                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1002                                                 master_writable = false;
1003                                         else {
1004                                                 log_error("write(): %m");
1005                                                 r = -errno;
1006                                                 goto finish;
1007                                         }
1008
1009                                 } else {
1010                                         assert(in_buffer_full >= (size_t) k);
1011                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1012                                         in_buffer_full -= k;
1013                                 }
1014                         }
1015
1016                         if (master_readable && out_buffer_full < LINE_MAX) {
1017
1018                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1019                                 if (k < 0) {
1020
1021                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1022                                                 master_readable = false;
1023                                         else {
1024                                                 log_error("read(): %m");
1025                                                 r = -errno;
1026                                                 goto finish;
1027                                         }
1028                                 }  else
1029                                         out_buffer_full += (size_t) k;
1030                         }
1031
1032                         if (stdout_writable && out_buffer_full > 0) {
1033
1034                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1035                                 if (k < 0) {
1036
1037                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1038                                                 stdout_writable = false;
1039                                         else {
1040                                                 log_error("write(): %m");
1041                                                 r = -errno;
1042                                                 goto finish;
1043                                         }
1044
1045                                 } else {
1046                                         assert(out_buffer_full >= (size_t) k);
1047                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1048                                         out_buffer_full -= k;
1049                                 }
1050                         }
1051                 }
1052         }
1053
1054 finish:
1055         if (ep >= 0)
1056                 close_nointr_nofail(ep);
1057
1058         if (signal_fd >= 0)
1059                 close_nointr_nofail(signal_fd);
1060
1061         return r;
1062 }
1063
1064 int main(int argc, char *argv[]) {
1065         pid_t pid = 0;
1066         int r = EXIT_FAILURE, k;
1067         char *oldcg = NULL, *newcg = NULL;
1068         char **controller = NULL;
1069         int master = -1, n_fd_passed;
1070         const char *console = NULL;
1071         struct termios saved_attr, raw_attr;
1072         sigset_t mask;
1073         bool saved_attr_valid = false;
1074         struct winsize ws;
1075         int kmsg_socket_pair[2] = { -1, -1 };
1076         FDSet *fds = NULL;
1077
1078         log_parse_environment();
1079         log_open();
1080
1081         r = parse_argv(argc, argv);
1082         if (r <= 0)
1083                 goto finish;
1084
1085         if (arg_directory) {
1086                 char *p;
1087
1088                 p = path_make_absolute_cwd(arg_directory);
1089                 free(arg_directory);
1090                 arg_directory = p;
1091         } else
1092                 arg_directory = get_current_dir_name();
1093
1094         if (!arg_directory) {
1095                 log_error("Failed to determine path");
1096                 goto finish;
1097         }
1098
1099         path_kill_slashes(arg_directory);
1100
1101         if (geteuid() != 0) {
1102                 log_error("Need to be root.");
1103                 goto finish;
1104         }
1105
1106         if (sd_booted() <= 0) {
1107                 log_error("Not running on a systemd system.");
1108                 goto finish;
1109         }
1110
1111         if (path_equal(arg_directory, "/")) {
1112                 log_error("Spawning container on root directory not supported.");
1113                 goto finish;
1114         }
1115
1116         if (is_os_tree(arg_directory) <= 0) {
1117                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1118                 goto finish;
1119         }
1120
1121         log_close();
1122         n_fd_passed = sd_listen_fds(false);
1123         if (n_fd_passed > 0) {
1124                 k = fdset_new_listen_fds(&fds, false);
1125                 if (k < 0) {
1126                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1127                         goto finish;
1128                 }
1129         }
1130         fdset_close_others(fds);
1131         log_open();
1132
1133         k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1134         if (k < 0) {
1135                 log_error("Failed to determine current cgroup: %s", strerror(-k));
1136                 goto finish;
1137         }
1138
1139         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1140                 log_error("Failed to allocate cgroup path.");
1141                 goto finish;
1142         }
1143
1144         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1145         if (k < 0)  {
1146                 log_error("Failed to create cgroup: %s", strerror(-k));
1147                 goto finish;
1148         }
1149
1150         STRV_FOREACH(controller, arg_controllers) {
1151                 k = cg_create_and_attach(*controller, newcg, 0);
1152                 if (k < 0)
1153                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1154         }
1155
1156         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1157         if (master < 0) {
1158                 log_error("Failed to acquire pseudo tty: %m");
1159                 goto finish;
1160         }
1161
1162         console = ptsname(master);
1163         if (!console) {
1164                 log_error("Failed to determine tty name: %m");
1165                 goto finish;
1166         }
1167
1168         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1169
1170         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1171                 ioctl(master, TIOCSWINSZ, &ws);
1172
1173         if (unlockpt(master) < 0) {
1174                 log_error("Failed to unlock tty: %m");
1175                 goto finish;
1176         }
1177
1178         if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1179                 saved_attr_valid = true;
1180
1181                 raw_attr = saved_attr;
1182                 cfmakeraw(&raw_attr);
1183                 raw_attr.c_lflag &= ~ECHO;
1184         }
1185
1186         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1187                 log_error("Failed to create kmsg socket pair");
1188                 goto finish;
1189         }
1190
1191         assert_se(sigemptyset(&mask) == 0);
1192         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1193         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1194
1195         for (;;) {
1196                 siginfo_t status;
1197
1198                 if (saved_attr_valid) {
1199                         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1200                                 log_error("Failed to set terminal attributes: %m");
1201                                 goto finish;
1202                         }
1203                 }
1204
1205                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1206                 if (pid < 0) {
1207                         if (errno == EINVAL)
1208                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1209                         else
1210                                 log_error("clone() failed: %m");
1211
1212                         goto finish;
1213                 }
1214
1215                 if (pid == 0) {
1216                         /* child */
1217
1218                         const char *home = NULL;
1219                         uid_t uid = (uid_t) -1;
1220                         gid_t gid = (gid_t) -1;
1221                         unsigned n_env = 0;
1222                         const char *envp[] = {
1223                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1224                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1225                                 NULL, /* TERM */
1226                                 NULL, /* HOME */
1227                                 NULL, /* USER */
1228                                 NULL, /* LOGNAME */
1229                                 NULL, /* container_uuid */
1230                                 NULL, /* LISTEN_FDS */
1231                                 NULL, /* LISTEN_PID */
1232                                 NULL
1233                         };
1234
1235                         envp[2] = strv_find_prefix(environ, "TERM=");
1236                         n_env = 3;
1237
1238                         close_nointr_nofail(master);
1239                         master = -1;
1240
1241                         close_nointr(STDIN_FILENO);
1242                         close_nointr(STDOUT_FILENO);
1243                         close_nointr(STDERR_FILENO);
1244
1245                         close_nointr_nofail(kmsg_socket_pair[0]);
1246                         kmsg_socket_pair[0] = -1;
1247
1248                         reset_all_signal_handlers();
1249
1250                         assert_se(sigemptyset(&mask) == 0);
1251                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1252
1253                         k = open_terminal(console, O_RDWR);
1254                         if (k != STDIN_FILENO) {
1255                                 if (k >= 0) {
1256                                         close_nointr_nofail(k);
1257                                         k = -EINVAL;
1258                                 }
1259
1260                                 log_error("Failed to open console: %s", strerror(-k));
1261                                 goto child_fail;
1262                         }
1263
1264                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1265                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1266                                 log_error("Failed to duplicate console: %m");
1267                                 goto child_fail;
1268                         }
1269
1270                         if (setsid() < 0) {
1271                                 log_error("setsid() failed: %m");
1272                                 goto child_fail;
1273                         }
1274
1275                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1276                                 log_error("PR_SET_PDEATHSIG failed: %m");
1277                                 goto child_fail;
1278                         }
1279
1280                         /* Mark everything as slave, so that we still
1281                          * receive mounts from the real root, but don't
1282                          * propagate mounts to the real root. */
1283                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1284                                 log_error("MS_SLAVE|MS_REC failed: %m");
1285                                 goto child_fail;
1286                         }
1287
1288                         /* Turn directory into bind mount */
1289                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1290                                 log_error("Failed to make bind mount.");
1291                                 goto child_fail;
1292                         }
1293
1294                         if (arg_read_only)
1295                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1296                                         log_error("Failed to make read-only.");
1297                                         goto child_fail;
1298                                 }
1299
1300                         if (mount_all(arg_directory) < 0)
1301                                 goto child_fail;
1302
1303                         if (copy_devnodes(arg_directory) < 0)
1304                                 goto child_fail;
1305
1306                         dev_setup(arg_directory);
1307
1308                         if (setup_dev_console(arg_directory, console) < 0)
1309                                 goto child_fail;
1310
1311                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1312                                 goto child_fail;
1313
1314                         close_nointr_nofail(kmsg_socket_pair[1]);
1315                         kmsg_socket_pair[1] = -1;
1316
1317                         if (setup_boot_id(arg_directory) < 0)
1318                                 goto child_fail;
1319
1320                         if (setup_timezone(arg_directory) < 0)
1321                                 goto child_fail;
1322
1323                         if (setup_resolv_conf(arg_directory) < 0)
1324                                 goto child_fail;
1325
1326                         if (setup_journal(arg_directory) < 0)
1327                                 goto child_fail;
1328
1329                         if (chdir(arg_directory) < 0) {
1330                                 log_error("chdir(%s) failed: %m", arg_directory);
1331                                 goto child_fail;
1332                         }
1333
1334                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1335                                 log_error("mount(MS_MOVE) failed: %m");
1336                                 goto child_fail;
1337                         }
1338
1339                         if (chroot(".") < 0) {
1340                                 log_error("chroot() failed: %m");
1341                                 goto child_fail;
1342                         }
1343
1344                         if (chdir("/") < 0) {
1345                                 log_error("chdir() failed: %m");
1346                                 goto child_fail;
1347                         }
1348
1349                         umask(0022);
1350
1351                         loopback_setup();
1352
1353                         if (drop_capabilities() < 0) {
1354                                 log_error("drop_capabilities() failed: %m");
1355                                 goto child_fail;
1356                         }
1357
1358                         if (arg_user) {
1359
1360                                 /* Note that this resolves user names
1361                                  * inside the container, and hence
1362                                  * accesses the NSS modules from the
1363                                  * container and not the host. This is
1364                                  * a bit weird... */
1365
1366                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1367                                         log_error("get_user_creds() failed: %m");
1368                                         goto child_fail;
1369                                 }
1370
1371                                 if (mkdir_parents_label(home, 0775) < 0) {
1372                                         log_error("mkdir_parents_label() failed: %m");
1373                                         goto child_fail;
1374                                 }
1375
1376                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1377                                         log_error("mkdir_safe_label() failed: %m");
1378                                         goto child_fail;
1379                                 }
1380
1381                                 if (initgroups((const char*)arg_user, gid) < 0) {
1382                                         log_error("initgroups() failed: %m");
1383                                         goto child_fail;
1384                                 }
1385
1386                                 if (setresgid(gid, gid, gid) < 0) {
1387                                         log_error("setregid() failed: %m");
1388                                         goto child_fail;
1389                                 }
1390
1391                                 if (setresuid(uid, uid, uid) < 0) {
1392                                         log_error("setreuid() failed: %m");
1393                                         goto child_fail;
1394                                 }
1395                         } else {
1396                                 /* Reset everything fully to 0, just in case */
1397
1398                                 if (setgroups(0, NULL) < 0) {
1399                                         log_error("setgroups() failed: %m");
1400                                         goto child_fail;
1401                                 }
1402
1403                                 if (setresgid(0, 0, 0) < 0) {
1404                                         log_error("setregid() failed: %m");
1405                                         goto child_fail;
1406                                 }
1407
1408                                 if (setresuid(0, 0, 0) < 0) {
1409                                         log_error("setreuid() failed: %m");
1410                                         goto child_fail;
1411                                 }
1412                         }
1413
1414                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1415                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1416                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1417                                 log_oom();
1418                                 goto child_fail;
1419                         }
1420
1421                         if (arg_uuid) {
1422                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1423                                         log_oom();
1424                                         goto child_fail;
1425                                 }
1426                         }
1427
1428                         if (fdset_size(fds) > 0) {
1429                                 k = fdset_cloexec(fds, false);
1430                                 if (k < 0) {
1431                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1432                                         goto child_fail;
1433                                 }
1434
1435                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1436                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) getpid()) < 0)) {
1437                                         log_oom();
1438                                         goto child_fail;
1439                                 }
1440                         }
1441
1442                         setup_hostname();
1443
1444                         if (arg_boot) {
1445                                 char **a;
1446                                 size_t l;
1447
1448                                 /* Automatically search for the init system */
1449
1450                                 l = 1 + argc - optind;
1451                                 a = newa(char*, l + 1);
1452                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1453
1454                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1455                                 execve(a[0], a, (char**) envp);
1456
1457                                 a[0] = (char*) "/lib/systemd/systemd";
1458                                 execve(a[0], a, (char**) envp);
1459
1460                                 a[0] = (char*) "/sbin/init";
1461                                 execve(a[0], a, (char**) envp);
1462                         } else if (argc > optind)
1463                                 execvpe(argv[optind], argv + optind, (char**) envp);
1464                         else {
1465                                 chdir(home ? home : "/root");
1466                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1467                         }
1468
1469                         log_error("execv() failed: %m");
1470
1471                 child_fail:
1472                         _exit(EXIT_FAILURE);
1473                 }
1474
1475                 fdset_free(fds);
1476                 fds = NULL;
1477
1478                 if (process_pty(master, pid, &mask) < 0)
1479                         goto finish;
1480
1481                 if (saved_attr_valid)
1482                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1483
1484                 r = wait_for_terminate(pid, &status);
1485                 if (r < 0) {
1486                         r = EXIT_FAILURE;
1487                         break;
1488                 }
1489
1490                 if (status.si_code == CLD_EXITED) {
1491                         if (status.si_status != 0) {
1492                                 log_error("Container failed with error code %i.", status.si_status);
1493                                 r = status.si_status;
1494                                 break;
1495                         }
1496
1497                         log_debug("Container exited successfully.");
1498                         break;
1499                 } else if (status.si_code == CLD_KILLED &&
1500                            status.si_status == SIGINT) {
1501                         log_info("Container has been shut down.");
1502                         r = 0;
1503                         break;
1504                 } else if (status.si_code == CLD_KILLED &&
1505                            status.si_status == SIGHUP) {
1506                         log_info("Container is being rebooted.");
1507                         continue;
1508                 } else if (status.si_code == CLD_KILLED ||
1509                            status.si_code == CLD_DUMPED) {
1510
1511                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1512                         r = EXIT_FAILURE;
1513                         break;
1514                 } else {
1515                         log_error("Container failed due to unknown reason.");
1516                         r = EXIT_FAILURE;
1517                         break;
1518                 }
1519         }
1520
1521 finish:
1522         if (saved_attr_valid)
1523                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1524
1525         if (master >= 0)
1526                 close_nointr_nofail(master);
1527
1528         close_pipe(kmsg_socket_pair);
1529
1530         if (oldcg)
1531                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1532
1533         if (newcg)
1534                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1535
1536         free(arg_directory);
1537         strv_free(arg_controllers);
1538         free(oldcg);
1539         free(newcg);
1540
1541         fdset_free(fds);
1542
1543         return r;
1544 }