chiark / gitweb /
1f3bda5b4aded530b46484e6202cc0b0f3cd50e1
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "macro.h"
50 #include "audit.h"
51 #include "missing.h"
52 #include "cgroup-util.h"
53 #include "strv.h"
54 #include "path-util.h"
55 #include "loopback-setup.h"
56 #include "sd-id128.h"
57 #include "dev-setup.h"
58 #include "fdset.h"
59
/* How the journal directory of the container is tied to the host's.
 * Selected on the command line with --link-journal= or -j. */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* "no" */
        LINK_AUTO  = 1,  /* "auto"; initial value of arg_link_journal */
        LINK_HOST  = 2,  /* "host" */
        LINK_GUEST = 3   /* "guest"; also what -j selects */
} LinkJournal;
66
67 static char *arg_directory = NULL;
68 static char *arg_user = NULL;
69 static char **arg_controllers = NULL;
70 static char *arg_uuid = NULL;
71 static bool arg_private_network = false;
72 static bool arg_read_only = false;
73 static bool arg_boot = false;
74 static LinkJournal arg_link_journal = LINK_AUTO;
75 static uint64_t arg_retain =
76         (1ULL << CAP_CHOWN) |
77         (1ULL << CAP_DAC_OVERRIDE) |
78         (1ULL << CAP_DAC_READ_SEARCH) |
79         (1ULL << CAP_FOWNER) |
80         (1ULL << CAP_FSETID) |
81         (1ULL << CAP_IPC_OWNER) |
82         (1ULL << CAP_KILL) |
83         (1ULL << CAP_LEASE) |
84         (1ULL << CAP_LINUX_IMMUTABLE) |
85         (1ULL << CAP_NET_BIND_SERVICE) |
86         (1ULL << CAP_NET_BROADCAST) |
87         (1ULL << CAP_NET_RAW) |
88         (1ULL << CAP_SETGID) |
89         (1ULL << CAP_SETFCAP) |
90         (1ULL << CAP_SETPCAP) |
91         (1ULL << CAP_SETUID) |
92         (1ULL << CAP_SYS_ADMIN) |
93         (1ULL << CAP_SYS_CHROOT) |
94         (1ULL << CAP_SYS_NICE) |
95         (1ULL << CAP_SYS_PTRACE) |
96         (1ULL << CAP_SYS_TTY_CONFIG) |
97         (1ULL << CAP_SYS_RESOURCE) |
98         (1ULL << CAP_SYS_BOOT);
99
/* Prints the usage text to stdout, using program_invocation_short_name
 * as the program name. Always returns 0. */
static int help(void) {

        /* Note that -j is documented as "guest": that is the mode
         * parse_argv() selects for it (LINK_GUEST). */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help               Show this help\n"
               "  -D --directory=NAME     Root directory for the container\n"
               "  -b --boot               Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER          Run the command under specified user or uid\n"
               "  -C --controllers=LIST   Put the container in specified comma-separated cgroup hierarchies\n"
               "     --uuid=UUID          Set a specific machine UUID for the container\n"
               "     --private-network    Disable network in container\n"
               "     --read-only          Mount the root directory read-only\n"
               "     --capability=CAP     In addition to the default, retain specified capability\n"
               "     --link-journal=MODE  Link up guest journal, one of no, auto, guest, host\n"
               "  -j                      Equivalent to --link-journal=guest\n",
               program_invocation_short_name);

        return 0;
}
119
120 static int parse_argv(int argc, char *argv[]) {
121
122         enum {
123                 ARG_PRIVATE_NETWORK = 0x100,
124                 ARG_UUID,
125                 ARG_READ_ONLY,
126                 ARG_CAPABILITY,
127                 ARG_LINK_JOURNAL
128         };
129
130         static const struct option options[] = {
131                 { "help",            no_argument,       NULL, 'h'                 },
132                 { "directory",       required_argument, NULL, 'D'                 },
133                 { "user",            required_argument, NULL, 'u'                 },
134                 { "controllers",     required_argument, NULL, 'C'                 },
135                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
136                 { "boot",            no_argument,       NULL, 'b'                 },
137                 { "uuid",            required_argument, NULL, ARG_UUID            },
138                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
139                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
140                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
141                 { NULL,              0,                 NULL, 0                   }
142         };
143
144         int c;
145
146         assert(argc >= 0);
147         assert(argv);
148
149         while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
150
151                 switch (c) {
152
153                 case 'h':
154                         help();
155                         return 0;
156
157                 case 'D':
158                         free(arg_directory);
159                         arg_directory = canonicalize_file_name(optarg);
160                         if (!arg_directory) {
161                                 log_error("Failed to canonicalize root directory.");
162                                 return -ENOMEM;
163                         }
164
165                         break;
166
167                 case 'u':
168                         free(arg_user);
169                         if (!(arg_user = strdup(optarg))) {
170                                 log_error("Failed to duplicate user name.");
171                                 return -ENOMEM;
172                         }
173
174                         break;
175
176                 case 'C':
177                         strv_free(arg_controllers);
178                         arg_controllers = strv_split(optarg, ",");
179                         if (!arg_controllers) {
180                                 log_error("Failed to split controllers list.");
181                                 return -ENOMEM;
182                         }
183                         strv_uniq(arg_controllers);
184
185                         break;
186
187                 case ARG_PRIVATE_NETWORK:
188                         arg_private_network = true;
189                         break;
190
191                 case 'b':
192                         arg_boot = true;
193                         break;
194
195                 case ARG_UUID:
196                         arg_uuid = optarg;
197                         break;
198
199                 case ARG_READ_ONLY:
200                         arg_read_only = true;
201                         break;
202
203                 case ARG_CAPABILITY: {
204                         char *state, *word;
205                         size_t length;
206
207                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
208                                 cap_value_t cap;
209                                 char *t;
210
211                                 t = strndup(word, length);
212                                 if (!t)
213                                         return log_oom();
214
215                                 if (cap_from_name(t, &cap) < 0) {
216                                         log_error("Failed to parse capability %s.", t);
217                                         free(t);
218                                         return -EINVAL;
219                                 }
220
221                                 free(t);
222                                 arg_retain |= 1ULL << (uint64_t) cap;
223                         }
224
225                         break;
226                 }
227
228                 case 'j':
229                         arg_link_journal = LINK_GUEST;
230                         break;
231
232                 case ARG_LINK_JOURNAL:
233                         if (streq(optarg, "auto"))
234                                 arg_link_journal = LINK_AUTO;
235                         else if (streq(optarg, "no"))
236                                 arg_link_journal = LINK_NO;
237                         else if (streq(optarg, "guest"))
238                                 arg_link_journal = LINK_GUEST;
239                         else if (streq(optarg, "host"))
240                                 arg_link_journal = LINK_HOST;
241                         else {
242                                 log_error("Failed to parse link journal mode %s", optarg);
243                                 return -EINVAL;
244                         }
245
246                         break;
247
248                 case '?':
249                         return -EINVAL;
250
251                 default:
252                         log_error("Unknown option code %c", c);
253                         return -EINVAL;
254                 }
255         }
256
257         return 1;
258 }
259
260 static int mount_all(const char *dest) {
261
262         typedef struct MountPoint {
263                 const char *what;
264                 const char *where;
265                 const char *type;
266                 const char *options;
267                 unsigned long flags;
268                 bool fatal;
269         } MountPoint;
270
271         static const MountPoint mount_table[] = {
272                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
273                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
274                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
275                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
276                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
277                 { "/dev/pts",  "/dev/pts",  NULL,    NULL,       MS_BIND,                      true  },
278                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
279                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
280 #ifdef HAVE_SELINUX
281                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
282                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
283 #endif
284         };
285
286         unsigned k;
287         int r = 0;
288
289         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
290                 char _cleanup_free_ *where = NULL;
291                 int t;
292
293                 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
294                         log_oom();
295
296                         if (r == 0)
297                                 r = -ENOMEM;
298
299                         break;
300                 }
301
302                 t = path_is_mount_point(where, true);
303                 if (t < 0) {
304                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
305
306                         if (r == 0)
307                                 r = t;
308
309                         continue;
310                 }
311
312                 /* Skip this entry if it is not a remount. */
313                 if (mount_table[k].what && t > 0)
314                         continue;
315
316                 mkdir_p_label(where, 0755);
317
318                 if (mount(mount_table[k].what,
319                           where,
320                           mount_table[k].type,
321                           mount_table[k].flags,
322                           mount_table[k].options) < 0 &&
323                     mount_table[k].fatal) {
324
325                         log_error("mount(%s) failed: %m", where);
326
327                         if (r == 0)
328                                 r = -errno;
329                 }
330         }
331
332         return r;
333 }
334
335 static int setup_timezone(const char *dest) {
336         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
337         char *z, *y;
338         int r;
339
340         assert(dest);
341
342         /* Fix the timezone, if possible */
343         r = readlink_malloc("/etc/localtime", &p);
344         if (r < 0) {
345                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
346                 return 0;
347         }
348
349         z = path_startswith(p, "../usr/share/zoneinfo/");
350         if (!z)
351                 z = path_startswith(p, "/usr/share/zoneinfo/");
352         if (!z) {
353                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
354                 return 0;
355         }
356
357         where = strappend(dest, "/etc/localtime");
358         if (!where)
359                 return log_oom();
360
361         r = readlink_malloc(where, &q);
362         if (r >= 0) {
363                 y = path_startswith(q, "../usr/share/zoneinfo/");
364                 if (!y)
365                         y = path_startswith(q, "/usr/share/zoneinfo/");
366
367
368                 /* Already pointing to the right place? Then do nothing .. */
369                 if (y && streq(y, z))
370                         return 0;
371         }
372
373         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
374         if (!check)
375                 return log_oom();
376
377         if (access(check, F_OK) < 0) {
378                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
379                 return 0;
380         }
381
382         what = strappend("../usr/share/zoneinfo/", z);
383         if (!what)
384                 return log_oom();
385
386         unlink(where);
387         if (symlink(what, where) < 0) {
388                 log_error("Failed to correct timezone of container: %m");
389                 return 0;
390         }
391
392         return 0;
393 }
394
395 static int setup_resolv_conf(const char *dest) {
396         char *where;
397
398         assert(dest);
399
400         if (arg_private_network)
401                 return 0;
402
403         /* Fix resolv.conf, if possible */
404         where = strappend(dest, "/etc/resolv.conf");
405         if (!where)
406                 return log_oom();
407
408         /* We don't really care for the results of this really. If it
409          * fails, it fails, but meh... */
410         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
411                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
412
413         free(where);
414
415         return 0;
416 }
417
418 static int setup_boot_id(const char *dest) {
419         char _cleanup_free_ *from = NULL, *to = NULL;
420         sd_id128_t rnd;
421         char as_uuid[37];
422         int r;
423
424         assert(dest);
425
426         /* Generate a new randomized boot ID, so that each boot-up of
427          * the container gets a new one */
428
429         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
430         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
431         if (!from || !to)
432                 return log_oom();
433
434         r = sd_id128_randomize(&rnd);
435         if (r < 0) {
436                 log_error("Failed to generate random boot id: %s", strerror(-r));
437                 return r;
438         }
439
440         snprintf(as_uuid, sizeof(as_uuid),
441                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
442                  SD_ID128_FORMAT_VAL(rnd));
443         char_array_0(as_uuid);
444
445         r = write_one_line_file(from, as_uuid);
446         if (r < 0) {
447                 log_error("Failed to write boot id: %s", strerror(-r));
448                 return r;
449         }
450
451         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
452                 log_error("Failed to bind mount boot id: %m");
453                 r = -errno;
454         } else
455                 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
456
457         unlink(from);
458         return r;
459 }
460
461 static int copy_devnodes(const char *dest) {
462
463         static const char devnodes[] =
464                 "null\0"
465                 "zero\0"
466                 "full\0"
467                 "random\0"
468                 "urandom\0"
469                 "tty\0"
470                 "ptmx\0";
471
472         const char *d;
473         int r = 0;
474         mode_t _cleanup_umask_ u;
475
476         assert(dest);
477
478         u = umask(0000);
479
480         NULSTR_FOREACH(d, devnodes) {
481                 struct stat st;
482                 char _cleanup_free_ *from = NULL, *to = NULL;
483
484                 asprintf(&from, "/dev/%s", d);
485                 asprintf(&to, "%s/dev/%s", dest, d);
486
487                 if (!from || !to) {
488                         log_oom();
489
490                         if (r == 0)
491                                 r = -ENOMEM;
492
493                         break;
494                 }
495
496                 if (stat(from, &st) < 0) {
497
498                         if (errno != ENOENT) {
499                                 log_error("Failed to stat %s: %m", from);
500                                 if (r == 0)
501                                         r = -errno;
502                         }
503
504                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
505
506                         log_error("%s is not a char or block device, cannot copy", from);
507                         if (r == 0)
508                                 r = -EIO;
509
510                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
511
512                         log_error("mknod(%s) failed: %m", dest);
513                         if (r == 0)
514                                 r = -errno;
515                 }
516         }
517
518         return r;
519 }
520
521 static int setup_dev_console(const char *dest, const char *console) {
522         struct stat st;
523         char _cleanup_free_ *to = NULL;
524         int r;
525         mode_t _cleanup_umask_ u;
526
527         assert(dest);
528         assert(console);
529
530         u = umask(0000);
531
532         if (stat(console, &st) < 0) {
533                 log_error("Failed to stat %s: %m", console);
534                 return -errno;
535
536         } else if (!S_ISCHR(st.st_mode)) {
537                 log_error("/dev/console is not a char device");
538                 return -EIO;
539         }
540
541         r = chmod_and_chown(console, 0600, 0, 0);
542         if (r < 0) {
543                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
544                 return r;
545         }
546
547         if (asprintf(&to, "%s/dev/console", dest) < 0)
548                 return log_oom();
549
550         /* We need to bind mount the right tty to /dev/console since
551          * ptys can only exist on pts file systems. To have something
552          * to bind mount things on we create a device node first, that
553          * has the right major/minor (note that the major minor
554          * doesn't actually matter here, since we mount it over
555          * anyway). */
556
557         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
558                 log_error("mknod() for /dev/console failed: %m");
559                 return -errno;
560         }
561
562         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
563                 log_error("Bind mount for /dev/console failed: %m");
564                 return -errno;
565         }
566
567         return 0;
568 }
569
570 static int setup_kmsg(const char *dest, int kmsg_socket) {
571         char _cleanup_free_ *from = NULL, *to = NULL;
572         int r, fd, k;
573         mode_t _cleanup_umask_ u;
574         union {
575                 struct cmsghdr cmsghdr;
576                 uint8_t buf[CMSG_SPACE(sizeof(int))];
577         } control;
578         struct msghdr mh;
579         struct cmsghdr *cmsg;
580
581         assert(dest);
582         assert(kmsg_socket >= 0);
583
584         u = umask(0000);
585
586         /* We create the kmsg FIFO as /dev/kmsg, but immediately
587          * delete it after bind mounting it to /proc/kmsg. While FIFOs
588          * on the reading side behave very similar to /proc/kmsg,
589          * their writing side behaves differently from /dev/kmsg in
590          * that writing blocks when nothing is reading. In order to
591          * avoid any problems with containers deadlocking due to this
592          * we simply make /dev/kmsg unavailable to the container. */
593         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
594             asprintf(&to, "%s/proc/kmsg", dest) < 0)
595                 return log_oom();
596
597         if (mkfifo(from, 0600) < 0) {
598                 log_error("mkfifo() for /dev/kmsg failed: %m");
599                 return -errno;
600         }
601
602         r = chmod_and_chown(from, 0600, 0, 0);
603         if (r < 0) {
604                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
605                 return r;
606         }
607
608         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
609                 log_error("Bind mount for /proc/kmsg failed: %m");
610                 return -errno;
611         }
612
613         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
614         if (fd < 0) {
615                 log_error("Failed to open fifo: %m");
616                 return -errno;
617         }
618
619         zero(mh);
620         zero(control);
621
622         mh.msg_control = &control;
623         mh.msg_controllen = sizeof(control);
624
625         cmsg = CMSG_FIRSTHDR(&mh);
626         cmsg->cmsg_level = SOL_SOCKET;
627         cmsg->cmsg_type = SCM_RIGHTS;
628         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
629         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
630
631         mh.msg_controllen = cmsg->cmsg_len;
632
633         /* Store away the fd in the socket, so that it stays open as
634          * long as we run the child */
635         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
636         close_nointr_nofail(fd);
637
638         if (k < 0) {
639                 log_error("Failed to send FIFO fd: %m");
640                 return -errno;
641         }
642
643         /* And now make the FIFO unavailable as /dev/kmsg... */
644         unlink(from);
645         return 0;
646 }
647
648 static int setup_hostname(void) {
649         char *hn;
650         int r = 0;
651
652         hn = path_get_file_name(arg_directory);
653         if (hn) {
654                 hn = strdup(hn);
655                 if (!hn)
656                         return -ENOMEM;
657
658                 hostname_cleanup(hn);
659
660                 if (!isempty(hn))
661                         if (sethostname(hn, strlen(hn)) < 0)
662                                 r = -errno;
663
664                 free(hn);
665         }
666
667         return r;
668 }
669
670 static int setup_journal(const char *directory) {
671         sd_id128_t machine_id;
672         char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
673         char *id;
674         int r;
675
676         if (arg_link_journal == LINK_NO)
677                 return 0;
678
679         p = strappend(directory, "/etc/machine-id");
680         if (!p)
681                 return log_oom();
682
683         r = read_one_line_file(p, &b);
684         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
685                 return 0;
686         else if (r < 0) {
687                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
688                 return r;
689         }
690
691         id = strstrip(b);
692         if (isempty(id) && arg_link_journal == LINK_AUTO)
693                 return 0;
694
695         /* Verify validity */
696         r = sd_id128_from_string(id, &machine_id);
697         if (r < 0) {
698                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
699                 return r;
700         }
701
702         free(p);
703         p = strappend("/var/log/journal/", id);
704         q = strjoin(directory, "/var/log/journal/", id, NULL);
705         if (!p || !q)
706                 return log_oom();
707
708         if (path_is_mount_point(p, false) > 0) {
709                 if (arg_link_journal != LINK_AUTO) {
710                         log_error("%s: already a mount point, refusing to use for journal", p);
711                         return -EEXIST;
712                 }
713
714                 return 0;
715         }
716
717         if (path_is_mount_point(q, false) > 0) {
718                 if (arg_link_journal != LINK_AUTO) {
719                         log_error("%s: already a mount point, refusing to use for journal", q);
720                         return -EEXIST;
721                 }
722
723                 return 0;
724         }
725
726         r = readlink_and_make_absolute(p, &d);
727         if (r >= 0) {
728                 if ((arg_link_journal == LINK_GUEST ||
729                      arg_link_journal == LINK_AUTO) &&
730                     path_equal(d, q)) {
731
732                         r = mkdir_p(q, 0755);
733                         if (r < 0)
734                                 log_warning("failed to create directory %s: %m", q);
735                         return 0;
736                 }
737
738                 if (unlink(p) < 0) {
739                         log_error("Failed to remove symlink %s: %m", p);
740                         return -errno;
741                 }
742         } else if (r == -EINVAL) {
743
744                 if (arg_link_journal == LINK_GUEST &&
745                     rmdir(p) < 0) {
746
747                         if (errno == ENOTDIR) {
748                                 log_error("%s already exists and is neither a symlink nor a directory", p);
749                                 return r;
750                         } else {
751                                 log_error("Failed to remove %s: %m", p);
752                                 return -errno;
753                         }
754                 }
755         } else if (r != -ENOENT) {
756                 log_error("readlink(%s) failed: %m", p);
757                 return r;
758         }
759
760         if (arg_link_journal == LINK_GUEST) {
761
762                 if (symlink(q, p) < 0) {
763                         log_error("Failed to symlink %s to %s: %m", q, p);
764                         return -errno;
765                 }
766
767                 r = mkdir_p(q, 0755);
768                 if (r < 0)
769                         log_warning("failed to create directory %s: %m", q);
770                 return 0;
771         }
772
773         if (arg_link_journal == LINK_HOST) {
774                 r = mkdir_p(p, 0755);
775                 if (r < 0) {
776                         log_error("Failed to create %s: %m", p);
777                         return r;
778                 }
779
780         } else if (access(p, F_OK) < 0)
781                 return 0;
782
783         if (dir_is_empty(q) == 0) {
784                 log_error("%s not empty.", q);
785                 return -ENOTEMPTY;
786         }
787
788         r = mkdir_p(q, 0755);
789         if (r < 0) {
790                 log_error("Failed to create %s: %m", q);
791                 return r;
792         }
793
794         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
795                 log_error("Failed to bind mount journal from host into guest: %m");
796                 return -errno;
797         }
798
799         return 0;
800 }
801
802 static int drop_capabilities(void) {
803         return capability_bounding_set_drop(~arg_retain, false);
804 }
805
/* Checks whether the directory "path" looks like the root of an OS
 * installation. The presence of <path>/bin/sh is used as indicator.
 *
 * Returns 1 if it is there, 0 if it is not accessible, and -ENOMEM if
 * the path to check could not be allocated. */
static int is_os_tree(const char *path) {
        char *shell = NULL;
        int found;

        /* We use /bin/sh as flag file if something is an OS */
        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found;
}
819
820 static int process_pty(int master, pid_t pid, sigset_t *mask) {
821
822         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
823         size_t in_buffer_full = 0, out_buffer_full = 0;
824         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
825         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
826         int ep = -1, signal_fd = -1, r;
827         bool tried_orderly_shutdown = false;
828
829         assert(master >= 0);
830         assert(pid > 0);
831         assert(mask);
832
833         fd_nonblock(STDIN_FILENO, 1);
834         fd_nonblock(STDOUT_FILENO, 1);
835         fd_nonblock(master, 1);
836
837         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
838         if (signal_fd < 0) {
839                 log_error("signalfd(): %m");
840                 r = -errno;
841                 goto finish;
842         }
843
844         ep = epoll_create1(EPOLL_CLOEXEC);
845         if (ep < 0) {
846                 log_error("Failed to create epoll: %m");
847                 r = -errno;
848                 goto finish;
849         }
850
851         /* We read from STDIN only if this is actually a TTY,
852          * otherwise we assume non-interactivity. */
853         if (isatty(STDIN_FILENO)) {
854                 zero(stdin_ev);
855                 stdin_ev.events = EPOLLIN|EPOLLET;
856                 stdin_ev.data.fd = STDIN_FILENO;
857
858                 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
859                         log_error("Failed to register STDIN in epoll: %m");
860                         r = -errno;
861                         goto finish;
862                 }
863         }
864
865         zero(stdout_ev);
866         stdout_ev.events = EPOLLOUT|EPOLLET;
867         stdout_ev.data.fd = STDOUT_FILENO;
868
869         zero(master_ev);
870         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
871         master_ev.data.fd = master;
872
873         zero(signal_ev);
874         signal_ev.events = EPOLLIN;
875         signal_ev.data.fd = signal_fd;
876
877         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
878             epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
879             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
880                 log_error("Failed to register fds in epoll: %m");
881                 r = -errno;
882                 goto finish;
883         }
884
885         for (;;) {
886                 struct epoll_event ev[16];
887                 ssize_t k;
888                 int i, nfds;
889
890                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
891                 if (nfds < 0) {
892
893                         if (errno == EINTR || errno == EAGAIN)
894                                 continue;
895
896                         log_error("epoll_wait(): %m");
897                         r = -errno;
898                         goto finish;
899                 }
900
901                 assert(nfds >= 1);
902
903                 for (i = 0; i < nfds; i++) {
904                         if (ev[i].data.fd == STDIN_FILENO) {
905
906                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
907                                         stdin_readable = true;
908
909                         } else if (ev[i].data.fd == STDOUT_FILENO) {
910
911                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
912                                         stdout_writable = true;
913
914                         } else if (ev[i].data.fd == master) {
915
916                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
917                                         master_readable = true;
918
919                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
920                                         master_writable = true;
921
922                         } else if (ev[i].data.fd == signal_fd) {
923                                 struct signalfd_siginfo sfsi;
924                                 ssize_t n;
925
926                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
927                                 if (n != sizeof(sfsi)) {
928
929                                         if (n >= 0) {
930                                                 log_error("Failed to read from signalfd: invalid block size");
931                                                 r = -EIO;
932                                                 goto finish;
933                                         }
934
935                                         if (errno != EINTR && errno != EAGAIN) {
936                                                 log_error("Failed to read from signalfd: %m");
937                                                 r = -errno;
938                                                 goto finish;
939                                         }
940                                 } else {
941
942                                         if (sfsi.ssi_signo == SIGWINCH) {
943                                                 struct winsize ws;
944
945                                                 /* The window size changed, let's forward that. */
946                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
947                                                         ioctl(master, TIOCSWINSZ, &ws);
948                                         } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
949
950                                                 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
951
952                                                 /* This only works for systemd... */
953                                                 tried_orderly_shutdown = true;
954                                                 kill(pid, SIGRTMIN+3);
955
956                                         } else {
957                                                 r = 0;
958                                                 goto finish;
959                                         }
960                                 }
961                         }
962                 }
963
964                 while ((stdin_readable && in_buffer_full <= 0) ||
965                        (master_writable && in_buffer_full > 0) ||
966                        (master_readable && out_buffer_full <= 0) ||
967                        (stdout_writable && out_buffer_full > 0)) {
968
969                         if (stdin_readable && in_buffer_full < LINE_MAX) {
970
971                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
972                                 if (k < 0) {
973
974                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
975                                                 stdin_readable = false;
976                                         else {
977                                                 log_error("read(): %m");
978                                                 r = -errno;
979                                                 goto finish;
980                                         }
981                                 } else
982                                         in_buffer_full += (size_t) k;
983                         }
984
985                         if (master_writable && in_buffer_full > 0) {
986
987                                 k = write(master, in_buffer, in_buffer_full);
988                                 if (k < 0) {
989
990                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
991                                                 master_writable = false;
992                                         else {
993                                                 log_error("write(): %m");
994                                                 r = -errno;
995                                                 goto finish;
996                                         }
997
998                                 } else {
999                                         assert(in_buffer_full >= (size_t) k);
1000                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1001                                         in_buffer_full -= k;
1002                                 }
1003                         }
1004
1005                         if (master_readable && out_buffer_full < LINE_MAX) {
1006
1007                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1008                                 if (k < 0) {
1009
1010                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1011                                                 master_readable = false;
1012                                         else {
1013                                                 log_error("read(): %m");
1014                                                 r = -errno;
1015                                                 goto finish;
1016                                         }
1017                                 }  else
1018                                         out_buffer_full += (size_t) k;
1019                         }
1020
1021                         if (stdout_writable && out_buffer_full > 0) {
1022
1023                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1024                                 if (k < 0) {
1025
1026                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1027                                                 stdout_writable = false;
1028                                         else {
1029                                                 log_error("write(): %m");
1030                                                 r = -errno;
1031                                                 goto finish;
1032                                         }
1033
1034                                 } else {
1035                                         assert(out_buffer_full >= (size_t) k);
1036                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1037                                         out_buffer_full -= k;
1038                                 }
1039                         }
1040                 }
1041         }
1042
1043 finish:
1044         if (ep >= 0)
1045                 close_nointr_nofail(ep);
1046
1047         if (signal_fd >= 0)
1048                 close_nointr_nofail(signal_fd);
1049
1050         return r;
1051 }
1052
1053 int main(int argc, char *argv[]) {
1054         pid_t pid = 0;
1055         int r = EXIT_FAILURE, k;
1056         char *oldcg = NULL, *newcg = NULL;
1057         char **controller = NULL;
1058         int master = -1, n_fd_passed;
1059         const char *console = NULL;
1060         struct termios saved_attr, raw_attr;
1061         sigset_t mask;
1062         bool saved_attr_valid = false;
1063         struct winsize ws;
1064         int kmsg_socket_pair[2] = { -1, -1 };
1065         FDSet *fds = NULL;
1066
1067         log_parse_environment();
1068         log_open();
1069
1070         r = parse_argv(argc, argv);
1071         if (r <= 0)
1072                 goto finish;
1073
1074         if (arg_directory) {
1075                 char *p;
1076
1077                 p = path_make_absolute_cwd(arg_directory);
1078                 free(arg_directory);
1079                 arg_directory = p;
1080         } else
1081                 arg_directory = get_current_dir_name();
1082
1083         if (!arg_directory) {
1084                 log_error("Failed to determine path");
1085                 goto finish;
1086         }
1087
1088         path_kill_slashes(arg_directory);
1089
1090         if (geteuid() != 0) {
1091                 log_error("Need to be root.");
1092                 goto finish;
1093         }
1094
1095         if (sd_booted() <= 0) {
1096                 log_error("Not running on a systemd system.");
1097                 goto finish;
1098         }
1099
1100         if (path_equal(arg_directory, "/")) {
1101                 log_error("Spawning container on root directory not supported.");
1102                 goto finish;
1103         }
1104
1105         if (is_os_tree(arg_directory) <= 0) {
1106                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1107                 goto finish;
1108         }
1109
1110         log_close();
1111         n_fd_passed = sd_listen_fds(false);
1112         if (n_fd_passed > 0) {
1113                 k = fdset_new_listen_fds(&fds, false);
1114                 if (k < 0) {
1115                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1116                         goto finish;
1117                 }
1118         }
1119         fdset_close_others(fds);
1120         log_open();
1121
1122         k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1123         if (k < 0) {
1124                 log_error("Failed to determine current cgroup: %s", strerror(-k));
1125                 goto finish;
1126         }
1127
1128         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1129                 log_error("Failed to allocate cgroup path.");
1130                 goto finish;
1131         }
1132
1133         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1134         if (k < 0)  {
1135                 log_error("Failed to create cgroup: %s", strerror(-k));
1136                 goto finish;
1137         }
1138
1139         STRV_FOREACH(controller, arg_controllers) {
1140                 k = cg_create_and_attach(*controller, newcg, 0);
1141                 if (k < 0)
1142                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1143         }
1144
1145         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1146         if (master < 0) {
1147                 log_error("Failed to acquire pseudo tty: %m");
1148                 goto finish;
1149         }
1150
1151         console = ptsname(master);
1152         if (!console) {
1153                 log_error("Failed to determine tty name: %m");
1154                 goto finish;
1155         }
1156
1157         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1158
1159         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1160                 ioctl(master, TIOCSWINSZ, &ws);
1161
1162         if (unlockpt(master) < 0) {
1163                 log_error("Failed to unlock tty: %m");
1164                 goto finish;
1165         }
1166
1167         if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1168                 saved_attr_valid = true;
1169
1170                 raw_attr = saved_attr;
1171                 cfmakeraw(&raw_attr);
1172                 raw_attr.c_lflag &= ~ECHO;
1173         }
1174
1175         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1176                 log_error("Failed to create kmsg socket pair");
1177                 goto finish;
1178         }
1179
1180         assert_se(sigemptyset(&mask) == 0);
1181         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1182         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1183
1184         for (;;) {
1185                 siginfo_t status;
1186
1187                 if (saved_attr_valid) {
1188                         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1189                                 log_error("Failed to set terminal attributes: %m");
1190                                 goto finish;
1191                         }
1192                 }
1193
1194                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1195                 if (pid < 0) {
1196                         if (errno == EINVAL)
1197                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1198                         else
1199                                 log_error("clone() failed: %m");
1200
1201                         goto finish;
1202                 }
1203
1204                 if (pid == 0) {
1205                         /* child */
1206
1207                         const char *home = NULL;
1208                         uid_t uid = (uid_t) -1;
1209                         gid_t gid = (gid_t) -1;
1210                         unsigned n_env = 0;
1211                         const char *envp[] = {
1212                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1213                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1214                                 NULL, /* TERM */
1215                                 NULL, /* HOME */
1216                                 NULL, /* USER */
1217                                 NULL, /* LOGNAME */
1218                                 NULL, /* container_uuid */
1219                                 NULL, /* LISTEN_FDS */
1220                                 NULL, /* LISTEN_PID */
1221                                 NULL
1222                         };
1223
1224                         envp[2] = strv_find_prefix(environ, "TERM=");
1225                         n_env = 3;
1226
1227                         close_nointr_nofail(master);
1228                         master = -1;
1229
1230                         close_nointr(STDIN_FILENO);
1231                         close_nointr(STDOUT_FILENO);
1232                         close_nointr(STDERR_FILENO);
1233
1234                         close_nointr_nofail(kmsg_socket_pair[0]);
1235                         kmsg_socket_pair[0] = -1;
1236
1237                         reset_all_signal_handlers();
1238
1239                         assert_se(sigemptyset(&mask) == 0);
1240                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1241
1242                         k = open_terminal(console, O_RDWR);
1243                         if (k != STDIN_FILENO) {
1244                                 if (k >= 0) {
1245                                         close_nointr_nofail(k);
1246                                         k = -EINVAL;
1247                                 }
1248
1249                                 log_error("Failed to open console: %s", strerror(-k));
1250                                 goto child_fail;
1251                         }
1252
1253                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1254                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1255                                 log_error("Failed to duplicate console: %m");
1256                                 goto child_fail;
1257                         }
1258
1259                         if (setsid() < 0) {
1260                                 log_error("setsid() failed: %m");
1261                                 goto child_fail;
1262                         }
1263
1264                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1265                                 log_error("PR_SET_PDEATHSIG failed: %m");
1266                                 goto child_fail;
1267                         }
1268
1269                         /* Mark everything as slave, so that we still
1270                          * receive mounts from the real root, but don't
1271                          * propagate mounts to the real root. */
1272                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1273                                 log_error("MS_SLAVE|MS_REC failed: %m");
1274                                 goto child_fail;
1275                         }
1276
1277                         /* Turn directory into bind mount */
1278                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1279                                 log_error("Failed to make bind mount.");
1280                                 goto child_fail;
1281                         }
1282
1283                         if (arg_read_only)
1284                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1285                                         log_error("Failed to make read-only.");
1286                                         goto child_fail;
1287                                 }
1288
1289                         if (mount_all(arg_directory) < 0)
1290                                 goto child_fail;
1291
1292                         if (copy_devnodes(arg_directory) < 0)
1293                                 goto child_fail;
1294
1295                         dev_setup(arg_directory);
1296
1297                         if (setup_dev_console(arg_directory, console) < 0)
1298                                 goto child_fail;
1299
1300                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1301                                 goto child_fail;
1302
1303                         close_nointr_nofail(kmsg_socket_pair[1]);
1304                         kmsg_socket_pair[1] = -1;
1305
1306                         if (setup_boot_id(arg_directory) < 0)
1307                                 goto child_fail;
1308
1309                         if (setup_timezone(arg_directory) < 0)
1310                                 goto child_fail;
1311
1312                         if (setup_resolv_conf(arg_directory) < 0)
1313                                 goto child_fail;
1314
1315                         if (setup_journal(arg_directory) < 0)
1316                                 goto child_fail;
1317
1318                         if (chdir(arg_directory) < 0) {
1319                                 log_error("chdir(%s) failed: %m", arg_directory);
1320                                 goto child_fail;
1321                         }
1322
1323                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1324                                 log_error("mount(MS_MOVE) failed: %m");
1325                                 goto child_fail;
1326                         }
1327
1328                         if (chroot(".") < 0) {
1329                                 log_error("chroot() failed: %m");
1330                                 goto child_fail;
1331                         }
1332
1333                         if (chdir("/") < 0) {
1334                                 log_error("chdir() failed: %m");
1335                                 goto child_fail;
1336                         }
1337
1338                         umask(0022);
1339
1340                         loopback_setup();
1341
1342                         if (drop_capabilities() < 0) {
1343                                 log_error("drop_capabilities() failed: %m");
1344                                 goto child_fail;
1345                         }
1346
1347                         if (arg_user) {
1348
1349                                 /* Note that this resolves user names
1350                                  * inside the container, and hence
1351                                  * accesses the NSS modules from the
1352                                  * container and not the host. This is
1353                                  * a bit weird... */
1354
1355                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1356                                         log_error("get_user_creds() failed: %m");
1357                                         goto child_fail;
1358                                 }
1359
1360                                 if (mkdir_parents_label(home, 0775) < 0) {
1361                                         log_error("mkdir_parents_label() failed: %m");
1362                                         goto child_fail;
1363                                 }
1364
1365                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1366                                         log_error("mkdir_safe_label() failed: %m");
1367                                         goto child_fail;
1368                                 }
1369
1370                                 if (initgroups((const char*)arg_user, gid) < 0) {
1371                                         log_error("initgroups() failed: %m");
1372                                         goto child_fail;
1373                                 }
1374
1375                                 if (setresgid(gid, gid, gid) < 0) {
1376                                         log_error("setregid() failed: %m");
1377                                         goto child_fail;
1378                                 }
1379
1380                                 if (setresuid(uid, uid, uid) < 0) {
1381                                         log_error("setreuid() failed: %m");
1382                                         goto child_fail;
1383                                 }
1384                         } else {
1385                                 /* Reset everything fully to 0, just in case */
1386
1387                                 if (setgroups(0, NULL) < 0) {
1388                                         log_error("setgroups() failed: %m");
1389                                         goto child_fail;
1390                                 }
1391
1392                                 if (setresgid(0, 0, 0) < 0) {
1393                                         log_error("setregid() failed: %m");
1394                                         goto child_fail;
1395                                 }
1396
1397                                 if (setresuid(0, 0, 0) < 0) {
1398                                         log_error("setreuid() failed: %m");
1399                                         goto child_fail;
1400                                 }
1401                         }
1402
1403                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1404                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1405                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1406                                 log_oom();
1407                                 goto child_fail;
1408                         }
1409
1410                         if (arg_uuid) {
1411                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1412                                         log_oom();
1413                                         goto child_fail;
1414                                 }
1415                         }
1416
1417                         if (fdset_size(fds) > 0) {
1418                                 k = fdset_cloexec(fds, false);
1419                                 if (k < 0) {
1420                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1421                                         goto child_fail;
1422                                 }
1423
1424                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1425                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) getpid()) < 0)) {
1426                                         log_oom();
1427                                         goto child_fail;
1428                                 }
1429                         }
1430
1431                         setup_hostname();
1432
1433                         if (arg_boot) {
1434                                 char **a;
1435                                 size_t l;
1436
1437                                 /* Automatically search for the init system */
1438
1439                                 l = 1 + argc - optind;
1440                                 a = newa(char*, l + 1);
1441                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1442
1443                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1444                                 execve(a[0], a, (char**) envp);
1445
1446                                 a[0] = (char*) "/lib/systemd/systemd";
1447                                 execve(a[0], a, (char**) envp);
1448
1449                                 a[0] = (char*) "/sbin/init";
1450                                 execve(a[0], a, (char**) envp);
1451                         } else if (argc > optind)
1452                                 execvpe(argv[optind], argv + optind, (char**) envp);
1453                         else {
1454                                 chdir(home ? home : "/root");
1455                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1456                         }
1457
1458                         log_error("execv() failed: %m");
1459
1460                 child_fail:
1461                         _exit(EXIT_FAILURE);
1462                 }
1463
1464                 fdset_free(fds);
1465                 fds = NULL;
1466
1467                 if (process_pty(master, pid, &mask) < 0)
1468                         goto finish;
1469
1470                 if (saved_attr_valid)
1471                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1472
1473                 r = wait_for_terminate(pid, &status);
1474                 if (r < 0) {
1475                         r = EXIT_FAILURE;
1476                         break;
1477                 }
1478
1479                 if (status.si_code == CLD_EXITED) {
1480                         if (status.si_status != 0) {
1481                                 log_error("Container failed with error code %i.", status.si_status);
1482                                 r = status.si_status;
1483                                 break;
1484                         }
1485
1486                         log_debug("Container exited successfully.");
1487                         break;
1488                 } else if (status.si_code == CLD_KILLED &&
1489                            status.si_status == SIGINT) {
1490                         log_info("Container has been shut down.");
1491                         r = 0;
1492                         break;
1493                 } else if (status.si_code == CLD_KILLED &&
1494                            status.si_status == SIGHUP) {
1495                         log_info("Container is being rebooted.");
1496                         continue;
1497                 } else if (status.si_code == CLD_KILLED ||
1498                            status.si_code == CLD_DUMPED) {
1499
1500                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1501                         r = EXIT_FAILURE;
1502                         break;
1503                 } else {
1504                         log_error("Container failed due to unknown reason.");
1505                         r = EXIT_FAILURE;
1506                         break;
1507                 }
1508         }
1509
1510 finish:
1511         if (saved_attr_valid)
1512                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1513
1514         if (master >= 0)
1515                 close_nointr_nofail(master);
1516
1517         close_pipe(kmsg_socket_pair);
1518
1519         if (oldcg)
1520                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1521
1522         if (newcg)
1523                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1524
1525         free(arg_directory);
1526         strv_free(arg_controllers);
1527         free(oldcg);
1528         free(newcg);
1529
1530         fdset_free(fds);
1531
1532         return r;
1533 }