chiark / gitweb /
c346f58412e6add45d8d4a42752d2608a068e96a
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/poll.h>
37 #include <sys/epoll.h>
38 #include <termios.h>
39 #include <sys/signalfd.h>
40 #include <grp.h>
41 #include <linux/fs.h>
42 #include <sys/un.h>
43 #include <sys/socket.h>
44
45 #include <systemd/sd-daemon.h>
46
47 #include "log.h"
48 #include "util.h"
49 #include "mkdir.h"
50 #include "macro.h"
51 #include "audit.h"
52 #include "missing.h"
53 #include "cgroup-util.h"
54 #include "strv.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
57 #include "sd-id128.h"
58 #include "dev-setup.h"
59 #include "fdset.h"
60 #include "build.h"
61 #include "fileio.h"
62
/* Journal linking mode, selected with --link-journal= (or -j). */
typedef enum LinkJournal {
        LINK_NO,        /* setup_journal() does nothing */
        LINK_AUTO,      /* default; most failures are silently tolerated */
        LINK_HOST,      /* host journal directory is bind mounted into the guest */
        LINK_GUEST      /* host journal path becomes a symlink to the guest directory */
} LinkJournal;
69
70 static char *arg_directory = NULL;
71 static char *arg_user = NULL;
72 static char **arg_controllers = NULL;
73 static char *arg_uuid = NULL;
74 static bool arg_private_network = false;
75 static bool arg_read_only = false;
76 static bool arg_boot = false;
77 static LinkJournal arg_link_journal = LINK_AUTO;
78 static uint64_t arg_retain =
79         (1ULL << CAP_CHOWN) |
80         (1ULL << CAP_DAC_OVERRIDE) |
81         (1ULL << CAP_DAC_READ_SEARCH) |
82         (1ULL << CAP_FOWNER) |
83         (1ULL << CAP_FSETID) |
84         (1ULL << CAP_IPC_OWNER) |
85         (1ULL << CAP_KILL) |
86         (1ULL << CAP_LEASE) |
87         (1ULL << CAP_LINUX_IMMUTABLE) |
88         (1ULL << CAP_NET_BIND_SERVICE) |
89         (1ULL << CAP_NET_BROADCAST) |
90         (1ULL << CAP_NET_RAW) |
91         (1ULL << CAP_SETGID) |
92         (1ULL << CAP_SETFCAP) |
93         (1ULL << CAP_SETPCAP) |
94         (1ULL << CAP_SETUID) |
95         (1ULL << CAP_SYS_ADMIN) |
96         (1ULL << CAP_SYS_CHROOT) |
97         (1ULL << CAP_SYS_NICE) |
98         (1ULL << CAP_SYS_PTRACE) |
99         (1ULL << CAP_SYS_TTY_CONFIG) |
100         (1ULL << CAP_SYS_RESOURCE) |
101         (1ULL << CAP_SYS_BOOT) |
102         (1ULL << CAP_AUDIT_WRITE) |
103         (1ULL << CAP_AUDIT_CONTROL);
104
105 static int help(void) {
106
107         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
108                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
109                "  -h --help               Show this help\n"
110                "  --version               Print version string\n"
111                "  -D --directory=NAME     Root directory for the container\n"
112                "  -b --boot               Boot up full system (i.e. invoke init)\n"
113                "  -u --user=USER          Run the command under specified user or uid\n"
114                "  -C --controllers=LIST   Put the container in specified comma-separated cgroup hierarchies\n"
115                "     --uuid=UUID          Set a specific machine UUID for the container\n"
116                "     --private-network    Disable network in container\n"
117                "     --read-only          Mount the root directory read-only\n"
118                "     --capability=CAP     In addition to the default, retain specified capability\n"
119                "     --link-journal=MODE  Link up guest journal, one of no, auto, guest, host\n"
120                "  -j                      Equivalent to --link-journal=host\n",
121                program_invocation_short_name);
122
123         return 0;
124 }
125
126 static int parse_argv(int argc, char *argv[]) {
127
128         enum {
129                 ARG_VERSION = 0x100,
130                 ARG_PRIVATE_NETWORK,
131                 ARG_UUID,
132                 ARG_READ_ONLY,
133                 ARG_CAPABILITY,
134                 ARG_LINK_JOURNAL
135         };
136
137         static const struct option options[] = {
138                 { "help",            no_argument,       NULL, 'h'                 },
139                 { "version",         no_argument,       NULL, ARG_VERSION         },
140                 { "directory",       required_argument, NULL, 'D'                 },
141                 { "user",            required_argument, NULL, 'u'                 },
142                 { "controllers",     required_argument, NULL, 'C'                 },
143                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
144                 { "boot",            no_argument,       NULL, 'b'                 },
145                 { "uuid",            required_argument, NULL, ARG_UUID            },
146                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
147                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
148                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
149                 { NULL,              0,                 NULL, 0                   }
150         };
151
152         int c;
153
154         assert(argc >= 0);
155         assert(argv);
156
157         while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
158
159                 switch (c) {
160
161                 case 'h':
162                         help();
163                         return 0;
164
165                 case ARG_VERSION:
166                         puts(PACKAGE_STRING);
167                         puts(SYSTEMD_FEATURES);
168                         return 0;
169
170                 case 'D':
171                         free(arg_directory);
172                         arg_directory = canonicalize_file_name(optarg);
173                         if (!arg_directory) {
174                                 log_error("Failed to canonicalize root directory.");
175                                 return -ENOMEM;
176                         }
177
178                         break;
179
180                 case 'u':
181                         free(arg_user);
182                         if (!(arg_user = strdup(optarg))) {
183                                 log_error("Failed to duplicate user name.");
184                                 return -ENOMEM;
185                         }
186
187                         break;
188
189                 case 'C':
190                         strv_free(arg_controllers);
191                         arg_controllers = strv_split(optarg, ",");
192                         if (!arg_controllers) {
193                                 log_error("Failed to split controllers list.");
194                                 return -ENOMEM;
195                         }
196                         strv_uniq(arg_controllers);
197
198                         break;
199
200                 case ARG_PRIVATE_NETWORK:
201                         arg_private_network = true;
202                         break;
203
204                 case 'b':
205                         arg_boot = true;
206                         break;
207
208                 case ARG_UUID:
209                         arg_uuid = optarg;
210                         break;
211
212                 case ARG_READ_ONLY:
213                         arg_read_only = true;
214                         break;
215
216                 case ARG_CAPABILITY: {
217                         char *state, *word;
218                         size_t length;
219
220                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
221                                 cap_value_t cap;
222                                 char *t;
223
224                                 t = strndup(word, length);
225                                 if (!t)
226                                         return log_oom();
227
228                                 if (cap_from_name(t, &cap) < 0) {
229                                         log_error("Failed to parse capability %s.", t);
230                                         free(t);
231                                         return -EINVAL;
232                                 }
233
234                                 free(t);
235                                 arg_retain |= 1ULL << (uint64_t) cap;
236                         }
237
238                         break;
239                 }
240
241                 case 'j':
242                         arg_link_journal = LINK_GUEST;
243                         break;
244
245                 case ARG_LINK_JOURNAL:
246                         if (streq(optarg, "auto"))
247                                 arg_link_journal = LINK_AUTO;
248                         else if (streq(optarg, "no"))
249                                 arg_link_journal = LINK_NO;
250                         else if (streq(optarg, "guest"))
251                                 arg_link_journal = LINK_GUEST;
252                         else if (streq(optarg, "host"))
253                                 arg_link_journal = LINK_HOST;
254                         else {
255                                 log_error("Failed to parse link journal mode %s", optarg);
256                                 return -EINVAL;
257                         }
258
259                         break;
260
261                 case '?':
262                         return -EINVAL;
263
264                 default:
265                         log_error("Unknown option code %c", c);
266                         return -EINVAL;
267                 }
268         }
269
270         if (optind < argc && arg_boot) {
271                 log_error("Cannot specify a command together with '-b'");
272                 return -EINVAL;
273         }
274
275         return 1;
276 }
277
278 static int mount_all(const char *dest) {
279
280         typedef struct MountPoint {
281                 const char *what;
282                 const char *where;
283                 const char *type;
284                 const char *options;
285                 unsigned long flags;
286                 bool fatal;
287         } MountPoint;
288
289         static const MountPoint mount_table[] = {
290                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
291                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
292                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
293                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
294                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
295                 { "/dev/pts",  "/dev/pts",  NULL,    NULL,       MS_BIND,                      true  },
296                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
297                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
298 #ifdef HAVE_SELINUX
299                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
300                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
301 #endif
302         };
303
304         unsigned k;
305         int r = 0;
306
307         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
308                 char _cleanup_free_ *where = NULL;
309                 int t;
310
311                 if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
312                         log_oom();
313
314                         if (r == 0)
315                                 r = -ENOMEM;
316
317                         break;
318                 }
319
320                 t = path_is_mount_point(where, true);
321                 if (t < 0) {
322                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
323
324                         if (r == 0)
325                                 r = t;
326
327                         continue;
328                 }
329
330                 /* Skip this entry if it is not a remount. */
331                 if (mount_table[k].what && t > 0)
332                         continue;
333
334                 mkdir_p_label(where, 0755);
335
336                 if (mount(mount_table[k].what,
337                           where,
338                           mount_table[k].type,
339                           mount_table[k].flags,
340                           mount_table[k].options) < 0 &&
341                     mount_table[k].fatal) {
342
343                         log_error("mount(%s) failed: %m", where);
344
345                         if (r == 0)
346                                 r = -errno;
347                 }
348         }
349
350         return r;
351 }
352
353 static int setup_timezone(const char *dest) {
354         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
355         char *z, *y;
356         int r;
357
358         assert(dest);
359
360         /* Fix the timezone, if possible */
361         r = readlink_malloc("/etc/localtime", &p);
362         if (r < 0) {
363                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
364                 return 0;
365         }
366
367         z = path_startswith(p, "../usr/share/zoneinfo/");
368         if (!z)
369                 z = path_startswith(p, "/usr/share/zoneinfo/");
370         if (!z) {
371                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
372                 return 0;
373         }
374
375         where = strappend(dest, "/etc/localtime");
376         if (!where)
377                 return log_oom();
378
379         r = readlink_malloc(where, &q);
380         if (r >= 0) {
381                 y = path_startswith(q, "../usr/share/zoneinfo/");
382                 if (!y)
383                         y = path_startswith(q, "/usr/share/zoneinfo/");
384
385
386                 /* Already pointing to the right place? Then do nothing .. */
387                 if (y && streq(y, z))
388                         return 0;
389         }
390
391         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
392         if (!check)
393                 return log_oom();
394
395         if (access(check, F_OK) < 0) {
396                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
397                 return 0;
398         }
399
400         what = strappend("../usr/share/zoneinfo/", z);
401         if (!what)
402                 return log_oom();
403
404         unlink(where);
405         if (symlink(what, where) < 0) {
406                 log_error("Failed to correct timezone of container: %m");
407                 return 0;
408         }
409
410         return 0;
411 }
412
413 static int setup_resolv_conf(const char *dest) {
414         char *where;
415
416         assert(dest);
417
418         if (arg_private_network)
419                 return 0;
420
421         /* Fix resolv.conf, if possible */
422         where = strappend(dest, "/etc/resolv.conf");
423         if (!where)
424                 return log_oom();
425
426         /* We don't really care for the results of this really. If it
427          * fails, it fails, but meh... */
428         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
429                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
430
431         free(where);
432
433         return 0;
434 }
435
436 static int setup_boot_id(const char *dest) {
437         char _cleanup_free_ *from = NULL, *to = NULL;
438         sd_id128_t rnd;
439         char as_uuid[37];
440         int r;
441
442         assert(dest);
443
444         /* Generate a new randomized boot ID, so that each boot-up of
445          * the container gets a new one */
446
447         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
448         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
449         if (!from || !to)
450                 return log_oom();
451
452         r = sd_id128_randomize(&rnd);
453         if (r < 0) {
454                 log_error("Failed to generate random boot id: %s", strerror(-r));
455                 return r;
456         }
457
458         snprintf(as_uuid, sizeof(as_uuid),
459                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
460                  SD_ID128_FORMAT_VAL(rnd));
461         char_array_0(as_uuid);
462
463         r = write_one_line_file(from, as_uuid);
464         if (r < 0) {
465                 log_error("Failed to write boot id: %s", strerror(-r));
466                 return r;
467         }
468
469         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
470                 log_error("Failed to bind mount boot id: %m");
471                 r = -errno;
472         } else
473                 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
474
475         unlink(from);
476         return r;
477 }
478
479 static int copy_devnodes(const char *dest) {
480
481         static const char devnodes[] =
482                 "null\0"
483                 "zero\0"
484                 "full\0"
485                 "random\0"
486                 "urandom\0"
487                 "tty\0"
488                 "ptmx\0";
489
490         const char *d;
491         int r = 0;
492         mode_t _cleanup_umask_ u;
493
494         assert(dest);
495
496         u = umask(0000);
497
498         NULSTR_FOREACH(d, devnodes) {
499                 struct stat st;
500                 char _cleanup_free_ *from = NULL, *to = NULL;
501
502                 asprintf(&from, "/dev/%s", d);
503                 asprintf(&to, "%s/dev/%s", dest, d);
504
505                 if (!from || !to) {
506                         log_oom();
507
508                         if (r == 0)
509                                 r = -ENOMEM;
510
511                         break;
512                 }
513
514                 if (stat(from, &st) < 0) {
515
516                         if (errno != ENOENT) {
517                                 log_error("Failed to stat %s: %m", from);
518                                 if (r == 0)
519                                         r = -errno;
520                         }
521
522                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
523
524                         log_error("%s is not a char or block device, cannot copy", from);
525                         if (r == 0)
526                                 r = -EIO;
527
528                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
529
530                         log_error("mknod(%s) failed: %m", dest);
531                         if (r == 0)
532                                 r = -errno;
533                 }
534         }
535
536         return r;
537 }
538
539 static int setup_dev_console(const char *dest, const char *console) {
540         struct stat st;
541         char _cleanup_free_ *to = NULL;
542         int r;
543         mode_t _cleanup_umask_ u;
544
545         assert(dest);
546         assert(console);
547
548         u = umask(0000);
549
550         if (stat(console, &st) < 0) {
551                 log_error("Failed to stat %s: %m", console);
552                 return -errno;
553
554         } else if (!S_ISCHR(st.st_mode)) {
555                 log_error("/dev/console is not a char device");
556                 return -EIO;
557         }
558
559         r = chmod_and_chown(console, 0600, 0, 0);
560         if (r < 0) {
561                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
562                 return r;
563         }
564
565         if (asprintf(&to, "%s/dev/console", dest) < 0)
566                 return log_oom();
567
568         /* We need to bind mount the right tty to /dev/console since
569          * ptys can only exist on pts file systems. To have something
570          * to bind mount things on we create a device node first, that
571          * has the right major/minor (note that the major minor
572          * doesn't actually matter here, since we mount it over
573          * anyway). */
574
575         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
576                 log_error("mknod() for /dev/console failed: %m");
577                 return -errno;
578         }
579
580         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
581                 log_error("Bind mount for /dev/console failed: %m");
582                 return -errno;
583         }
584
585         return 0;
586 }
587
588 static int setup_kmsg(const char *dest, int kmsg_socket) {
589         char _cleanup_free_ *from = NULL, *to = NULL;
590         int r, fd, k;
591         mode_t _cleanup_umask_ u;
592         union {
593                 struct cmsghdr cmsghdr;
594                 uint8_t buf[CMSG_SPACE(sizeof(int))];
595         } control;
596         struct msghdr mh;
597         struct cmsghdr *cmsg;
598
599         assert(dest);
600         assert(kmsg_socket >= 0);
601
602         u = umask(0000);
603
604         /* We create the kmsg FIFO as /dev/kmsg, but immediately
605          * delete it after bind mounting it to /proc/kmsg. While FIFOs
606          * on the reading side behave very similar to /proc/kmsg,
607          * their writing side behaves differently from /dev/kmsg in
608          * that writing blocks when nothing is reading. In order to
609          * avoid any problems with containers deadlocking due to this
610          * we simply make /dev/kmsg unavailable to the container. */
611         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
612             asprintf(&to, "%s/proc/kmsg", dest) < 0)
613                 return log_oom();
614
615         if (mkfifo(from, 0600) < 0) {
616                 log_error("mkfifo() for /dev/kmsg failed: %m");
617                 return -errno;
618         }
619
620         r = chmod_and_chown(from, 0600, 0, 0);
621         if (r < 0) {
622                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
623                 return r;
624         }
625
626         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
627                 log_error("Bind mount for /proc/kmsg failed: %m");
628                 return -errno;
629         }
630
631         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
632         if (fd < 0) {
633                 log_error("Failed to open fifo: %m");
634                 return -errno;
635         }
636
637         zero(mh);
638         zero(control);
639
640         mh.msg_control = &control;
641         mh.msg_controllen = sizeof(control);
642
643         cmsg = CMSG_FIRSTHDR(&mh);
644         cmsg->cmsg_level = SOL_SOCKET;
645         cmsg->cmsg_type = SCM_RIGHTS;
646         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
647         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
648
649         mh.msg_controllen = cmsg->cmsg_len;
650
651         /* Store away the fd in the socket, so that it stays open as
652          * long as we run the child */
653         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
654         close_nointr_nofail(fd);
655
656         if (k < 0) {
657                 log_error("Failed to send FIFO fd: %m");
658                 return -errno;
659         }
660
661         /* And now make the FIFO unavailable as /dev/kmsg... */
662         unlink(from);
663         return 0;
664 }
665
666 static int setup_hostname(void) {
667         char *hn;
668         int r = 0;
669
670         hn = path_get_file_name(arg_directory);
671         if (hn) {
672                 hn = strdup(hn);
673                 if (!hn)
674                         return -ENOMEM;
675
676                 hostname_cleanup(hn);
677
678                 if (!isempty(hn))
679                         if (sethostname(hn, strlen(hn)) < 0)
680                                 r = -errno;
681
682                 free(hn);
683         }
684
685         return r;
686 }
687
688 static int setup_journal(const char *directory) {
689         sd_id128_t machine_id;
690         char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
691         char *id;
692         int r;
693
694         if (arg_link_journal == LINK_NO)
695                 return 0;
696
697         p = strappend(directory, "/etc/machine-id");
698         if (!p)
699                 return log_oom();
700
701         r = read_one_line_file(p, &b);
702         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
703                 return 0;
704         else if (r < 0) {
705                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
706                 return r;
707         }
708
709         id = strstrip(b);
710         if (isempty(id) && arg_link_journal == LINK_AUTO)
711                 return 0;
712
713         /* Verify validity */
714         r = sd_id128_from_string(id, &machine_id);
715         if (r < 0) {
716                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
717                 return r;
718         }
719
720         free(p);
721         p = strappend("/var/log/journal/", id);
722         q = strjoin(directory, "/var/log/journal/", id, NULL);
723         if (!p || !q)
724                 return log_oom();
725
726         if (path_is_mount_point(p, false) > 0) {
727                 if (arg_link_journal != LINK_AUTO) {
728                         log_error("%s: already a mount point, refusing to use for journal", p);
729                         return -EEXIST;
730                 }
731
732                 return 0;
733         }
734
735         if (path_is_mount_point(q, false) > 0) {
736                 if (arg_link_journal != LINK_AUTO) {
737                         log_error("%s: already a mount point, refusing to use for journal", q);
738                         return -EEXIST;
739                 }
740
741                 return 0;
742         }
743
744         r = readlink_and_make_absolute(p, &d);
745         if (r >= 0) {
746                 if ((arg_link_journal == LINK_GUEST ||
747                      arg_link_journal == LINK_AUTO) &&
748                     path_equal(d, q)) {
749
750                         r = mkdir_p(q, 0755);
751                         if (r < 0)
752                                 log_warning("failed to create directory %s: %m", q);
753                         return 0;
754                 }
755
756                 if (unlink(p) < 0) {
757                         log_error("Failed to remove symlink %s: %m", p);
758                         return -errno;
759                 }
760         } else if (r == -EINVAL) {
761
762                 if (arg_link_journal == LINK_GUEST &&
763                     rmdir(p) < 0) {
764
765                         if (errno == ENOTDIR) {
766                                 log_error("%s already exists and is neither a symlink nor a directory", p);
767                                 return r;
768                         } else {
769                                 log_error("Failed to remove %s: %m", p);
770                                 return -errno;
771                         }
772                 }
773         } else if (r != -ENOENT) {
774                 log_error("readlink(%s) failed: %m", p);
775                 return r;
776         }
777
778         if (arg_link_journal == LINK_GUEST) {
779
780                 if (symlink(q, p) < 0) {
781                         log_error("Failed to symlink %s to %s: %m", q, p);
782                         return -errno;
783                 }
784
785                 r = mkdir_p(q, 0755);
786                 if (r < 0)
787                         log_warning("failed to create directory %s: %m", q);
788                 return 0;
789         }
790
791         if (arg_link_journal == LINK_HOST) {
792                 r = mkdir_p(p, 0755);
793                 if (r < 0) {
794                         log_error("Failed to create %s: %m", p);
795                         return r;
796                 }
797
798         } else if (access(p, F_OK) < 0)
799                 return 0;
800
801         if (dir_is_empty(q) == 0) {
802                 log_error("%s not empty.", q);
803                 return -ENOTEMPTY;
804         }
805
806         r = mkdir_p(q, 0755);
807         if (r < 0) {
808                 log_error("Failed to create %s: %m", q);
809                 return r;
810         }
811
812         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
813                 log_error("Failed to bind mount journal from host into guest: %m");
814                 return -errno;
815         }
816
817         return 0;
818 }
819
820 static int drop_capabilities(void) {
821         return capability_bounding_set_drop(~arg_retain, false);
822 }
823
/* Checks whether path looks like the root of an OS tree, using the
 * existence of <path>/bin/sh as the indicator.
 *
 * Returns 1 if the file is accessible, 0 if not, -ENOMEM if the path
 * could not be allocated. */
static int is_os_tree(const char *path) {
        char *shell = NULL;
        int found;

        /* We use /bin/sh as flag file if something is an OS */
        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found;
}
837
838 static int process_pty(int master, pid_t pid, sigset_t *mask) {
839
840         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
841         size_t in_buffer_full = 0, out_buffer_full = 0;
842         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
843         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
844         int ep = -1, signal_fd = -1, r;
845         bool tried_orderly_shutdown = false;
846
847         assert(master >= 0);
848         assert(pid > 0);
849         assert(mask);
850
851         fd_nonblock(STDIN_FILENO, 1);
852         fd_nonblock(STDOUT_FILENO, 1);
853         fd_nonblock(master, 1);
854
855         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
856         if (signal_fd < 0) {
857                 log_error("signalfd(): %m");
858                 r = -errno;
859                 goto finish;
860         }
861
862         ep = epoll_create1(EPOLL_CLOEXEC);
863         if (ep < 0) {
864                 log_error("Failed to create epoll: %m");
865                 r = -errno;
866                 goto finish;
867         }
868
869         /* We read from STDIN only if this is actually a TTY,
870          * otherwise we assume non-interactivity. */
871         if (isatty(STDIN_FILENO)) {
872                 zero(stdin_ev);
873                 stdin_ev.events = EPOLLIN|EPOLLET;
874                 stdin_ev.data.fd = STDIN_FILENO;
875
876                 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
877                         log_error("Failed to register STDIN in epoll: %m");
878                         r = -errno;
879                         goto finish;
880                 }
881         }
882
883         zero(stdout_ev);
884         stdout_ev.events = EPOLLOUT|EPOLLET;
885         stdout_ev.data.fd = STDOUT_FILENO;
886
887         zero(master_ev);
888         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
889         master_ev.data.fd = master;
890
891         zero(signal_ev);
892         signal_ev.events = EPOLLIN;
893         signal_ev.data.fd = signal_fd;
894
895         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
896                 if (errno != EPERM) {
897                         log_error("Failed to register stdout in epoll: %m");
898                         r = -errno;
899                         goto finish;
900                 }
901                 /* stdout without epoll support. Likely redirected to regular file. */
902                 stdout_writable = true;
903         }
904
905         if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
906             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
907                 log_error("Failed to register fds in epoll: %m");
908                 r = -errno;
909                 goto finish;
910         }
911
912         for (;;) {
913                 struct epoll_event ev[16];
914                 ssize_t k;
915                 int i, nfds;
916
917                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
918                 if (nfds < 0) {
919
920                         if (errno == EINTR || errno == EAGAIN)
921                                 continue;
922
923                         log_error("epoll_wait(): %m");
924                         r = -errno;
925                         goto finish;
926                 }
927
928                 assert(nfds >= 1);
929
930                 for (i = 0; i < nfds; i++) {
931                         if (ev[i].data.fd == STDIN_FILENO) {
932
933                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
934                                         stdin_readable = true;
935
936                         } else if (ev[i].data.fd == STDOUT_FILENO) {
937
938                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
939                                         stdout_writable = true;
940
941                         } else if (ev[i].data.fd == master) {
942
943                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
944                                         master_readable = true;
945
946                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
947                                         master_writable = true;
948
949                         } else if (ev[i].data.fd == signal_fd) {
950                                 struct signalfd_siginfo sfsi;
951                                 ssize_t n;
952
953                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
954                                 if (n != sizeof(sfsi)) {
955
956                                         if (n >= 0) {
957                                                 log_error("Failed to read from signalfd: invalid block size");
958                                                 r = -EIO;
959                                                 goto finish;
960                                         }
961
962                                         if (errno != EINTR && errno != EAGAIN) {
963                                                 log_error("Failed to read from signalfd: %m");
964                                                 r = -errno;
965                                                 goto finish;
966                                         }
967                                 } else {
968
969                                         if (sfsi.ssi_signo == SIGWINCH) {
970                                                 struct winsize ws;
971
972                                                 /* The window size changed, let's forward that. */
973                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
974                                                         ioctl(master, TIOCSWINSZ, &ws);
975                                         } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
976
977                                                 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
978
979                                                 /* This only works for systemd... */
980                                                 tried_orderly_shutdown = true;
981                                                 kill(pid, SIGRTMIN+3);
982
983                                         } else {
984                                                 r = 0;
985                                                 goto finish;
986                                         }
987                                 }
988                         }
989                 }
990
991                 while ((stdin_readable && in_buffer_full <= 0) ||
992                        (master_writable && in_buffer_full > 0) ||
993                        (master_readable && out_buffer_full <= 0) ||
994                        (stdout_writable && out_buffer_full > 0)) {
995
996                         if (stdin_readable && in_buffer_full < LINE_MAX) {
997
998                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
999                                 if (k < 0) {
1000
1001                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1002                                                 stdin_readable = false;
1003                                         else {
1004                                                 log_error("read(): %m");
1005                                                 r = -errno;
1006                                                 goto finish;
1007                                         }
1008                                 } else
1009                                         in_buffer_full += (size_t) k;
1010                         }
1011
1012                         if (master_writable && in_buffer_full > 0) {
1013
1014                                 k = write(master, in_buffer, in_buffer_full);
1015                                 if (k < 0) {
1016
1017                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1018                                                 master_writable = false;
1019                                         else {
1020                                                 log_error("write(): %m");
1021                                                 r = -errno;
1022                                                 goto finish;
1023                                         }
1024
1025                                 } else {
1026                                         assert(in_buffer_full >= (size_t) k);
1027                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1028                                         in_buffer_full -= k;
1029                                 }
1030                         }
1031
1032                         if (master_readable && out_buffer_full < LINE_MAX) {
1033
1034                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1035                                 if (k < 0) {
1036
1037                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1038                                                 master_readable = false;
1039                                         else {
1040                                                 log_error("read(): %m");
1041                                                 r = -errno;
1042                                                 goto finish;
1043                                         }
1044                                 }  else
1045                                         out_buffer_full += (size_t) k;
1046                         }
1047
1048                         if (stdout_writable && out_buffer_full > 0) {
1049
1050                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1051                                 if (k < 0) {
1052
1053                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1054                                                 stdout_writable = false;
1055                                         else {
1056                                                 log_error("write(): %m");
1057                                                 r = -errno;
1058                                                 goto finish;
1059                                         }
1060
1061                                 } else {
1062                                         assert(out_buffer_full >= (size_t) k);
1063                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1064                                         out_buffer_full -= k;
1065                                 }
1066                         }
1067                 }
1068         }
1069
1070 finish:
1071         if (ep >= 0)
1072                 close_nointr_nofail(ep);
1073
1074         if (signal_fd >= 0)
1075                 close_nointr_nofail(signal_fd);
1076
1077         return r;
1078 }
1079
1080 int main(int argc, char *argv[]) {
1081         pid_t pid = 0;
1082         int r = EXIT_FAILURE, k;
1083         char *oldcg = NULL, *newcg = NULL;
1084         char **controller = NULL;
1085         int master = -1, n_fd_passed;
1086         const char *console = NULL;
1087         struct termios saved_attr, raw_attr;
1088         sigset_t mask;
1089         bool saved_attr_valid = false;
1090         struct winsize ws;
1091         int kmsg_socket_pair[2] = { -1, -1 };
1092         FDSet *fds = NULL;
1093
1094         log_parse_environment();
1095         log_open();
1096
1097         r = parse_argv(argc, argv);
1098         if (r <= 0)
1099                 goto finish;
1100
1101         if (arg_directory) {
1102                 char *p;
1103
1104                 p = path_make_absolute_cwd(arg_directory);
1105                 free(arg_directory);
1106                 arg_directory = p;
1107         } else
1108                 arg_directory = get_current_dir_name();
1109
1110         if (!arg_directory) {
1111                 log_error("Failed to determine path");
1112                 goto finish;
1113         }
1114
1115         path_kill_slashes(arg_directory);
1116
1117         if (geteuid() != 0) {
1118                 log_error("Need to be root.");
1119                 goto finish;
1120         }
1121
1122         if (sd_booted() <= 0) {
1123                 log_error("Not running on a systemd system.");
1124                 goto finish;
1125         }
1126
1127         if (path_equal(arg_directory, "/")) {
1128                 log_error("Spawning container on root directory not supported.");
1129                 goto finish;
1130         }
1131
1132         if (is_os_tree(arg_directory) <= 0) {
1133                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1134                 goto finish;
1135         }
1136
1137         log_close();
1138         n_fd_passed = sd_listen_fds(false);
1139         if (n_fd_passed > 0) {
1140                 k = fdset_new_listen_fds(&fds, false);
1141                 if (k < 0) {
1142                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1143                         goto finish;
1144                 }
1145         }
1146         fdset_close_others(fds);
1147         log_open();
1148
1149         k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1150         if (k < 0) {
1151                 log_error("Failed to determine current cgroup: %s", strerror(-k));
1152                 goto finish;
1153         }
1154
1155         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1156                 log_error("Failed to allocate cgroup path.");
1157                 goto finish;
1158         }
1159
1160         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1161         if (k < 0)  {
1162                 log_error("Failed to create cgroup: %s", strerror(-k));
1163                 goto finish;
1164         }
1165
1166         STRV_FOREACH(controller, arg_controllers) {
1167                 k = cg_create_and_attach(*controller, newcg, 0);
1168                 if (k < 0)
1169                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1170         }
1171
1172         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1173         if (master < 0) {
1174                 log_error("Failed to acquire pseudo tty: %m");
1175                 goto finish;
1176         }
1177
1178         console = ptsname(master);
1179         if (!console) {
1180                 log_error("Failed to determine tty name: %m");
1181                 goto finish;
1182         }
1183
1184         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1185
1186         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1187                 ioctl(master, TIOCSWINSZ, &ws);
1188
1189         if (unlockpt(master) < 0) {
1190                 log_error("Failed to unlock tty: %m");
1191                 goto finish;
1192         }
1193
1194         if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1195                 saved_attr_valid = true;
1196
1197                 raw_attr = saved_attr;
1198                 cfmakeraw(&raw_attr);
1199                 raw_attr.c_lflag &= ~ECHO;
1200         }
1201
1202         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1203                 log_error("Failed to create kmsg socket pair");
1204                 goto finish;
1205         }
1206
1207         assert_se(sigemptyset(&mask) == 0);
1208         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1209         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1210
1211         for (;;) {
1212                 siginfo_t status;
1213                 int pipefd[2];
1214
1215                 if(pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1216                         log_error("pipe2(): %m");
1217                         goto finish;
1218                 }
1219
1220                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1221                 if (pid < 0) {
1222                         if (errno == EINVAL)
1223                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1224                         else
1225                                 log_error("clone() failed: %m");
1226
1227                         goto finish;
1228                 }
1229
1230                 if (pid == 0) {
1231                         /* child */
1232                         const char *home = NULL;
1233                         uid_t uid = (uid_t) -1;
1234                         gid_t gid = (gid_t) -1;
1235                         unsigned n_env = 0;
1236                         const char *envp[] = {
1237                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1238                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1239                                 NULL, /* TERM */
1240                                 NULL, /* HOME */
1241                                 NULL, /* USER */
1242                                 NULL, /* LOGNAME */
1243                                 NULL, /* container_uuid */
1244                                 NULL, /* LISTEN_FDS */
1245                                 NULL, /* LISTEN_PID */
1246                                 NULL
1247                         };
1248
1249                         envp[2] = strv_find_prefix(environ, "TERM=");
1250                         n_env = 3;
1251
1252                         close_nointr_nofail(pipefd[1]);
1253                         fd_wait_for_event(pipefd[0], POLLHUP, -1);
1254                         close_nointr_nofail(pipefd[0]);
1255
1256                         close_nointr_nofail(master);
1257                         master = -1;
1258
1259                         if (saved_attr_valid) {
1260                                 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1261                                         log_error("Failed to set terminal attributes: %m");
1262                                         goto child_fail;
1263                                 }
1264                         }
1265
1266                         close_nointr(STDIN_FILENO);
1267                         close_nointr(STDOUT_FILENO);
1268                         close_nointr(STDERR_FILENO);
1269
1270                         close_nointr_nofail(kmsg_socket_pair[0]);
1271                         kmsg_socket_pair[0] = -1;
1272
1273                         reset_all_signal_handlers();
1274
1275                         assert_se(sigemptyset(&mask) == 0);
1276                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1277
1278                         k = open_terminal(console, O_RDWR);
1279                         if (k != STDIN_FILENO) {
1280                                 if (k >= 0) {
1281                                         close_nointr_nofail(k);
1282                                         k = -EINVAL;
1283                                 }
1284
1285                                 log_error("Failed to open console: %s", strerror(-k));
1286                                 goto child_fail;
1287                         }
1288
1289                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1290                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1291                                 log_error("Failed to duplicate console: %m");
1292                                 goto child_fail;
1293                         }
1294
1295                         if (setsid() < 0) {
1296                                 log_error("setsid() failed: %m");
1297                                 goto child_fail;
1298                         }
1299
1300                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1301                                 log_error("PR_SET_PDEATHSIG failed: %m");
1302                                 goto child_fail;
1303                         }
1304
1305                         /* Mark everything as slave, so that we still
1306                          * receive mounts from the real root, but don't
1307                          * propagate mounts to the real root. */
1308                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1309                                 log_error("MS_SLAVE|MS_REC failed: %m");
1310                                 goto child_fail;
1311                         }
1312
1313                         /* Turn directory into bind mount */
1314                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1315                                 log_error("Failed to make bind mount.");
1316                                 goto child_fail;
1317                         }
1318
1319                         if (arg_read_only)
1320                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1321                                         log_error("Failed to make read-only.");
1322                                         goto child_fail;
1323                                 }
1324
1325                         if (mount_all(arg_directory) < 0)
1326                                 goto child_fail;
1327
1328                         if (copy_devnodes(arg_directory) < 0)
1329                                 goto child_fail;
1330
1331                         dev_setup(arg_directory);
1332
1333                         if (setup_dev_console(arg_directory, console) < 0)
1334                                 goto child_fail;
1335
1336                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1337                                 goto child_fail;
1338
1339                         close_nointr_nofail(kmsg_socket_pair[1]);
1340                         kmsg_socket_pair[1] = -1;
1341
1342                         if (setup_boot_id(arg_directory) < 0)
1343                                 goto child_fail;
1344
1345                         if (setup_timezone(arg_directory) < 0)
1346                                 goto child_fail;
1347
1348                         if (setup_resolv_conf(arg_directory) < 0)
1349                                 goto child_fail;
1350
1351                         if (setup_journal(arg_directory) < 0)
1352                                 goto child_fail;
1353
1354                         if (chdir(arg_directory) < 0) {
1355                                 log_error("chdir(%s) failed: %m", arg_directory);
1356                                 goto child_fail;
1357                         }
1358
1359                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1360                                 log_error("mount(MS_MOVE) failed: %m");
1361                                 goto child_fail;
1362                         }
1363
1364                         if (chroot(".") < 0) {
1365                                 log_error("chroot() failed: %m");
1366                                 goto child_fail;
1367                         }
1368
1369                         if (chdir("/") < 0) {
1370                                 log_error("chdir() failed: %m");
1371                                 goto child_fail;
1372                         }
1373
1374                         umask(0022);
1375
1376                         loopback_setup();
1377
1378                         if (drop_capabilities() < 0) {
1379                                 log_error("drop_capabilities() failed: %m");
1380                                 goto child_fail;
1381                         }
1382
1383                         if (arg_user) {
1384
1385                                 /* Note that this resolves user names
1386                                  * inside the container, and hence
1387                                  * accesses the NSS modules from the
1388                                  * container and not the host. This is
1389                                  * a bit weird... */
1390
1391                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1392                                         log_error("get_user_creds() failed: %m");
1393                                         goto child_fail;
1394                                 }
1395
1396                                 if (mkdir_parents_label(home, 0775) < 0) {
1397                                         log_error("mkdir_parents_label() failed: %m");
1398                                         goto child_fail;
1399                                 }
1400
1401                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1402                                         log_error("mkdir_safe_label() failed: %m");
1403                                         goto child_fail;
1404                                 }
1405
1406                                 if (initgroups((const char*)arg_user, gid) < 0) {
1407                                         log_error("initgroups() failed: %m");
1408                                         goto child_fail;
1409                                 }
1410
1411                                 if (setresgid(gid, gid, gid) < 0) {
1412                                         log_error("setregid() failed: %m");
1413                                         goto child_fail;
1414                                 }
1415
1416                                 if (setresuid(uid, uid, uid) < 0) {
1417                                         log_error("setreuid() failed: %m");
1418                                         goto child_fail;
1419                                 }
1420                         } else {
1421                                 /* Reset everything fully to 0, just in case */
1422
1423                                 if (setgroups(0, NULL) < 0) {
1424                                         log_error("setgroups() failed: %m");
1425                                         goto child_fail;
1426                                 }
1427
1428                                 if (setresgid(0, 0, 0) < 0) {
1429                                         log_error("setregid() failed: %m");
1430                                         goto child_fail;
1431                                 }
1432
1433                                 if (setresuid(0, 0, 0) < 0) {
1434                                         log_error("setreuid() failed: %m");
1435                                         goto child_fail;
1436                                 }
1437                         }
1438
1439                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1440                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1441                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1442                                 log_oom();
1443                                 goto child_fail;
1444                         }
1445
1446                         if (arg_uuid) {
1447                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1448                                         log_oom();
1449                                         goto child_fail;
1450                                 }
1451                         }
1452
1453                         if (fdset_size(fds) > 0) {
1454                                 k = fdset_cloexec(fds, false);
1455                                 if (k < 0) {
1456                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1457                                         goto child_fail;
1458                                 }
1459
1460                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1461                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) getpid()) < 0)) {
1462                                         log_oom();
1463                                         goto child_fail;
1464                                 }
1465                         }
1466
1467                         setup_hostname();
1468
1469                         if (arg_boot) {
1470                                 char **a;
1471                                 size_t l;
1472
1473                                 /* Automatically search for the init system */
1474
1475                                 l = 1 + argc - optind;
1476                                 a = newa(char*, l + 1);
1477                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1478
1479                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1480                                 execve(a[0], a, (char**) envp);
1481
1482                                 a[0] = (char*) "/lib/systemd/systemd";
1483                                 execve(a[0], a, (char**) envp);
1484
1485                                 a[0] = (char*) "/sbin/init";
1486                                 execve(a[0], a, (char**) envp);
1487                         } else if (argc > optind)
1488                                 execvpe(argv[optind], argv + optind, (char**) envp);
1489                         else {
1490                                 chdir(home ? home : "/root");
1491                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1492                         }
1493
1494                         log_error("execv() failed: %m");
1495
1496                 child_fail:
1497                         _exit(EXIT_FAILURE);
1498                 }
1499
1500                 log_info("Init process in the container running as PID %d", pid);
1501                 close_nointr_nofail(pipefd[0]);
1502                 close_nointr_nofail(pipefd[1]);
1503
1504                 fdset_free(fds);
1505                 fds = NULL;
1506
1507                 if (process_pty(master, pid, &mask) < 0)
1508                         goto finish;
1509
1510                 if (saved_attr_valid)
1511                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1512
1513                 r = wait_for_terminate(pid, &status);
1514                 if (r < 0) {
1515                         r = EXIT_FAILURE;
1516                         break;
1517                 }
1518
1519                 if (status.si_code == CLD_EXITED) {
1520                         if (status.si_status != 0) {
1521                                 log_error("Container failed with error code %i.", status.si_status);
1522                                 r = status.si_status;
1523                                 break;
1524                         }
1525
1526                         log_debug("Container exited successfully.");
1527                         break;
1528                 } else if (status.si_code == CLD_KILLED &&
1529                            status.si_status == SIGINT) {
1530                         log_info("Container has been shut down.");
1531                         r = 0;
1532                         break;
1533                 } else if (status.si_code == CLD_KILLED &&
1534                            status.si_status == SIGHUP) {
1535                         log_info("Container is being rebooted.");
1536                         continue;
1537                 } else if (status.si_code == CLD_KILLED ||
1538                            status.si_code == CLD_DUMPED) {
1539
1540                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1541                         r = EXIT_FAILURE;
1542                         break;
1543                 } else {
1544                         log_error("Container failed due to unknown reason.");
1545                         r = EXIT_FAILURE;
1546                         break;
1547                 }
1548         }
1549
1550 finish:
1551         if (saved_attr_valid)
1552                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1553
1554         if (master >= 0)
1555                 close_nointr_nofail(master);
1556
1557         close_pipe(kmsg_socket_pair);
1558
1559         if (oldcg)
1560                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1561
1562         if (newcg)
1563                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1564
1565         free(arg_directory);
1566         strv_free(arg_controllers);
1567         free(oldcg);
1568         free(newcg);
1569
1570         fdset_free(fds);
1571
1572         return r;
1573 }