chiark / gitweb /
d772b478fd8fd3cc6f51defcb32e771f2778b967
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <attr/xattr.h>
26 #include <sys/types.h>
27 #include <sys/syscall.h>
28 #include <sys/mount.h>
29 #include <sys/wait.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <stdio.h>
33 #include <errno.h>
34 #include <sys/prctl.h>
35 #include <sys/capability.h>
36 #include <getopt.h>
37 #include <sys/poll.h>
38 #include <sys/epoll.h>
39 #include <termios.h>
40 #include <sys/signalfd.h>
41 #include <grp.h>
42 #include <linux/fs.h>
43 #include <sys/un.h>
44 #include <sys/socket.h>
45
46 #include <systemd/sd-daemon.h>
47
48 #include "log.h"
49 #include "util.h"
50 #include "mkdir.h"
51 #include "macro.h"
52 #include "audit.h"
53 #include "missing.h"
54 #include "cgroup-util.h"
55 #include "strv.h"
56 #include "path-util.h"
57 #include "loopback-setup.h"
58 #include "sd-id128.h"
59 #include "dev-setup.h"
60 #include "fdset.h"
61 #include "build.h"
62 #include "fileio.h"
63
64 #ifndef TTY_GID
65 #define TTY_GID 5
66 #endif
67
/* Journal linking mode, selected on the command line with
 * --link-journal= (or -j); see parse_argv() for the string mapping. */
typedef enum LinkJournal {
        LINK_NO    = 0,   /* "no": setup_journal() does nothing */
        LINK_AUTO  = 1,   /* "auto": the default value of arg_link_journal */
        LINK_HOST  = 2,   /* "host" */
        LINK_GUEST = 3    /* "guest", also what -j selects */
} LinkJournal;
74
75 static char *arg_directory = NULL;
76 static char *arg_user = NULL;
77 static char **arg_controllers = NULL;
78 static char *arg_uuid = NULL;
79 static char *arg_machine = NULL;
80 static bool arg_private_network = false;
81 static bool arg_read_only = false;
82 static bool arg_boot = false;
83 static LinkJournal arg_link_journal = LINK_AUTO;
84 static uint64_t arg_retain =
85         (1ULL << CAP_CHOWN) |
86         (1ULL << CAP_DAC_OVERRIDE) |
87         (1ULL << CAP_DAC_READ_SEARCH) |
88         (1ULL << CAP_FOWNER) |
89         (1ULL << CAP_FSETID) |
90         (1ULL << CAP_IPC_OWNER) |
91         (1ULL << CAP_KILL) |
92         (1ULL << CAP_LEASE) |
93         (1ULL << CAP_LINUX_IMMUTABLE) |
94         (1ULL << CAP_NET_BIND_SERVICE) |
95         (1ULL << CAP_NET_BROADCAST) |
96         (1ULL << CAP_NET_RAW) |
97         (1ULL << CAP_SETGID) |
98         (1ULL << CAP_SETFCAP) |
99         (1ULL << CAP_SETPCAP) |
100         (1ULL << CAP_SETUID) |
101         (1ULL << CAP_SYS_ADMIN) |
102         (1ULL << CAP_SYS_CHROOT) |
103         (1ULL << CAP_SYS_NICE) |
104         (1ULL << CAP_SYS_PTRACE) |
105         (1ULL << CAP_SYS_TTY_CONFIG) |
106         (1ULL << CAP_SYS_RESOURCE) |
107         (1ULL << CAP_SYS_BOOT) |
108         (1ULL << CAP_AUDIT_WRITE) |
109         (1ULL << CAP_AUDIT_CONTROL);
110 static char **arg_bind = NULL;
111 static char **arg_bind_ro = NULL;
112
/* Prints the usage summary to stdout, prefixed with the name the program
 * was invoked as. Always returns 0.
 *
 * Keep the option descriptions in sync with parse_argv(): -j selects
 * LINK_GUEST there, hence it is documented as --link-journal=guest. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                Show this help\n"
               "     --version             Print version string\n"
               "  -D --directory=NAME      Root directory for the container\n"
               "  -b --boot                Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER           Run the command under specified user or uid\n"
               "  -C --controllers=LIST    Put the container in specified comma-separated\n"
               "                           cgroup hierarchies\n"
               "     --uuid=UUID           Set a specific machine UUID for the container\n"
               "  -M --machine=NAME        Set the machine name for the container\n"
               "     --private-network     Disable network in container\n"
               "     --read-only           Mount the root directory read-only\n"
               "     --capability=CAP      In addition to the default, retain specified\n"
               "                           capability\n"
               "     --link-journal=MODE   Link up guest journal, one of no, auto, guest, host\n"
               "  -j                       Equivalent to --link-journal=guest\n"
               "     --bind=PATH[:PATH]    Bind mount a file or directory from the host into\n"
               "                           the container\n"
               "     --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
               program_invocation_short_name);

        return 0;
}
139
140 static int parse_argv(int argc, char *argv[]) {
141
142         enum {
143                 ARG_VERSION = 0x100,
144                 ARG_PRIVATE_NETWORK,
145                 ARG_UUID,
146                 ARG_READ_ONLY,
147                 ARG_CAPABILITY,
148                 ARG_LINK_JOURNAL,
149                 ARG_BIND,
150                 ARG_BIND_RO
151         };
152
153         static const struct option options[] = {
154                 { "help",            no_argument,       NULL, 'h'                 },
155                 { "version",         no_argument,       NULL, ARG_VERSION         },
156                 { "directory",       required_argument, NULL, 'D'                 },
157                 { "user",            required_argument, NULL, 'u'                 },
158                 { "controllers",     required_argument, NULL, 'C'                 },
159                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
160                 { "boot",            no_argument,       NULL, 'b'                 },
161                 { "uuid",            required_argument, NULL, ARG_UUID            },
162                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
163                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
164                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
165                 { "bind",            required_argument, NULL, ARG_BIND            },
166                 { "bind-ro",         required_argument, NULL, ARG_BIND_RO         },
167                 { "machine",         required_argument, NULL, 'M'                 },
168                 { NULL,              0,                 NULL, 0                   }
169         };
170
171         int c;
172
173         assert(argc >= 0);
174         assert(argv);
175
176         while ((c = getopt_long(argc, argv, "+hD:u:C:bM:j", options, NULL)) >= 0) {
177
178                 switch (c) {
179
180                 case 'h':
181                         help();
182                         return 0;
183
184                 case ARG_VERSION:
185                         puts(PACKAGE_STRING);
186                         puts(SYSTEMD_FEATURES);
187                         return 0;
188
189                 case 'D':
190                         free(arg_directory);
191                         arg_directory = canonicalize_file_name(optarg);
192                         if (!arg_directory) {
193                                 log_error("Failed to canonicalize root directory.");
194                                 return -ENOMEM;
195                         }
196
197                         break;
198
199                 case 'u':
200                         free(arg_user);
201                         arg_user = strdup(optarg);
202                         if (!arg_user)
203                                 return log_oom();
204
205                         break;
206
207                 case 'C':
208                         strv_free(arg_controllers);
209                         arg_controllers = strv_split(optarg, ",");
210                         if (!arg_controllers)
211                                 return log_oom();
212
213                         cg_shorten_controllers(arg_controllers);
214                         break;
215
216                 case ARG_PRIVATE_NETWORK:
217                         arg_private_network = true;
218                         break;
219
220                 case 'b':
221                         arg_boot = true;
222                         break;
223
224                 case ARG_UUID:
225                         if (!id128_is_valid(optarg)) {
226                                 log_error("Invalid UUID: %s", optarg);
227                                 return -EINVAL;
228                         }
229
230                         arg_uuid = optarg;
231                         break;
232
233                 case 'M':
234                         if (!hostname_is_valid(optarg)) {
235                                 log_error("Invalid machine name: %s", optarg);
236                                 return -EINVAL;
237                         }
238
239                         free(arg_machine);
240                         arg_machine = strdup(optarg);
241                         if (!arg_machine)
242                                 return log_oom();
243
244                         break;
245
246                 case ARG_READ_ONLY:
247                         arg_read_only = true;
248                         break;
249
250                 case ARG_CAPABILITY: {
251                         char *state, *word;
252                         size_t length;
253
254                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
255                                 cap_value_t cap;
256                                 char *t;
257
258                                 t = strndup(word, length);
259                                 if (!t)
260                                         return log_oom();
261
262                                 if (cap_from_name(t, &cap) < 0) {
263                                         log_error("Failed to parse capability %s.", t);
264                                         free(t);
265                                         return -EINVAL;
266                                 }
267
268                                 free(t);
269                                 arg_retain |= 1ULL << (uint64_t) cap;
270                         }
271
272                         break;
273                 }
274
275                 case 'j':
276                         arg_link_journal = LINK_GUEST;
277                         break;
278
279                 case ARG_LINK_JOURNAL:
280                         if (streq(optarg, "auto"))
281                                 arg_link_journal = LINK_AUTO;
282                         else if (streq(optarg, "no"))
283                                 arg_link_journal = LINK_NO;
284                         else if (streq(optarg, "guest"))
285                                 arg_link_journal = LINK_GUEST;
286                         else if (streq(optarg, "host"))
287                                 arg_link_journal = LINK_HOST;
288                         else {
289                                 log_error("Failed to parse link journal mode %s", optarg);
290                                 return -EINVAL;
291                         }
292
293                         break;
294
295                 case ARG_BIND:
296                 case ARG_BIND_RO: {
297                         _cleanup_free_ char *a = NULL, *b = NULL;
298                         char *e;
299                         char ***x;
300                         int r;
301
302                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
303
304                         e = strchr(optarg, ':');
305                         if (e) {
306                                 a = strndup(optarg, e - optarg);
307                                 b = strdup(e + 1);
308                         } else {
309                                 a = strdup(optarg);
310                                 b = strdup(optarg);
311                         }
312
313                         if (!a || !b)
314                                 return log_oom();
315
316                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
317                                 log_error("Invalid bind mount specification: %s", optarg);
318                                 return -EINVAL;
319                         }
320
321                         r = strv_extend(x, a);
322                         if (r < 0)
323                                 return r;
324
325                         r = strv_extend(x, b);
326                         if (r < 0)
327                                 return r;
328
329                         break;
330                 }
331
332                 case '?':
333                         return -EINVAL;
334
335                 default:
336                         log_error("Unknown option code %c", c);
337                         return -EINVAL;
338                 }
339         }
340
341         return 1;
342 }
343
344 static int mount_all(const char *dest) {
345
346         typedef struct MountPoint {
347                 const char *what;
348                 const char *where;
349                 const char *type;
350                 const char *options;
351                 unsigned long flags;
352                 bool fatal;
353         } MountPoint;
354
355         static const MountPoint mount_table[] = {
356                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
357                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
358                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
359                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
360                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
361                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
362                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
363                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
364 #ifdef HAVE_SELINUX
365                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
366                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
367 #endif
368         };
369
370         unsigned k;
371         int r = 0;
372
373         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
374                 _cleanup_free_ char *where = NULL;
375                 int t;
376
377                 where = strjoin(dest, "/", mount_table[k].where, NULL);
378                 if (!where)
379                         return log_oom();
380
381                 t = path_is_mount_point(where, true);
382                 if (t < 0) {
383                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
384
385                         if (r == 0)
386                                 r = t;
387
388                         continue;
389                 }
390
391                 /* Skip this entry if it is not a remount. */
392                 if (mount_table[k].what && t > 0)
393                         continue;
394
395                 mkdir_p(where, 0755);
396
397                 if (mount(mount_table[k].what,
398                           where,
399                           mount_table[k].type,
400                           mount_table[k].flags,
401                           mount_table[k].options) < 0 &&
402                     mount_table[k].fatal) {
403
404                         log_error("mount(%s) failed: %m", where);
405
406                         if (r == 0)
407                                 r = -errno;
408                 }
409         }
410
411         return r;
412 }
413
414 static int mount_binds(const char *dest, char **l, unsigned long flags) {
415         char **x, **y;
416
417         STRV_FOREACH_PAIR(x, y, l) {
418                 _cleanup_free_ char *where = NULL;
419
420                 where = strjoin(dest, "/", *y, NULL);
421                 if (!where)
422                         return log_oom();
423
424                 mkdir_p_label(where, 0755);
425
426                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
427                         log_error("mount(%s) failed: %m", where);
428                         return -errno;
429                 }
430
431                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
432                         log_error("mount(%s) failed: %m", where);
433                         return -errno;
434                 }
435         }
436
437         return 0;
438 }
439
440 static int setup_timezone(const char *dest) {
441         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
442         char *z, *y;
443         int r;
444
445         assert(dest);
446
447         /* Fix the timezone, if possible */
448         r = readlink_malloc("/etc/localtime", &p);
449         if (r < 0) {
450                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
451                 return 0;
452         }
453
454         z = path_startswith(p, "../usr/share/zoneinfo/");
455         if (!z)
456                 z = path_startswith(p, "/usr/share/zoneinfo/");
457         if (!z) {
458                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
459                 return 0;
460         }
461
462         where = strappend(dest, "/etc/localtime");
463         if (!where)
464                 return log_oom();
465
466         r = readlink_malloc(where, &q);
467         if (r >= 0) {
468                 y = path_startswith(q, "../usr/share/zoneinfo/");
469                 if (!y)
470                         y = path_startswith(q, "/usr/share/zoneinfo/");
471
472
473                 /* Already pointing to the right place? Then do nothing .. */
474                 if (y && streq(y, z))
475                         return 0;
476         }
477
478         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
479         if (!check)
480                 return log_oom();
481
482         if (access(check, F_OK) < 0) {
483                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
484                 return 0;
485         }
486
487         what = strappend("../usr/share/zoneinfo/", z);
488         if (!what)
489                 return log_oom();
490
491         unlink(where);
492         if (symlink(what, where) < 0) {
493                 log_error("Failed to correct timezone of container: %m");
494                 return 0;
495         }
496
497         return 0;
498 }
499
500 static int setup_resolv_conf(const char *dest) {
501         char _cleanup_free_ *where = NULL;
502         _cleanup_close_ int fd = -1;
503
504         assert(dest);
505
506         if (arg_private_network)
507                 return 0;
508
509         /* Fix resolv.conf, if possible */
510         where = strappend(dest, "/etc/resolv.conf");
511         if (!where)
512                 return log_oom();
513
514         fd = open(where, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0644);
515
516         /* We don't really care for the results of this really. If it
517          * fails, it fails, but meh... */
518         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) < 0)
519                 log_warning("Failed to bind mount /etc/resolv.conf: %m");
520         else
521                 if (mount("/etc/resolv.conf", where, "bind",
522                           MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
523                         log_error("Failed to remount /etc/resolv.conf readonly: %m");
524                         return -errno;
525                 }
526
527         return 0;
528 }
529
530 static int setup_boot_id(const char *dest) {
531         _cleanup_free_ char *from = NULL, *to = NULL;
532         sd_id128_t rnd;
533         char as_uuid[37];
534         int r;
535
536         assert(dest);
537
538         /* Generate a new randomized boot ID, so that each boot-up of
539          * the container gets a new one */
540
541         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
542         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
543         if (!from || !to)
544                 return log_oom();
545
546         r = sd_id128_randomize(&rnd);
547         if (r < 0) {
548                 log_error("Failed to generate random boot id: %s", strerror(-r));
549                 return r;
550         }
551
552         snprintf(as_uuid, sizeof(as_uuid),
553                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
554                  SD_ID128_FORMAT_VAL(rnd));
555         char_array_0(as_uuid);
556
557         r = write_string_file(from, as_uuid);
558         if (r < 0) {
559                 log_error("Failed to write boot id: %s", strerror(-r));
560                 return r;
561         }
562
563         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
564                 log_error("Failed to bind mount boot id: %m");
565                 r = -errno;
566         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
567                 log_warning("Failed to make boot id read-only: %m");
568
569         unlink(from);
570         return r;
571 }
572
573 static int copy_devnodes(const char *dest) {
574
575         static const char devnodes[] =
576                 "null\0"
577                 "zero\0"
578                 "full\0"
579                 "random\0"
580                 "urandom\0"
581                 "tty\0";
582
583         const char *d;
584         int r = 0;
585         _cleanup_umask_ mode_t u;
586
587         assert(dest);
588
589         u = umask(0000);
590
591         NULSTR_FOREACH(d, devnodes) {
592                 struct stat st;
593                 _cleanup_free_ char *from = NULL, *to = NULL;
594
595                 asprintf(&from, "/dev/%s", d);
596                 asprintf(&to, "%s/dev/%s", dest, d);
597
598                 if (!from || !to) {
599                         log_oom();
600
601                         if (r == 0)
602                                 r = -ENOMEM;
603
604                         break;
605                 }
606
607                 if (stat(from, &st) < 0) {
608
609                         if (errno != ENOENT) {
610                                 log_error("Failed to stat %s: %m", from);
611                                 if (r == 0)
612                                         r = -errno;
613                         }
614
615                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
616
617                         log_error("%s is not a char or block device, cannot copy", from);
618                         if (r == 0)
619                                 r = -EIO;
620
621                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
622
623                         log_error("mknod(%s) failed: %m", dest);
624                         if (r == 0)
625                                 r = -errno;
626                 }
627         }
628
629         return r;
630 }
631
632 static int setup_ptmx(const char *dest) {
633         _cleanup_free_ char *p = NULL;
634
635         p = strappend(dest, "/dev/ptmx");
636         if (!p)
637                 return log_oom();
638
639         if (symlink("pts/ptmx", p) < 0) {
640                 log_error("Failed to create /dev/ptmx symlink: %m");
641                 return -errno;
642         }
643
644         return 0;
645 }
646
647 static int setup_dev_console(const char *dest, const char *console) {
648         struct stat st;
649         _cleanup_free_ char *to = NULL;
650         int r;
651         _cleanup_umask_ mode_t u;
652
653         assert(dest);
654         assert(console);
655
656         u = umask(0000);
657
658         if (stat(console, &st) < 0) {
659                 log_error("Failed to stat %s: %m", console);
660                 return -errno;
661
662         } else if (!S_ISCHR(st.st_mode)) {
663                 log_error("/dev/console is not a char device");
664                 return -EIO;
665         }
666
667         r = chmod_and_chown(console, 0600, 0, 0);
668         if (r < 0) {
669                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
670                 return r;
671         }
672
673         if (asprintf(&to, "%s/dev/console", dest) < 0)
674                 return log_oom();
675
676         /* We need to bind mount the right tty to /dev/console since
677          * ptys can only exist on pts file systems. To have something
678          * to bind mount things on we create a device node first, that
679          * has the right major/minor (note that the major minor
680          * doesn't actually matter here, since we mount it over
681          * anyway). */
682
683         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
684                 log_error("mknod() for /dev/console failed: %m");
685                 return -errno;
686         }
687
688         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
689                 log_error("Bind mount for /dev/console failed: %m");
690                 return -errno;
691         }
692
693         return 0;
694 }
695
696 static int setup_kmsg(const char *dest, int kmsg_socket) {
697         _cleanup_free_ char *from = NULL, *to = NULL;
698         int r, fd, k;
699         _cleanup_umask_ mode_t u;
700         union {
701                 struct cmsghdr cmsghdr;
702                 uint8_t buf[CMSG_SPACE(sizeof(int))];
703         } control = {};
704         struct msghdr mh = {
705                 .msg_control = &control,
706                 .msg_controllen = sizeof(control),
707         };
708         struct cmsghdr *cmsg;
709
710         assert(dest);
711         assert(kmsg_socket >= 0);
712
713         u = umask(0000);
714
715         /* We create the kmsg FIFO as /dev/kmsg, but immediately
716          * delete it after bind mounting it to /proc/kmsg. While FIFOs
717          * on the reading side behave very similar to /proc/kmsg,
718          * their writing side behaves differently from /dev/kmsg in
719          * that writing blocks when nothing is reading. In order to
720          * avoid any problems with containers deadlocking due to this
721          * we simply make /dev/kmsg unavailable to the container. */
722         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
723             asprintf(&to, "%s/proc/kmsg", dest) < 0)
724                 return log_oom();
725
726         if (mkfifo(from, 0600) < 0) {
727                 log_error("mkfifo() for /dev/kmsg failed: %m");
728                 return -errno;
729         }
730
731         r = chmod_and_chown(from, 0600, 0, 0);
732         if (r < 0) {
733                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
734                 return r;
735         }
736
737         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
738                 log_error("Bind mount for /proc/kmsg failed: %m");
739                 return -errno;
740         }
741
742         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
743         if (fd < 0) {
744                 log_error("Failed to open fifo: %m");
745                 return -errno;
746         }
747
748         cmsg = CMSG_FIRSTHDR(&mh);
749         cmsg->cmsg_level = SOL_SOCKET;
750         cmsg->cmsg_type = SCM_RIGHTS;
751         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
752         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
753
754         mh.msg_controllen = cmsg->cmsg_len;
755
756         /* Store away the fd in the socket, so that it stays open as
757          * long as we run the child */
758         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
759         close_nointr_nofail(fd);
760
761         if (k < 0) {
762                 log_error("Failed to send FIFO fd: %m");
763                 return -errno;
764         }
765
766         /* And now make the FIFO unavailable as /dev/kmsg... */
767         unlink(from);
768         return 0;
769 }
770
771 static int setup_hostname(void) {
772
773         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
774                 return -errno;
775
776         return 0;
777 }
778
779 static int setup_journal(const char *directory) {
780         sd_id128_t machine_id;
781         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
782         char *id;
783         int r;
784
785         if (arg_link_journal == LINK_NO)
786                 return 0;
787
788         p = strappend(directory, "/etc/machine-id");
789         if (!p)
790                 return log_oom();
791
792         r = read_one_line_file(p, &b);
793         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
794                 return 0;
795         else if (r < 0) {
796                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
797                 return r;
798         }
799
800         id = strstrip(b);
801         if (isempty(id) && arg_link_journal == LINK_AUTO)
802                 return 0;
803
804         /* Verify validity */
805         r = sd_id128_from_string(id, &machine_id);
806         if (r < 0) {
807                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
808                 return r;
809         }
810
811         free(p);
812         p = strappend("/var/log/journal/", id);
813         q = strjoin(directory, "/var/log/journal/", id, NULL);
814         if (!p || !q)
815                 return log_oom();
816
817         if (path_is_mount_point(p, false) > 0) {
818                 if (arg_link_journal != LINK_AUTO) {
819                         log_error("%s: already a mount point, refusing to use for journal", p);
820                         return -EEXIST;
821                 }
822
823                 return 0;
824         }
825
826         if (path_is_mount_point(q, false) > 0) {
827                 if (arg_link_journal != LINK_AUTO) {
828                         log_error("%s: already a mount point, refusing to use for journal", q);
829                         return -EEXIST;
830                 }
831
832                 return 0;
833         }
834
835         r = readlink_and_make_absolute(p, &d);
836         if (r >= 0) {
837                 if ((arg_link_journal == LINK_GUEST ||
838                      arg_link_journal == LINK_AUTO) &&
839                     path_equal(d, q)) {
840
841                         r = mkdir_p(q, 0755);
842                         if (r < 0)
843                                 log_warning("failed to create directory %s: %m", q);
844                         return 0;
845                 }
846
847                 if (unlink(p) < 0) {
848                         log_error("Failed to remove symlink %s: %m", p);
849                         return -errno;
850                 }
851         } else if (r == -EINVAL) {
852
853                 if (arg_link_journal == LINK_GUEST &&
854                     rmdir(p) < 0) {
855
856                         if (errno == ENOTDIR) {
857                                 log_error("%s already exists and is neither a symlink nor a directory", p);
858                                 return r;
859                         } else {
860                                 log_error("Failed to remove %s: %m", p);
861                                 return -errno;
862                         }
863                 }
864         } else if (r != -ENOENT) {
865                 log_error("readlink(%s) failed: %m", p);
866                 return r;
867         }
868
869         if (arg_link_journal == LINK_GUEST) {
870
871                 if (symlink(q, p) < 0) {
872                         log_error("Failed to symlink %s to %s: %m", q, p);
873                         return -errno;
874                 }
875
876                 r = mkdir_p(q, 0755);
877                 if (r < 0)
878                         log_warning("failed to create directory %s: %m", q);
879                 return 0;
880         }
881
882         if (arg_link_journal == LINK_HOST) {
883                 r = mkdir_p(p, 0755);
884                 if (r < 0) {
885                         log_error("Failed to create %s: %m", p);
886                         return r;
887                 }
888
889         } else if (access(p, F_OK) < 0)
890                 return 0;
891
892         if (dir_is_empty(q) == 0) {
893                 log_error("%s not empty.", q);
894                 return -ENOTEMPTY;
895         }
896
897         r = mkdir_p(q, 0755);
898         if (r < 0) {
899                 log_error("Failed to create %s: %m", q);
900                 return r;
901         }
902
903         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
904                 log_error("Failed to bind mount journal from host into guest: %m");
905                 return -errno;
906         }
907
908         return 0;
909 }
910
911 static int setup_cgroup(const char *path) {
912         char **c;
913         int r;
914
915         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, path, 1);
916         if (r < 0) {
917                 log_error("Failed to create cgroup: %s", strerror(-r));
918                 return r;
919         }
920
921         STRV_FOREACH(c, arg_controllers) {
922                 r = cg_create_and_attach(*c, path, 1);
923                 if (r < 0)
924                         log_warning("Failed to create cgroup in controller %s: %s", *c, strerror(-r));
925         }
926
927         return 0;
928 }
929
/* Stores information about the container as extended attributes on the
 * file system path of the cgroup "cgroup" in SYSTEMD_CGROUP_CONTROLLER:
 *
 *   trusted.init_pid        "pid", formatted as a decimal string
 *   trusted.machine_id      "uuid" (skipped when uuid is NULL)
 *   trusted.root_directory  "directory"
 *
 * All attributes are set with XATTR_CREATE. A failure to set one attribute
 * is logged as a warning and does not prevent the remaining attributes from
 * being attempted.
 *
 * Returns 0 on success, the negative error of cg_get_path() if the cgroup
 * path cannot be determined, or the negative errno of the first setxattr()
 * call that failed. When built without HAVE_XATTR this only checks its
 * arguments and returns 0. */
static int save_attributes(const char *cgroup, pid_t pid, const char *uuid, const char *directory) {
#ifdef HAVE_XATTR
        _cleanup_free_ char *path = NULL;
        char buf[DECIMAL_STR_MAX(pid_t)];
#endif
        int r = 0;

        assert(cgroup);
        assert(pid >= 0);
        /* Check the parameter that is actually dereferenced below, not the
         * global it usually happens to be a copy of. */
        assert(directory);

#ifdef HAVE_XATTR
        assert_se(snprintf(buf, sizeof(buf), "%lu", (unsigned long) pid) < (int) sizeof(buf));

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, cgroup, NULL, &path);
        if (r < 0) {
                log_error("Failed to get path: %s", strerror(-r));
                return r;
        }

        /* From here on r holds the first setxattr() error, if any. errno is
         * captured before logging so the logger cannot clobber it, and so
         * that callers get a real error code rather than setxattr()'s -1. */
        r = 0;

        if (setxattr(path, "trusted.init_pid", buf, strlen(buf), XATTR_CREATE) < 0) {
                r = -errno;
                log_warning("Failed to set %s attribute on %s: %m", "trusted.init_pid", path);
        }

        if (uuid && setxattr(path, "trusted.machine_id", uuid, strlen(uuid), XATTR_CREATE) < 0) {
                if (r == 0)
                        r = -errno;
                log_warning("Failed to set %s attribute on %s: %m", "trusted.machine_id", path);
        }

        if (setxattr(path, "trusted.root_directory", directory, strlen(directory), XATTR_CREATE) < 0) {
                if (r == 0)
                        r = -errno;
                log_warning("Failed to set %s attribute on %s: %m", "trusted.root_directory", path);
        }
#endif
        return r;
}
970
971 static int drop_capabilities(void) {
972         return capability_bounding_set_drop(~arg_retain, false);
973 }
974
975 static int process_pty(int master, pid_t pid, sigset_t *mask) {
976
977         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
978         size_t in_buffer_full = 0, out_buffer_full = 0;
979         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
980         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
981         int ep = -1, signal_fd = -1, r;
982         bool tried_orderly_shutdown = false;
983
984         assert(master >= 0);
985         assert(pid > 0);
986         assert(mask);
987
988         fd_nonblock(STDIN_FILENO, 1);
989         fd_nonblock(STDOUT_FILENO, 1);
990         fd_nonblock(master, 1);
991
992         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
993         if (signal_fd < 0) {
994                 log_error("signalfd(): %m");
995                 r = -errno;
996                 goto finish;
997         }
998
999         ep = epoll_create1(EPOLL_CLOEXEC);
1000         if (ep < 0) {
1001                 log_error("Failed to create epoll: %m");
1002                 r = -errno;
1003                 goto finish;
1004         }
1005
1006         /* We read from STDIN only if this is actually a TTY,
1007          * otherwise we assume non-interactivity. */
1008         if (isatty(STDIN_FILENO)) {
1009                 zero(stdin_ev);
1010                 stdin_ev.events = EPOLLIN|EPOLLET;
1011                 stdin_ev.data.fd = STDIN_FILENO;
1012
1013                 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
1014                         log_error("Failed to register STDIN in epoll: %m");
1015                         r = -errno;
1016                         goto finish;
1017                 }
1018         }
1019
1020         zero(stdout_ev);
1021         stdout_ev.events = EPOLLOUT|EPOLLET;
1022         stdout_ev.data.fd = STDOUT_FILENO;
1023
1024         zero(master_ev);
1025         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
1026         master_ev.data.fd = master;
1027
1028         zero(signal_ev);
1029         signal_ev.events = EPOLLIN;
1030         signal_ev.data.fd = signal_fd;
1031
1032         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
1033                 if (errno != EPERM) {
1034                         log_error("Failed to register stdout in epoll: %m");
1035                         r = -errno;
1036                         goto finish;
1037                 }
1038                 /* stdout without epoll support. Likely redirected to regular file. */
1039                 stdout_writable = true;
1040         }
1041
1042         if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
1043             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
1044                 log_error("Failed to register fds in epoll: %m");
1045                 r = -errno;
1046                 goto finish;
1047         }
1048
1049         for (;;) {
1050                 struct epoll_event ev[16];
1051                 ssize_t k;
1052                 int i, nfds;
1053
1054                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1055                 if (nfds < 0) {
1056
1057                         if (errno == EINTR || errno == EAGAIN)
1058                                 continue;
1059
1060                         log_error("epoll_wait(): %m");
1061                         r = -errno;
1062                         goto finish;
1063                 }
1064
1065                 assert(nfds >= 1);
1066
1067                 for (i = 0; i < nfds; i++) {
1068                         if (ev[i].data.fd == STDIN_FILENO) {
1069
1070                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1071                                         stdin_readable = true;
1072
1073                         } else if (ev[i].data.fd == STDOUT_FILENO) {
1074
1075                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1076                                         stdout_writable = true;
1077
1078                         } else if (ev[i].data.fd == master) {
1079
1080                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1081                                         master_readable = true;
1082
1083                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1084                                         master_writable = true;
1085
1086                         } else if (ev[i].data.fd == signal_fd) {
1087                                 struct signalfd_siginfo sfsi;
1088                                 ssize_t n;
1089
1090                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
1091                                 if (n != sizeof(sfsi)) {
1092
1093                                         if (n >= 0) {
1094                                                 log_error("Failed to read from signalfd: invalid block size");
1095                                                 r = -EIO;
1096                                                 goto finish;
1097                                         }
1098
1099                                         if (errno != EINTR && errno != EAGAIN) {
1100                                                 log_error("Failed to read from signalfd: %m");
1101                                                 r = -errno;
1102                                                 goto finish;
1103                                         }
1104                                 } else {
1105
1106                                         if (sfsi.ssi_signo == SIGWINCH) {
1107                                                 struct winsize ws;
1108
1109                                                 /* The window size changed, let's forward that. */
1110                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1111                                                         ioctl(master, TIOCSWINSZ, &ws);
1112                                         } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1113
1114                                                 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1115
1116                                                 /* This only works for systemd... */
1117                                                 tried_orderly_shutdown = true;
1118                                                 kill(pid, SIGRTMIN+3);
1119
1120                                         } else {
1121                                                 r = 0;
1122                                                 goto finish;
1123                                         }
1124                                 }
1125                         }
1126                 }
1127
1128                 while ((stdin_readable && in_buffer_full <= 0) ||
1129                        (master_writable && in_buffer_full > 0) ||
1130                        (master_readable && out_buffer_full <= 0) ||
1131                        (stdout_writable && out_buffer_full > 0)) {
1132
1133                         if (stdin_readable && in_buffer_full < LINE_MAX) {
1134
1135                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1136                                 if (k < 0) {
1137
1138                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1139                                                 stdin_readable = false;
1140                                         else {
1141                                                 log_error("read(): %m");
1142                                                 r = -errno;
1143                                                 goto finish;
1144                                         }
1145                                 } else
1146                                         in_buffer_full += (size_t) k;
1147                         }
1148
1149                         if (master_writable && in_buffer_full > 0) {
1150
1151                                 k = write(master, in_buffer, in_buffer_full);
1152                                 if (k < 0) {
1153
1154                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1155                                                 master_writable = false;
1156                                         else {
1157                                                 log_error("write(): %m");
1158                                                 r = -errno;
1159                                                 goto finish;
1160                                         }
1161
1162                                 } else {
1163                                         assert(in_buffer_full >= (size_t) k);
1164                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1165                                         in_buffer_full -= k;
1166                                 }
1167                         }
1168
1169                         if (master_readable && out_buffer_full < LINE_MAX) {
1170
1171                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1172                                 if (k < 0) {
1173
1174                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1175                                                 master_readable = false;
1176                                         else {
1177                                                 log_error("read(): %m");
1178                                                 r = -errno;
1179                                                 goto finish;
1180                                         }
1181                                 }  else
1182                                         out_buffer_full += (size_t) k;
1183                         }
1184
1185                         if (stdout_writable && out_buffer_full > 0) {
1186
1187                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1188                                 if (k < 0) {
1189
1190                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1191                                                 stdout_writable = false;
1192                                         else {
1193                                                 log_error("write(): %m");
1194                                                 r = -errno;
1195                                                 goto finish;
1196                                         }
1197
1198                                 } else {
1199                                         assert(out_buffer_full >= (size_t) k);
1200                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1201                                         out_buffer_full -= k;
1202                                 }
1203                         }
1204                 }
1205         }
1206
1207 finish:
1208         if (ep >= 0)
1209                 close_nointr_nofail(ep);
1210
1211         if (signal_fd >= 0)
1212                 close_nointr_nofail(signal_fd);
1213
1214         return r;
1215 }
1216
1217 int main(int argc, char *argv[]) {
1218         pid_t pid = 0;
1219         int r = EXIT_FAILURE, k;
1220         _cleanup_free_ char *machine_root = NULL, *name = NULL, *escaped = NULL, *newcg = NULL;
1221         _cleanup_close_ int master = -1;
1222         int n_fd_passed;
1223         const char *console = NULL;
1224         struct termios saved_attr, raw_attr;
1225         sigset_t mask;
1226         bool saved_attr_valid = false;
1227         struct winsize ws;
1228         int kmsg_socket_pair[2] = { -1, -1 };
1229         FDSet *fds = NULL;
1230
1231         log_parse_environment();
1232         log_open();
1233
1234         k = parse_argv(argc, argv);
1235         if (k < 0)
1236                 goto finish;
1237         else if (k == 0) {
1238                 r = EXIT_SUCCESS;
1239                 goto finish;
1240         }
1241
1242         if (arg_directory) {
1243                 char *p;
1244
1245                 p = path_make_absolute_cwd(arg_directory);
1246                 free(arg_directory);
1247                 arg_directory = p;
1248         } else
1249                 arg_directory = get_current_dir_name();
1250
1251         if (!arg_directory) {
1252                 log_error("Failed to determine path, please use -D.");
1253                 goto finish;
1254         }
1255
1256         path_kill_slashes(arg_directory);
1257
1258         if (!arg_machine) {
1259                 arg_machine = strdup(path_get_file_name(arg_directory));
1260                 if (!arg_machine) {
1261                         log_oom();
1262                         goto finish;
1263                 }
1264
1265                 hostname_cleanup(arg_machine);
1266                 if (isempty(arg_machine)) {
1267                         log_error("Failed to determine machine name automatically, please use -M.");
1268                         goto finish;
1269                 }
1270         }
1271
1272         if (geteuid() != 0) {
1273                 log_error("Need to be root.");
1274                 goto finish;
1275         }
1276
1277         if (sd_booted() <= 0) {
1278                 log_error("Not running on a systemd system.");
1279                 goto finish;
1280         }
1281
1282         if (path_equal(arg_directory, "/")) {
1283                 log_error("Spawning container on root directory not supported.");
1284                 goto finish;
1285         }
1286
1287         if (path_is_os_tree(arg_directory) <= 0) {
1288                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1289                 goto finish;
1290         }
1291
1292         log_close();
1293         n_fd_passed = sd_listen_fds(false);
1294         if (n_fd_passed > 0) {
1295                 k = fdset_new_listen_fds(&fds, false);
1296                 if (k < 0) {
1297                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1298                         goto finish;
1299                 }
1300         }
1301         fdset_close_others(fds);
1302         log_open();
1303
1304         k = cg_get_machine_path(&machine_root);
1305         if (k < 0) {
1306                 log_error("Failed to determine machine cgroup path: %s", strerror(-k));
1307                 goto finish;
1308         }
1309
1310         name = strappend(arg_machine, ".nspawn");
1311         if (!name) {
1312                 log_oom();
1313                 goto finish;
1314         }
1315
1316         escaped = cg_escape(name);
1317         if (!escaped) {
1318                 log_oom();
1319                 goto finish;
1320         }
1321
1322         newcg = strjoin(machine_root, "/", escaped, NULL);
1323         if (!newcg) {
1324                 log_oom();
1325                 goto finish;
1326         }
1327
1328         k = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1329         if (k <= 0 && k != -ENOENT) {
1330                 log_error("Container already running.");
1331
1332                 free(newcg);
1333                 newcg = NULL;
1334
1335                 goto finish;
1336         }
1337
1338         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1339         if (master < 0) {
1340                 log_error("Failed to acquire pseudo tty: %m");
1341                 goto finish;
1342         }
1343
1344         console = ptsname(master);
1345         if (!console) {
1346                 log_error("Failed to determine tty name: %m");
1347                 goto finish;
1348         }
1349
1350         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1351
1352         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1353                 ioctl(master, TIOCSWINSZ, &ws);
1354
1355         if (unlockpt(master) < 0) {
1356                 log_error("Failed to unlock tty: %m");
1357                 goto finish;
1358         }
1359
1360         if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1361                 saved_attr_valid = true;
1362
1363                 raw_attr = saved_attr;
1364                 cfmakeraw(&raw_attr);
1365                 raw_attr.c_lflag &= ~ECHO;
1366         }
1367
1368         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1369                 log_error("Failed to create kmsg socket pair.");
1370                 goto finish;
1371         }
1372
1373         sd_notify(0, "READY=1");
1374
1375         assert_se(sigemptyset(&mask) == 0);
1376         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1377         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1378
1379         for (;;) {
1380                 siginfo_t status;
1381                 int pipefd[2], pipefd2[2];
1382
1383                 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1384                         log_error("pipe2(): %m");
1385                         goto finish;
1386                 }
1387
1388                 if (pipe2(pipefd2, O_NONBLOCK|O_CLOEXEC) < 0) {
1389                         log_error("pipe2(): %m");
1390                         close_pipe(pipefd);
1391                         goto finish;
1392                 }
1393
1394                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1395                 if (pid < 0) {
1396                         if (errno == EINVAL)
1397                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1398                         else
1399                                 log_error("clone() failed: %m");
1400
1401                         goto finish;
1402                 }
1403
1404                 if (pid == 0) {
1405                         /* child */
1406                         const char *home = NULL;
1407                         uid_t uid = (uid_t) -1;
1408                         gid_t gid = (gid_t) -1;
1409                         unsigned n_env = 2;
1410                         const char *envp[] = {
1411                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1412                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1413                                 NULL, /* TERM */
1414                                 NULL, /* HOME */
1415                                 NULL, /* USER */
1416                                 NULL, /* LOGNAME */
1417                                 NULL, /* container_uuid */
1418                                 NULL, /* LISTEN_FDS */
1419                                 NULL, /* LISTEN_PID */
1420                                 NULL
1421                         };
1422
1423                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1424                         if (envp[n_env])
1425                                 n_env ++;
1426
1427                         /* Wait for the parent process to log our PID */
1428                         close_nointr_nofail(pipefd[1]);
1429                         fd_wait_for_event(pipefd[0], POLLHUP, -1);
1430                         close_nointr_nofail(pipefd[0]);
1431
1432                         close_nointr_nofail(master);
1433                         master = -1;
1434
1435                         if (saved_attr_valid) {
1436                                 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1437                                         log_error("Failed to set terminal attributes: %m");
1438                                         goto child_fail;
1439                                 }
1440                         }
1441
1442                         close_nointr(STDIN_FILENO);
1443                         close_nointr(STDOUT_FILENO);
1444                         close_nointr(STDERR_FILENO);
1445
1446                         close_nointr_nofail(kmsg_socket_pair[0]);
1447                         kmsg_socket_pair[0] = -1;
1448
1449                         reset_all_signal_handlers();
1450
1451                         assert_se(sigemptyset(&mask) == 0);
1452                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1453
1454                         k = open_terminal(console, O_RDWR);
1455                         if (k != STDIN_FILENO) {
1456                                 if (k >= 0) {
1457                                         close_nointr_nofail(k);
1458                                         k = -EINVAL;
1459                                 }
1460
1461                                 log_error("Failed to open console: %s", strerror(-k));
1462                                 goto child_fail;
1463                         }
1464
1465                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1466                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1467                                 log_error("Failed to duplicate console: %m");
1468                                 goto child_fail;
1469                         }
1470
1471                         if (setsid() < 0) {
1472                                 log_error("setsid() failed: %m");
1473                                 goto child_fail;
1474                         }
1475
1476                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1477                                 log_error("PR_SET_PDEATHSIG failed: %m");
1478                                 goto child_fail;
1479                         }
1480
1481                         if (setup_cgroup(newcg) < 0)
1482                                 goto child_fail;
1483
1484                         close_pipe(pipefd2);
1485
1486                         /* Mark everything as slave, so that we still
1487                          * receive mounts from the real root, but don't
1488                          * propagate mounts to the real root. */
1489                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1490                                 log_error("MS_SLAVE|MS_REC failed: %m");
1491                                 goto child_fail;
1492                         }
1493
1494                         /* Turn directory into bind mount */
1495                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1496                                 log_error("Failed to make bind mount.");
1497                                 goto child_fail;
1498                         }
1499
1500                         if (arg_read_only)
1501                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1502                                         log_error("Failed to make read-only.");
1503                                         goto child_fail;
1504                                 }
1505
1506                         if (mount_all(arg_directory) < 0)
1507                                 goto child_fail;
1508
1509                         if (copy_devnodes(arg_directory) < 0)
1510                                 goto child_fail;
1511
1512                         if (setup_ptmx(arg_directory) < 0)
1513                                 goto child_fail;
1514
1515                         dev_setup(arg_directory);
1516
1517                         if (setup_dev_console(arg_directory, console) < 0)
1518                                 goto child_fail;
1519
1520                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1521                                 goto child_fail;
1522
1523                         close_nointr_nofail(kmsg_socket_pair[1]);
1524                         kmsg_socket_pair[1] = -1;
1525
1526                         if (setup_boot_id(arg_directory) < 0)
1527                                 goto child_fail;
1528
1529                         if (setup_timezone(arg_directory) < 0)
1530                                 goto child_fail;
1531
1532                         if (setup_resolv_conf(arg_directory) < 0)
1533                                 goto child_fail;
1534
1535                         if (setup_journal(arg_directory) < 0)
1536                                 goto child_fail;
1537
1538                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1539                                 goto child_fail;
1540
1541                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1542                                 goto child_fail;
1543
1544                         if (chdir(arg_directory) < 0) {
1545                                 log_error("chdir(%s) failed: %m", arg_directory);
1546                                 goto child_fail;
1547                         }
1548
1549                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1550                                 log_error("mount(MS_MOVE) failed: %m");
1551                                 goto child_fail;
1552                         }
1553
1554                         if (chroot(".") < 0) {
1555                                 log_error("chroot() failed: %m");
1556                                 goto child_fail;
1557                         }
1558
1559                         if (chdir("/") < 0) {
1560                                 log_error("chdir() failed: %m");
1561                                 goto child_fail;
1562                         }
1563
1564                         umask(0022);
1565
1566                         loopback_setup();
1567
1568                         if (drop_capabilities() < 0) {
1569                                 log_error("drop_capabilities() failed: %m");
1570                                 goto child_fail;
1571                         }
1572
1573                         if (arg_user) {
1574
1575                                 /* Note that this resolves user names
1576                                  * inside the container, and hence
1577                                  * accesses the NSS modules from the
1578                                  * container and not the host. This is
1579                                  * a bit weird... */
1580
1581                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1582                                         log_error("get_user_creds() failed: %m");
1583                                         goto child_fail;
1584                                 }
1585
1586                                 if (mkdir_parents_label(home, 0775) < 0) {
1587                                         log_error("mkdir_parents_label() failed: %m");
1588                                         goto child_fail;
1589                                 }
1590
1591                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1592                                         log_error("mkdir_safe_label() failed: %m");
1593                                         goto child_fail;
1594                                 }
1595
1596                                 if (initgroups((const char*)arg_user, gid) < 0) {
1597                                         log_error("initgroups() failed: %m");
1598                                         goto child_fail;
1599                                 }
1600
1601                                 if (setresgid(gid, gid, gid) < 0) {
1602                                         log_error("setregid() failed: %m");
1603                                         goto child_fail;
1604                                 }
1605
1606                                 if (setresuid(uid, uid, uid) < 0) {
1607                                         log_error("setreuid() failed: %m");
1608                                         goto child_fail;
1609                                 }
1610                         } else {
1611                                 /* Reset everything fully to 0, just in case */
1612
1613                                 if (setgroups(0, NULL) < 0) {
1614                                         log_error("setgroups() failed: %m");
1615                                         goto child_fail;
1616                                 }
1617
1618                                 if (setresgid(0, 0, 0) < 0) {
1619                                         log_error("setregid() failed: %m");
1620                                         goto child_fail;
1621                                 }
1622
1623                                 if (setresuid(0, 0, 0) < 0) {
1624                                         log_error("setreuid() failed: %m");
1625                                         goto child_fail;
1626                                 }
1627                         }
1628
1629                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1630                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1631                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1632                                 log_oom();
1633                                 goto child_fail;
1634                         }
1635
1636                         if (arg_uuid) {
1637                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1638                                         log_oom();
1639                                         goto child_fail;
1640                                 }
1641                         }
1642
1643                         if (fdset_size(fds) > 0) {
1644                                 k = fdset_cloexec(fds, false);
1645                                 if (k < 0) {
1646                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1647                                         goto child_fail;
1648                                 }
1649
1650                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1651                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) 1) < 0)) {
1652                                         log_oom();
1653                                         goto child_fail;
1654                                 }
1655                         }
1656
1657                         setup_hostname();
1658
1659                         if (arg_boot) {
1660                                 char **a;
1661                                 size_t l;
1662
1663                                 /* Automatically search for the init system */
1664
1665                                 l = 1 + argc - optind;
1666                                 a = newa(char*, l + 1);
1667                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1668
1669                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1670                                 execve(a[0], a, (char**) envp);
1671
1672                                 a[0] = (char*) "/lib/systemd/systemd";
1673                                 execve(a[0], a, (char**) envp);
1674
1675                                 a[0] = (char*) "/sbin/init";
1676                                 execve(a[0], a, (char**) envp);
1677                         } else if (argc > optind)
1678                                 execvpe(argv[optind], argv + optind, (char**) envp);
1679                         else {
1680                                 chdir(home ? home : "/root");
1681                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1682                         }
1683
1684                         log_error("execv() failed: %m");
1685
1686                 child_fail:
1687                         _exit(EXIT_FAILURE);
1688                 }
1689
1690                 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1691                 close_nointr_nofail(pipefd[0]);
1692                 close_nointr_nofail(pipefd[1]);
1693
1694                 /* Wait for the child process to establish cgroup hierarchy */
1695                 close_nointr_nofail(pipefd2[1]);
1696                 fd_wait_for_event(pipefd2[0], POLLHUP, -1);
1697                 close_nointr_nofail(pipefd2[0]);
1698
1699                 save_attributes(newcg, pid, arg_uuid, arg_directory);
1700
1701                 fdset_free(fds);
1702                 fds = NULL;
1703
1704                 if (process_pty(master, pid, &mask) < 0)
1705                         goto finish;
1706
1707                 if (saved_attr_valid)
1708                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1709
1710                 k = wait_for_terminate(pid, &status);
1711                 if (k < 0) {
1712                         r = EXIT_FAILURE;
1713                         break;
1714                 }
1715
1716                 if (status.si_code == CLD_EXITED) {
1717                         if (status.si_status != 0) {
1718                                 log_error("Container failed with error code %i.", status.si_status);
1719                                 r = status.si_status;
1720                                 break;
1721                         }
1722
1723                         log_debug("Container exited successfully.");
1724                         break;
1725                 } else if (status.si_code == CLD_KILLED &&
1726                            status.si_status == SIGINT) {
1727                         log_info("Container has been shut down.");
1728                         r = 0;
1729                         break;
1730                 } else if (status.si_code == CLD_KILLED &&
1731                            status.si_status == SIGHUP) {
1732                         log_info("Container is being rebooted.");
1733                         continue;
1734                 } else if (status.si_code == CLD_KILLED ||
1735                            status.si_code == CLD_DUMPED) {
1736
1737                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1738                         r = EXIT_FAILURE;
1739                         break;
1740                 } else {
1741                         log_error("Container failed due to unknown reason.");
1742                         r = EXIT_FAILURE;
1743                         break;
1744                 }
1745         }
1746
1747 finish:
1748         if (saved_attr_valid)
1749                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1750
1751         close_pipe(kmsg_socket_pair);
1752
1753         if (newcg)
1754                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1755
1756         free(arg_directory);
1757         free(arg_machine);
1758         strv_free(arg_controllers);
1759
1760         fdset_free(fds);
1761
1762         return r;
1763 }