chiark / gitweb /
5a43d5ed127b3d5a33eebbf4c3c791ebf49bc034
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/poll.h>
37 #include <sys/epoll.h>
38 #include <termios.h>
39 #include <sys/signalfd.h>
40 #include <grp.h>
41 #include <linux/fs.h>
42 #include <sys/un.h>
43 #include <sys/socket.h>
44
45 #include <systemd/sd-daemon.h>
46
47 #include "log.h"
48 #include "util.h"
49 #include "mkdir.h"
50 #include "macro.h"
51 #include "audit.h"
52 #include "missing.h"
53 #include "cgroup-util.h"
54 #include "strv.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
57 #include "sd-id128.h"
58 #include "dev-setup.h"
59 #include "fdset.h"
60 #include "build.h"
61 #include "fileio.h"
62
/* Group id handed to the container's devpts instance through its "gid="
 * mount option; a value supplied by the build takes precedence. */
#if !defined(TTY_GID)
#  define TTY_GID 5
#endif
66
/* Selects how setup_journal() connects the journal directory of the
 * container with the one of the host. */
typedef enum LinkJournal {
        LINK_NO = 0,    /* leave the journal directories alone */
        LINK_AUTO,      /* bind mount the host directory if it exists, quietly skip otherwise */
        LINK_HOST,      /* create the host directory and bind mount it into the container */
        LINK_GUEST      /* make the host path a symlink to the container's directory */
} LinkJournal;
73
74 static char *arg_directory = NULL;
75 static char *arg_user = NULL;
76 static char **arg_controllers = NULL;
77 static char *arg_uuid = NULL;
78 static char *arg_machine = NULL;
79 static bool arg_private_network = false;
80 static bool arg_read_only = false;
81 static bool arg_boot = false;
82 static LinkJournal arg_link_journal = LINK_AUTO;
83 static uint64_t arg_retain =
84         (1ULL << CAP_CHOWN) |
85         (1ULL << CAP_DAC_OVERRIDE) |
86         (1ULL << CAP_DAC_READ_SEARCH) |
87         (1ULL << CAP_FOWNER) |
88         (1ULL << CAP_FSETID) |
89         (1ULL << CAP_IPC_OWNER) |
90         (1ULL << CAP_KILL) |
91         (1ULL << CAP_LEASE) |
92         (1ULL << CAP_LINUX_IMMUTABLE) |
93         (1ULL << CAP_NET_BIND_SERVICE) |
94         (1ULL << CAP_NET_BROADCAST) |
95         (1ULL << CAP_NET_RAW) |
96         (1ULL << CAP_SETGID) |
97         (1ULL << CAP_SETFCAP) |
98         (1ULL << CAP_SETPCAP) |
99         (1ULL << CAP_SETUID) |
100         (1ULL << CAP_SYS_ADMIN) |
101         (1ULL << CAP_SYS_CHROOT) |
102         (1ULL << CAP_SYS_NICE) |
103         (1ULL << CAP_SYS_PTRACE) |
104         (1ULL << CAP_SYS_TTY_CONFIG) |
105         (1ULL << CAP_SYS_RESOURCE) |
106         (1ULL << CAP_SYS_BOOT) |
107         (1ULL << CAP_AUDIT_WRITE) |
108         (1ULL << CAP_AUDIT_CONTROL);
109 static char **arg_bind = NULL;
110 static char **arg_bind_ro = NULL;
111
112 static int help(void) {
113
114         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
115                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
116                "  -h --help                Show this help\n"
117                "     --version             Print version string\n"
118                "  -D --directory=NAME      Root directory for the container\n"
119                "  -b --boot                Boot up full system (i.e. invoke init)\n"
120                "  -u --user=USER           Run the command under specified user or uid\n"
121                "  -C --controllers=LIST    Put the container in specified comma-separated\n"
122                "                           cgroup hierarchies\n"
123                "     --uuid=UUID           Set a specific machine UUID for the container\n"
124                "  -M --machine=NAME        Set the machine name for the container\n"
125                "     --private-network     Disable network in container\n"
126                "     --read-only           Mount the root directory read-only\n"
127                "     --capability=CAP      In addition to the default, retain specified\n"
128                "                           capability\n"
129                "     --link-journal=MODE   Link up guest journal, one of no, auto, guest, host\n"
130                "  -j                       Equivalent to --link-journal=host\n"
131                "     --bind=PATH[:PATH]    Bind mount a file or directory from the host into\n"
132                "                           the container\n"
133                "     --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
134                program_invocation_short_name);
135
136         return 0;
137 }
138
139 static int parse_argv(int argc, char *argv[]) {
140
141         enum {
142                 ARG_VERSION = 0x100,
143                 ARG_PRIVATE_NETWORK,
144                 ARG_UUID,
145                 ARG_READ_ONLY,
146                 ARG_CAPABILITY,
147                 ARG_LINK_JOURNAL,
148                 ARG_BIND,
149                 ARG_BIND_RO
150         };
151
152         static const struct option options[] = {
153                 { "help",            no_argument,       NULL, 'h'                 },
154                 { "version",         no_argument,       NULL, ARG_VERSION         },
155                 { "directory",       required_argument, NULL, 'D'                 },
156                 { "user",            required_argument, NULL, 'u'                 },
157                 { "controllers",     required_argument, NULL, 'C'                 },
158                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
159                 { "boot",            no_argument,       NULL, 'b'                 },
160                 { "uuid",            required_argument, NULL, ARG_UUID            },
161                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
162                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
163                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
164                 { "bind",            required_argument, NULL, ARG_BIND            },
165                 { "bind-ro",         required_argument, NULL, ARG_BIND_RO         },
166                 { "machine",         required_argument, NULL, 'M'                 },
167                 { NULL,              0,                 NULL, 0                   }
168         };
169
170         int c;
171
172         assert(argc >= 0);
173         assert(argv);
174
175         while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
176
177                 switch (c) {
178
179                 case 'h':
180                         help();
181                         return 0;
182
183                 case ARG_VERSION:
184                         puts(PACKAGE_STRING);
185                         puts(SYSTEMD_FEATURES);
186                         return 0;
187
188                 case 'D':
189                         free(arg_directory);
190                         arg_directory = canonicalize_file_name(optarg);
191                         if (!arg_directory) {
192                                 log_error("Failed to canonicalize root directory.");
193                                 return -ENOMEM;
194                         }
195
196                         break;
197
198                 case 'u':
199                         free(arg_user);
200                         arg_user = strdup(optarg);
201                         if (!arg_user)
202                                 return log_oom();
203
204                         break;
205
206                 case 'C':
207                         strv_free(arg_controllers);
208                         arg_controllers = strv_split(optarg, ",");
209                         if (!arg_controllers)
210                                 return log_oom();
211
212                         cg_shorten_controllers(arg_controllers);
213                         break;
214
215                 case ARG_PRIVATE_NETWORK:
216                         arg_private_network = true;
217                         break;
218
219                 case 'b':
220                         arg_boot = true;
221                         break;
222
223                 case ARG_UUID:
224                         arg_uuid = optarg;
225                         break;
226
227                 case 'M':
228                         if (!hostname_is_valid(optarg)) {
229                                 log_error("Invalid machine name: %s", optarg);
230                                 return -EINVAL;
231                         }
232
233                         free(arg_machine);
234                         arg_machine = strdup(optarg);
235                         if (!arg_machine)
236                                 return log_oom();
237
238                         break;
239
240                 case ARG_READ_ONLY:
241                         arg_read_only = true;
242                         break;
243
244                 case ARG_CAPABILITY: {
245                         char *state, *word;
246                         size_t length;
247
248                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
249                                 cap_value_t cap;
250                                 char *t;
251
252                                 t = strndup(word, length);
253                                 if (!t)
254                                         return log_oom();
255
256                                 if (cap_from_name(t, &cap) < 0) {
257                                         log_error("Failed to parse capability %s.", t);
258                                         free(t);
259                                         return -EINVAL;
260                                 }
261
262                                 free(t);
263                                 arg_retain |= 1ULL << (uint64_t) cap;
264                         }
265
266                         break;
267                 }
268
269                 case 'j':
270                         arg_link_journal = LINK_GUEST;
271                         break;
272
273                 case ARG_LINK_JOURNAL:
274                         if (streq(optarg, "auto"))
275                                 arg_link_journal = LINK_AUTO;
276                         else if (streq(optarg, "no"))
277                                 arg_link_journal = LINK_NO;
278                         else if (streq(optarg, "guest"))
279                                 arg_link_journal = LINK_GUEST;
280                         else if (streq(optarg, "host"))
281                                 arg_link_journal = LINK_HOST;
282                         else {
283                                 log_error("Failed to parse link journal mode %s", optarg);
284                                 return -EINVAL;
285                         }
286
287                         break;
288
289                 case ARG_BIND:
290                 case ARG_BIND_RO: {
291                         _cleanup_free_ char *a = NULL, *b = NULL;
292                         char *e;
293                         char ***x;
294                         int r;
295
296                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
297
298                         e = strchr(optarg, ':');
299                         if (e) {
300                                 a = strndup(optarg, e - optarg);
301                                 b = strdup(e + 1);
302                         } else {
303                                 a = strdup(optarg);
304                                 b = strdup(optarg);
305                         }
306
307                         if (!a || !b)
308                                 return log_oom();
309
310                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
311                                 log_error("Invalid bind mount specification: %s", optarg);
312                                 return -EINVAL;
313                         }
314
315                         r = strv_extend(x, a);
316                         if (r < 0)
317                                 return r;
318
319                         r = strv_extend(x, b);
320                         if (r < 0)
321                                 return r;
322
323                         break;
324                 }
325
326                 case '?':
327                         return -EINVAL;
328
329                 default:
330                         log_error("Unknown option code %c", c);
331                         return -EINVAL;
332                 }
333         }
334
335         return 1;
336 }
337
338 static int mount_all(const char *dest) {
339
340         typedef struct MountPoint {
341                 const char *what;
342                 const char *where;
343                 const char *type;
344                 const char *options;
345                 unsigned long flags;
346                 bool fatal;
347         } MountPoint;
348
349         static const MountPoint mount_table[] = {
350                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
351                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
352                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
353                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
354                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
355                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
356                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
357                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
358 #ifdef HAVE_SELINUX
359                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
360                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
361 #endif
362         };
363
364         unsigned k;
365         int r = 0;
366
367         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
368                 _cleanup_free_ char *where = NULL;
369                 int t;
370
371                 where = strjoin(dest, "/", mount_table[k].where, NULL);
372                 if (!where)
373                         return log_oom();
374
375                 t = path_is_mount_point(where, true);
376                 if (t < 0) {
377                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
378
379                         if (r == 0)
380                                 r = t;
381
382                         continue;
383                 }
384
385                 /* Skip this entry if it is not a remount. */
386                 if (mount_table[k].what && t > 0)
387                         continue;
388
389                 mkdir_p(where, 0755);
390
391                 if (mount(mount_table[k].what,
392                           where,
393                           mount_table[k].type,
394                           mount_table[k].flags,
395                           mount_table[k].options) < 0 &&
396                     mount_table[k].fatal) {
397
398                         log_error("mount(%s) failed: %m", where);
399
400                         if (r == 0)
401                                 r = -errno;
402                 }
403         }
404
405         return r;
406 }
407
408 static int mount_binds(const char *dest, char **l, unsigned long flags) {
409         char **x, **y;
410
411         STRV_FOREACH_PAIR(x, y, l) {
412                 _cleanup_free_ char *where = NULL;
413
414                 where = strjoin(dest, "/", *y, NULL);
415                 if (!where)
416                         return log_oom();
417
418                 mkdir_p_label(where, 0755);
419
420                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
421                         log_error("mount(%s) failed: %m", where);
422                         return -errno;
423                 }
424
425                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
426                         log_error("mount(%s) failed: %m", where);
427                         return -errno;
428                 }
429         }
430
431         return 0;
432 }
433
434 static int setup_timezone(const char *dest) {
435         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
436         char *z, *y;
437         int r;
438
439         assert(dest);
440
441         /* Fix the timezone, if possible */
442         r = readlink_malloc("/etc/localtime", &p);
443         if (r < 0) {
444                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
445                 return 0;
446         }
447
448         z = path_startswith(p, "../usr/share/zoneinfo/");
449         if (!z)
450                 z = path_startswith(p, "/usr/share/zoneinfo/");
451         if (!z) {
452                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
453                 return 0;
454         }
455
456         where = strappend(dest, "/etc/localtime");
457         if (!where)
458                 return log_oom();
459
460         r = readlink_malloc(where, &q);
461         if (r >= 0) {
462                 y = path_startswith(q, "../usr/share/zoneinfo/");
463                 if (!y)
464                         y = path_startswith(q, "/usr/share/zoneinfo/");
465
466
467                 /* Already pointing to the right place? Then do nothing .. */
468                 if (y && streq(y, z))
469                         return 0;
470         }
471
472         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
473         if (!check)
474                 return log_oom();
475
476         if (access(check, F_OK) < 0) {
477                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
478                 return 0;
479         }
480
481         what = strappend("../usr/share/zoneinfo/", z);
482         if (!what)
483                 return log_oom();
484
485         unlink(where);
486         if (symlink(what, where) < 0) {
487                 log_error("Failed to correct timezone of container: %m");
488                 return 0;
489         }
490
491         return 0;
492 }
493
494 static int setup_resolv_conf(const char *dest) {
495         char _cleanup_free_ *where = NULL;
496         _cleanup_close_ int fd = -1;
497
498         assert(dest);
499
500         if (arg_private_network)
501                 return 0;
502
503         /* Fix resolv.conf, if possible */
504         where = strappend(dest, "/etc/resolv.conf");
505         if (!where)
506                 return log_oom();
507
508         fd = open(where, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0644);
509
510         /* We don't really care for the results of this really. If it
511          * fails, it fails, but meh... */
512         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) < 0)
513                 log_warning("Failed to bind mount /etc/resolv.conf: %m");
514         else
515                 if (mount("/etc/resolv.conf", where, "bind",
516                           MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
517                         log_error("Failed to remount /etc/resolv.conf readonly: %m");
518                         return -errno;
519                 }
520
521         return 0;
522 }
523
524 static int setup_boot_id(const char *dest) {
525         _cleanup_free_ char *from = NULL, *to = NULL;
526         sd_id128_t rnd;
527         char as_uuid[37];
528         int r;
529
530         assert(dest);
531
532         /* Generate a new randomized boot ID, so that each boot-up of
533          * the container gets a new one */
534
535         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
536         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
537         if (!from || !to)
538                 return log_oom();
539
540         r = sd_id128_randomize(&rnd);
541         if (r < 0) {
542                 log_error("Failed to generate random boot id: %s", strerror(-r));
543                 return r;
544         }
545
546         snprintf(as_uuid, sizeof(as_uuid),
547                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
548                  SD_ID128_FORMAT_VAL(rnd));
549         char_array_0(as_uuid);
550
551         r = write_string_file(from, as_uuid);
552         if (r < 0) {
553                 log_error("Failed to write boot id: %s", strerror(-r));
554                 return r;
555         }
556
557         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
558                 log_error("Failed to bind mount boot id: %m");
559                 r = -errno;
560         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
561                 log_warning("Failed to make boot id read-only: %m");
562
563         unlink(from);
564         return r;
565 }
566
567 static int copy_devnodes(const char *dest) {
568
569         static const char devnodes[] =
570                 "null\0"
571                 "zero\0"
572                 "full\0"
573                 "random\0"
574                 "urandom\0"
575                 "tty\0";
576
577         const char *d;
578         int r = 0;
579         _cleanup_umask_ mode_t u;
580
581         assert(dest);
582
583         u = umask(0000);
584
585         NULSTR_FOREACH(d, devnodes) {
586                 struct stat st;
587                 _cleanup_free_ char *from = NULL, *to = NULL;
588
589                 asprintf(&from, "/dev/%s", d);
590                 asprintf(&to, "%s/dev/%s", dest, d);
591
592                 if (!from || !to) {
593                         log_oom();
594
595                         if (r == 0)
596                                 r = -ENOMEM;
597
598                         break;
599                 }
600
601                 if (stat(from, &st) < 0) {
602
603                         if (errno != ENOENT) {
604                                 log_error("Failed to stat %s: %m", from);
605                                 if (r == 0)
606                                         r = -errno;
607                         }
608
609                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
610
611                         log_error("%s is not a char or block device, cannot copy", from);
612                         if (r == 0)
613                                 r = -EIO;
614
615                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
616
617                         log_error("mknod(%s) failed: %m", dest);
618                         if (r == 0)
619                                 r = -errno;
620                 }
621         }
622
623         return r;
624 }
625
626 static int setup_ptmx(const char *dest) {
627         _cleanup_free_ char *p = NULL;
628
629         p = strappend(dest, "/dev/ptmx");
630         if (!p)
631                 return log_oom();
632
633         if (symlink("pts/ptmx", p) < 0) {
634                 log_error("Failed to create /dev/ptmx symlink: %m");
635                 return -errno;
636         }
637
638         return 0;
639 }
640
641 static int setup_dev_console(const char *dest, const char *console) {
642         struct stat st;
643         _cleanup_free_ char *to = NULL;
644         int r;
645         _cleanup_umask_ mode_t u;
646
647         assert(dest);
648         assert(console);
649
650         u = umask(0000);
651
652         if (stat(console, &st) < 0) {
653                 log_error("Failed to stat %s: %m", console);
654                 return -errno;
655
656         } else if (!S_ISCHR(st.st_mode)) {
657                 log_error("/dev/console is not a char device");
658                 return -EIO;
659         }
660
661         r = chmod_and_chown(console, 0600, 0, 0);
662         if (r < 0) {
663                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
664                 return r;
665         }
666
667         if (asprintf(&to, "%s/dev/console", dest) < 0)
668                 return log_oom();
669
670         /* We need to bind mount the right tty to /dev/console since
671          * ptys can only exist on pts file systems. To have something
672          * to bind mount things on we create a device node first, that
673          * has the right major/minor (note that the major minor
674          * doesn't actually matter here, since we mount it over
675          * anyway). */
676
677         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
678                 log_error("mknod() for /dev/console failed: %m");
679                 return -errno;
680         }
681
682         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
683                 log_error("Bind mount for /dev/console failed: %m");
684                 return -errno;
685         }
686
687         return 0;
688 }
689
690 static int setup_kmsg(const char *dest, int kmsg_socket) {
691         _cleanup_free_ char *from = NULL, *to = NULL;
692         int r, fd, k;
693         _cleanup_umask_ mode_t u;
694         union {
695                 struct cmsghdr cmsghdr;
696                 uint8_t buf[CMSG_SPACE(sizeof(int))];
697         } control = {};
698         struct msghdr mh = {
699                 .msg_control = &control,
700                 .msg_controllen = sizeof(control),
701         };
702         struct cmsghdr *cmsg;
703
704         assert(dest);
705         assert(kmsg_socket >= 0);
706
707         u = umask(0000);
708
709         /* We create the kmsg FIFO as /dev/kmsg, but immediately
710          * delete it after bind mounting it to /proc/kmsg. While FIFOs
711          * on the reading side behave very similar to /proc/kmsg,
712          * their writing side behaves differently from /dev/kmsg in
713          * that writing blocks when nothing is reading. In order to
714          * avoid any problems with containers deadlocking due to this
715          * we simply make /dev/kmsg unavailable to the container. */
716         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
717             asprintf(&to, "%s/proc/kmsg", dest) < 0)
718                 return log_oom();
719
720         if (mkfifo(from, 0600) < 0) {
721                 log_error("mkfifo() for /dev/kmsg failed: %m");
722                 return -errno;
723         }
724
725         r = chmod_and_chown(from, 0600, 0, 0);
726         if (r < 0) {
727                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
728                 return r;
729         }
730
731         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
732                 log_error("Bind mount for /proc/kmsg failed: %m");
733                 return -errno;
734         }
735
736         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
737         if (fd < 0) {
738                 log_error("Failed to open fifo: %m");
739                 return -errno;
740         }
741
742         cmsg = CMSG_FIRSTHDR(&mh);
743         cmsg->cmsg_level = SOL_SOCKET;
744         cmsg->cmsg_type = SCM_RIGHTS;
745         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
746         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
747
748         mh.msg_controllen = cmsg->cmsg_len;
749
750         /* Store away the fd in the socket, so that it stays open as
751          * long as we run the child */
752         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
753         close_nointr_nofail(fd);
754
755         if (k < 0) {
756                 log_error("Failed to send FIFO fd: %m");
757                 return -errno;
758         }
759
760         /* And now make the FIFO unavailable as /dev/kmsg... */
761         unlink(from);
762         return 0;
763 }
764
765 static int setup_hostname(void) {
766
767         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
768                 return -errno;
769
770         return 0;
771 }
772
773 static int setup_journal(const char *directory) {
774         sd_id128_t machine_id;
775         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
776         char *id;
777         int r;
778
779         if (arg_link_journal == LINK_NO)
780                 return 0;
781
782         p = strappend(directory, "/etc/machine-id");
783         if (!p)
784                 return log_oom();
785
786         r = read_one_line_file(p, &b);
787         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
788                 return 0;
789         else if (r < 0) {
790                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
791                 return r;
792         }
793
794         id = strstrip(b);
795         if (isempty(id) && arg_link_journal == LINK_AUTO)
796                 return 0;
797
798         /* Verify validity */
799         r = sd_id128_from_string(id, &machine_id);
800         if (r < 0) {
801                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
802                 return r;
803         }
804
805         free(p);
806         p = strappend("/var/log/journal/", id);
807         q = strjoin(directory, "/var/log/journal/", id, NULL);
808         if (!p || !q)
809                 return log_oom();
810
811         if (path_is_mount_point(p, false) > 0) {
812                 if (arg_link_journal != LINK_AUTO) {
813                         log_error("%s: already a mount point, refusing to use for journal", p);
814                         return -EEXIST;
815                 }
816
817                 return 0;
818         }
819
820         if (path_is_mount_point(q, false) > 0) {
821                 if (arg_link_journal != LINK_AUTO) {
822                         log_error("%s: already a mount point, refusing to use for journal", q);
823                         return -EEXIST;
824                 }
825
826                 return 0;
827         }
828
829         r = readlink_and_make_absolute(p, &d);
830         if (r >= 0) {
831                 if ((arg_link_journal == LINK_GUEST ||
832                      arg_link_journal == LINK_AUTO) &&
833                     path_equal(d, q)) {
834
835                         r = mkdir_p(q, 0755);
836                         if (r < 0)
837                                 log_warning("failed to create directory %s: %m", q);
838                         return 0;
839                 }
840
841                 if (unlink(p) < 0) {
842                         log_error("Failed to remove symlink %s: %m", p);
843                         return -errno;
844                 }
845         } else if (r == -EINVAL) {
846
847                 if (arg_link_journal == LINK_GUEST &&
848                     rmdir(p) < 0) {
849
850                         if (errno == ENOTDIR) {
851                                 log_error("%s already exists and is neither a symlink nor a directory", p);
852                                 return r;
853                         } else {
854                                 log_error("Failed to remove %s: %m", p);
855                                 return -errno;
856                         }
857                 }
858         } else if (r != -ENOENT) {
859                 log_error("readlink(%s) failed: %m", p);
860                 return r;
861         }
862
863         if (arg_link_journal == LINK_GUEST) {
864
865                 if (symlink(q, p) < 0) {
866                         log_error("Failed to symlink %s to %s: %m", q, p);
867                         return -errno;
868                 }
869
870                 r = mkdir_p(q, 0755);
871                 if (r < 0)
872                         log_warning("failed to create directory %s: %m", q);
873                 return 0;
874         }
875
876         if (arg_link_journal == LINK_HOST) {
877                 r = mkdir_p(p, 0755);
878                 if (r < 0) {
879                         log_error("Failed to create %s: %m", p);
880                         return r;
881                 }
882
883         } else if (access(p, F_OK) < 0)
884                 return 0;
885
886         if (dir_is_empty(q) == 0) {
887                 log_error("%s not empty.", q);
888                 return -ENOTEMPTY;
889         }
890
891         r = mkdir_p(q, 0755);
892         if (r < 0) {
893                 log_error("Failed to create %s: %m", q);
894                 return r;
895         }
896
897         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
898                 log_error("Failed to bind mount journal from host into guest: %m");
899                 return -errno;
900         }
901
902         return 0;
903 }
904
905 static int setup_cgroup(const char *path) {
906         char **c;
907         int r;
908
909         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, path, 1);
910         if (r < 0) {
911                 log_error("Failed to create cgroup: %s", strerror(-r));
912                 return r;
913         }
914
915         STRV_FOREACH(c, arg_controllers) {
916                 r = cg_create_and_attach(*c, path, 1);
917                 if (r < 0)
918                         log_warning("Failed to create cgroup in controller %s: %s", *c, strerror(-r));
919         }
920
921         return 0;
922 }
923
924 static int drop_capabilities(void) {
925         return capability_bounding_set_drop(~arg_retain, false);
926 }
927
928 static int process_pty(int master, pid_t pid, sigset_t *mask) {
929
930         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
931         size_t in_buffer_full = 0, out_buffer_full = 0;
932         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
933         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
934         int ep = -1, signal_fd = -1, r;
935         bool tried_orderly_shutdown = false;
936
937         assert(master >= 0);
938         assert(pid > 0);
939         assert(mask);
940
941         fd_nonblock(STDIN_FILENO, 1);
942         fd_nonblock(STDOUT_FILENO, 1);
943         fd_nonblock(master, 1);
944
945         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
946         if (signal_fd < 0) {
947                 log_error("signalfd(): %m");
948                 r = -errno;
949                 goto finish;
950         }
951
952         ep = epoll_create1(EPOLL_CLOEXEC);
953         if (ep < 0) {
954                 log_error("Failed to create epoll: %m");
955                 r = -errno;
956                 goto finish;
957         }
958
959         /* We read from STDIN only if this is actually a TTY,
960          * otherwise we assume non-interactivity. */
961         if (isatty(STDIN_FILENO)) {
962                 zero(stdin_ev);
963                 stdin_ev.events = EPOLLIN|EPOLLET;
964                 stdin_ev.data.fd = STDIN_FILENO;
965
966                 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
967                         log_error("Failed to register STDIN in epoll: %m");
968                         r = -errno;
969                         goto finish;
970                 }
971         }
972
973         zero(stdout_ev);
974         stdout_ev.events = EPOLLOUT|EPOLLET;
975         stdout_ev.data.fd = STDOUT_FILENO;
976
977         zero(master_ev);
978         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
979         master_ev.data.fd = master;
980
981         zero(signal_ev);
982         signal_ev.events = EPOLLIN;
983         signal_ev.data.fd = signal_fd;
984
985         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
986                 if (errno != EPERM) {
987                         log_error("Failed to register stdout in epoll: %m");
988                         r = -errno;
989                         goto finish;
990                 }
991                 /* stdout without epoll support. Likely redirected to regular file. */
992                 stdout_writable = true;
993         }
994
995         if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
996             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
997                 log_error("Failed to register fds in epoll: %m");
998                 r = -errno;
999                 goto finish;
1000         }
1001
1002         for (;;) {
1003                 struct epoll_event ev[16];
1004                 ssize_t k;
1005                 int i, nfds;
1006
1007                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1008                 if (nfds < 0) {
1009
1010                         if (errno == EINTR || errno == EAGAIN)
1011                                 continue;
1012
1013                         log_error("epoll_wait(): %m");
1014                         r = -errno;
1015                         goto finish;
1016                 }
1017
1018                 assert(nfds >= 1);
1019
1020                 for (i = 0; i < nfds; i++) {
1021                         if (ev[i].data.fd == STDIN_FILENO) {
1022
1023                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1024                                         stdin_readable = true;
1025
1026                         } else if (ev[i].data.fd == STDOUT_FILENO) {
1027
1028                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1029                                         stdout_writable = true;
1030
1031                         } else if (ev[i].data.fd == master) {
1032
1033                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1034                                         master_readable = true;
1035
1036                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1037                                         master_writable = true;
1038
1039                         } else if (ev[i].data.fd == signal_fd) {
1040                                 struct signalfd_siginfo sfsi;
1041                                 ssize_t n;
1042
1043                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
1044                                 if (n != sizeof(sfsi)) {
1045
1046                                         if (n >= 0) {
1047                                                 log_error("Failed to read from signalfd: invalid block size");
1048                                                 r = -EIO;
1049                                                 goto finish;
1050                                         }
1051
1052                                         if (errno != EINTR && errno != EAGAIN) {
1053                                                 log_error("Failed to read from signalfd: %m");
1054                                                 r = -errno;
1055                                                 goto finish;
1056                                         }
1057                                 } else {
1058
1059                                         if (sfsi.ssi_signo == SIGWINCH) {
1060                                                 struct winsize ws;
1061
1062                                                 /* The window size changed, let's forward that. */
1063                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1064                                                         ioctl(master, TIOCSWINSZ, &ws);
1065                                         } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1066
1067                                                 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1068
1069                                                 /* This only works for systemd... */
1070                                                 tried_orderly_shutdown = true;
1071                                                 kill(pid, SIGRTMIN+3);
1072
1073                                         } else {
1074                                                 r = 0;
1075                                                 goto finish;
1076                                         }
1077                                 }
1078                         }
1079                 }
1080
1081                 while ((stdin_readable && in_buffer_full <= 0) ||
1082                        (master_writable && in_buffer_full > 0) ||
1083                        (master_readable && out_buffer_full <= 0) ||
1084                        (stdout_writable && out_buffer_full > 0)) {
1085
1086                         if (stdin_readable && in_buffer_full < LINE_MAX) {
1087
1088                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1089                                 if (k < 0) {
1090
1091                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1092                                                 stdin_readable = false;
1093                                         else {
1094                                                 log_error("read(): %m");
1095                                                 r = -errno;
1096                                                 goto finish;
1097                                         }
1098                                 } else
1099                                         in_buffer_full += (size_t) k;
1100                         }
1101
1102                         if (master_writable && in_buffer_full > 0) {
1103
1104                                 k = write(master, in_buffer, in_buffer_full);
1105                                 if (k < 0) {
1106
1107                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1108                                                 master_writable = false;
1109                                         else {
1110                                                 log_error("write(): %m");
1111                                                 r = -errno;
1112                                                 goto finish;
1113                                         }
1114
1115                                 } else {
1116                                         assert(in_buffer_full >= (size_t) k);
1117                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1118                                         in_buffer_full -= k;
1119                                 }
1120                         }
1121
1122                         if (master_readable && out_buffer_full < LINE_MAX) {
1123
1124                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1125                                 if (k < 0) {
1126
1127                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1128                                                 master_readable = false;
1129                                         else {
1130                                                 log_error("read(): %m");
1131                                                 r = -errno;
1132                                                 goto finish;
1133                                         }
1134                                 }  else
1135                                         out_buffer_full += (size_t) k;
1136                         }
1137
1138                         if (stdout_writable && out_buffer_full > 0) {
1139
1140                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1141                                 if (k < 0) {
1142
1143                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1144                                                 stdout_writable = false;
1145                                         else {
1146                                                 log_error("write(): %m");
1147                                                 r = -errno;
1148                                                 goto finish;
1149                                         }
1150
1151                                 } else {
1152                                         assert(out_buffer_full >= (size_t) k);
1153                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1154                                         out_buffer_full -= k;
1155                                 }
1156                         }
1157                 }
1158         }
1159
1160 finish:
1161         if (ep >= 0)
1162                 close_nointr_nofail(ep);
1163
1164         if (signal_fd >= 0)
1165                 close_nointr_nofail(signal_fd);
1166
1167         return r;
1168 }
1169
1170 int main(int argc, char *argv[]) {
1171         pid_t pid = 0;
1172         int r = EXIT_FAILURE, k;
1173         _cleanup_free_ char *machine_root = NULL, *newcg = NULL;
1174         _cleanup_close_ int master = -1;
1175         int n_fd_passed;
1176         const char *console = NULL;
1177         struct termios saved_attr, raw_attr;
1178         sigset_t mask;
1179         bool saved_attr_valid = false;
1180         struct winsize ws;
1181         int kmsg_socket_pair[2] = { -1, -1 };
1182         FDSet *fds = NULL;
1183
1184         log_parse_environment();
1185         log_open();
1186
1187         r = parse_argv(argc, argv);
1188         if (r <= 0)
1189                 goto finish;
1190
1191         if (arg_directory) {
1192                 char *p;
1193
1194                 p = path_make_absolute_cwd(arg_directory);
1195                 free(arg_directory);
1196                 arg_directory = p;
1197         } else
1198                 arg_directory = get_current_dir_name();
1199
1200         if (!arg_directory) {
1201                 log_error("Failed to determine path");
1202                 goto finish;
1203         }
1204
1205         path_kill_slashes(arg_directory);
1206
1207         if (!arg_machine) {
1208                 arg_machine = strdup(path_get_file_name(arg_directory));
1209                 if (!arg_machine) {
1210                         log_oom();
1211                         goto finish;
1212                 }
1213
1214                 hostname_cleanup(arg_machine);
1215                 if (isempty(arg_machine)) {
1216                         log_error("Failed to determine machine name automatically, please use -M.");
1217                         goto finish;
1218                 }
1219         }
1220
1221         if (geteuid() != 0) {
1222                 log_error("Need to be root.");
1223                 goto finish;
1224         }
1225
1226         if (sd_booted() <= 0) {
1227                 log_error("Not running on a systemd system.");
1228                 goto finish;
1229         }
1230
1231         if (path_equal(arg_directory, "/")) {
1232                 log_error("Spawning container on root directory not supported.");
1233                 goto finish;
1234         }
1235
1236         if (path_is_os_tree(arg_directory) <= 0) {
1237                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1238                 goto finish;
1239         }
1240
1241         log_close();
1242         n_fd_passed = sd_listen_fds(false);
1243         if (n_fd_passed > 0) {
1244                 k = fdset_new_listen_fds(&fds, false);
1245                 if (k < 0) {
1246                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1247                         goto finish;
1248                 }
1249         }
1250         fdset_close_others(fds);
1251         log_open();
1252
1253         k = cg_get_machine_path(&machine_root);
1254         if (k < 0) {
1255                 log_error("Failed to determine machine cgroup path: %s", strerror(-k));
1256                 goto finish;
1257         }
1258
1259         newcg = strjoin(machine_root, "/", arg_machine, NULL);
1260         if (!newcg) {
1261                 log_error("Failed to allocate cgroup path.");
1262                 goto finish;
1263         }
1264
1265         r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, newcg, false);
1266         if (r <= 0 && r != -ENOENT) {
1267                 log_error("Container already running.");
1268
1269                 free(newcg);
1270                 newcg = NULL;
1271
1272                 goto finish;
1273         }
1274
1275         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1276         if (master < 0) {
1277                 log_error("Failed to acquire pseudo tty: %m");
1278                 goto finish;
1279         }
1280
1281         console = ptsname(master);
1282         if (!console) {
1283                 log_error("Failed to determine tty name: %m");
1284                 goto finish;
1285         }
1286
1287         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1288
1289         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1290                 ioctl(master, TIOCSWINSZ, &ws);
1291
1292         if (unlockpt(master) < 0) {
1293                 log_error("Failed to unlock tty: %m");
1294                 goto finish;
1295         }
1296
1297         if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1298                 saved_attr_valid = true;
1299
1300                 raw_attr = saved_attr;
1301                 cfmakeraw(&raw_attr);
1302                 raw_attr.c_lflag &= ~ECHO;
1303         }
1304
1305         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1306                 log_error("Failed to create kmsg socket pair.");
1307                 goto finish;
1308         }
1309
1310         assert_se(sigemptyset(&mask) == 0);
1311         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1312         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1313
1314         for (;;) {
1315                 siginfo_t status;
1316                 int pipefd[2];
1317
1318                 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1319                         log_error("pipe2(): %m");
1320                         goto finish;
1321                 }
1322
1323                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1324                 if (pid < 0) {
1325                         if (errno == EINVAL)
1326                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1327                         else
1328                                 log_error("clone() failed: %m");
1329
1330                         goto finish;
1331                 }
1332
1333                 if (pid == 0) {
1334                         /* child */
1335                         const char *home = NULL;
1336                         uid_t uid = (uid_t) -1;
1337                         gid_t gid = (gid_t) -1;
1338                         unsigned n_env = 2;
1339                         const char *envp[] = {
1340                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1341                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1342                                 NULL, /* TERM */
1343                                 NULL, /* HOME */
1344                                 NULL, /* USER */
1345                                 NULL, /* LOGNAME */
1346                                 NULL, /* container_uuid */
1347                                 NULL, /* LISTEN_FDS */
1348                                 NULL, /* LISTEN_PID */
1349                                 NULL
1350                         };
1351
1352                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1353                         if (envp[n_env])
1354                                 n_env ++;
1355
1356                         close_nointr_nofail(pipefd[1]);
1357                         fd_wait_for_event(pipefd[0], POLLHUP, -1);
1358                         close_nointr_nofail(pipefd[0]);
1359
1360                         close_nointr_nofail(master);
1361                         master = -1;
1362
1363                         if (saved_attr_valid) {
1364                                 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1365                                         log_error("Failed to set terminal attributes: %m");
1366                                         goto child_fail;
1367                                 }
1368                         }
1369
1370                         close_nointr(STDIN_FILENO);
1371                         close_nointr(STDOUT_FILENO);
1372                         close_nointr(STDERR_FILENO);
1373
1374                         close_nointr_nofail(kmsg_socket_pair[0]);
1375                         kmsg_socket_pair[0] = -1;
1376
1377                         reset_all_signal_handlers();
1378
1379                         assert_se(sigemptyset(&mask) == 0);
1380                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1381
1382                         k = open_terminal(console, O_RDWR);
1383                         if (k != STDIN_FILENO) {
1384                                 if (k >= 0) {
1385                                         close_nointr_nofail(k);
1386                                         k = -EINVAL;
1387                                 }
1388
1389                                 log_error("Failed to open console: %s", strerror(-k));
1390                                 goto child_fail;
1391                         }
1392
1393                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1394                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1395                                 log_error("Failed to duplicate console: %m");
1396                                 goto child_fail;
1397                         }
1398
1399                         if (setsid() < 0) {
1400                                 log_error("setsid() failed: %m");
1401                                 goto child_fail;
1402                         }
1403
1404                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1405                                 log_error("PR_SET_PDEATHSIG failed: %m");
1406                                 goto child_fail;
1407                         }
1408
1409                         if (setup_cgroup(newcg) < 0)
1410                                 goto child_fail;
1411
1412                         /* Mark everything as slave, so that we still
1413                          * receive mounts from the real root, but don't
1414                          * propagate mounts to the real root. */
1415                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1416                                 log_error("MS_SLAVE|MS_REC failed: %m");
1417                                 goto child_fail;
1418                         }
1419
1420                         /* Turn directory into bind mount */
1421                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1422                                 log_error("Failed to make bind mount.");
1423                                 goto child_fail;
1424                         }
1425
1426                         if (arg_read_only)
1427                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1428                                         log_error("Failed to make read-only.");
1429                                         goto child_fail;
1430                                 }
1431
1432                         if (mount_all(arg_directory) < 0)
1433                                 goto child_fail;
1434
1435                         if (copy_devnodes(arg_directory) < 0)
1436                                 goto child_fail;
1437
1438                         if (setup_ptmx(arg_directory) < 0)
1439                                 goto child_fail;
1440
1441                         dev_setup(arg_directory);
1442
1443                         if (setup_dev_console(arg_directory, console) < 0)
1444                                 goto child_fail;
1445
1446                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1447                                 goto child_fail;
1448
1449                         close_nointr_nofail(kmsg_socket_pair[1]);
1450                         kmsg_socket_pair[1] = -1;
1451
1452                         if (setup_boot_id(arg_directory) < 0)
1453                                 goto child_fail;
1454
1455                         if (setup_timezone(arg_directory) < 0)
1456                                 goto child_fail;
1457
1458                         if (setup_resolv_conf(arg_directory) < 0)
1459                                 goto child_fail;
1460
1461                         if (setup_journal(arg_directory) < 0)
1462                                 goto child_fail;
1463
1464                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1465                                 goto child_fail;
1466
1467                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1468                                 goto child_fail;
1469
1470                         if (chdir(arg_directory) < 0) {
1471                                 log_error("chdir(%s) failed: %m", arg_directory);
1472                                 goto child_fail;
1473                         }
1474
1475                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1476                                 log_error("mount(MS_MOVE) failed: %m");
1477                                 goto child_fail;
1478                         }
1479
1480                         if (chroot(".") < 0) {
1481                                 log_error("chroot() failed: %m");
1482                                 goto child_fail;
1483                         }
1484
1485                         if (chdir("/") < 0) {
1486                                 log_error("chdir() failed: %m");
1487                                 goto child_fail;
1488                         }
1489
1490                         umask(0022);
1491
1492                         loopback_setup();
1493
1494                         if (drop_capabilities() < 0) {
1495                                 log_error("drop_capabilities() failed: %m");
1496                                 goto child_fail;
1497                         }
1498
1499                         if (arg_user) {
1500
1501                                 /* Note that this resolves user names
1502                                  * inside the container, and hence
1503                                  * accesses the NSS modules from the
1504                                  * container and not the host. This is
1505                                  * a bit weird... */
1506
1507                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1508                                         log_error("get_user_creds() failed: %m");
1509                                         goto child_fail;
1510                                 }
1511
1512                                 if (mkdir_parents_label(home, 0775) < 0) {
1513                                         log_error("mkdir_parents_label() failed: %m");
1514                                         goto child_fail;
1515                                 }
1516
1517                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1518                                         log_error("mkdir_safe_label() failed: %m");
1519                                         goto child_fail;
1520                                 }
1521
1522                                 if (initgroups((const char*)arg_user, gid) < 0) {
1523                                         log_error("initgroups() failed: %m");
1524                                         goto child_fail;
1525                                 }
1526
1527                                 if (setresgid(gid, gid, gid) < 0) {
1528                                         log_error("setregid() failed: %m");
1529                                         goto child_fail;
1530                                 }
1531
1532                                 if (setresuid(uid, uid, uid) < 0) {
1533                                         log_error("setreuid() failed: %m");
1534                                         goto child_fail;
1535                                 }
1536                         } else {
1537                                 /* Reset everything fully to 0, just in case */
1538
1539                                 if (setgroups(0, NULL) < 0) {
1540                                         log_error("setgroups() failed: %m");
1541                                         goto child_fail;
1542                                 }
1543
1544                                 if (setresgid(0, 0, 0) < 0) {
1545                                         log_error("setregid() failed: %m");
1546                                         goto child_fail;
1547                                 }
1548
1549                                 if (setresuid(0, 0, 0) < 0) {
1550                                         log_error("setreuid() failed: %m");
1551                                         goto child_fail;
1552                                 }
1553                         }
1554
1555                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1556                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1557                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1558                                 log_oom();
1559                                 goto child_fail;
1560                         }
1561
1562                         if (arg_uuid) {
1563                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1564                                         log_oom();
1565                                         goto child_fail;
1566                                 }
1567                         }
1568
1569                         if (fdset_size(fds) > 0) {
1570                                 k = fdset_cloexec(fds, false);
1571                                 if (k < 0) {
1572                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1573                                         goto child_fail;
1574                                 }
1575
1576                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1577                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) 1) < 0)) {
1578                                         log_oom();
1579                                         goto child_fail;
1580                                 }
1581                         }
1582
1583                         setup_hostname();
1584
1585                         if (arg_boot) {
1586                                 char **a;
1587                                 size_t l;
1588
1589                                 /* Automatically search for the init system */
1590
1591                                 l = 1 + argc - optind;
1592                                 a = newa(char*, l + 1);
1593                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1594
1595                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1596                                 execve(a[0], a, (char**) envp);
1597
1598                                 a[0] = (char*) "/lib/systemd/systemd";
1599                                 execve(a[0], a, (char**) envp);
1600
1601                                 a[0] = (char*) "/sbin/init";
1602                                 execve(a[0], a, (char**) envp);
1603                         } else if (argc > optind)
1604                                 execvpe(argv[optind], argv + optind, (char**) envp);
1605                         else {
1606                                 chdir(home ? home : "/root");
1607                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1608                         }
1609
1610                         log_error("execv() failed: %m");
1611
1612                 child_fail:
1613                         _exit(EXIT_FAILURE);
1614                 }
1615
1616                 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1617                 close_nointr_nofail(pipefd[0]);
1618                 close_nointr_nofail(pipefd[1]);
1619
1620                 fdset_free(fds);
1621                 fds = NULL;
1622
1623                 if (process_pty(master, pid, &mask) < 0)
1624                         goto finish;
1625
1626                 if (saved_attr_valid)
1627                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1628
1629                 r = wait_for_terminate(pid, &status);
1630                 if (r < 0) {
1631                         r = EXIT_FAILURE;
1632                         break;
1633                 }
1634
1635                 if (status.si_code == CLD_EXITED) {
1636                         if (status.si_status != 0) {
1637                                 log_error("Container failed with error code %i.", status.si_status);
1638                                 r = status.si_status;
1639                                 break;
1640                         }
1641
1642                         log_debug("Container exited successfully.");
1643                         break;
1644                 } else if (status.si_code == CLD_KILLED &&
1645                            status.si_status == SIGINT) {
1646                         log_info("Container has been shut down.");
1647                         r = 0;
1648                         break;
1649                 } else if (status.si_code == CLD_KILLED &&
1650                            status.si_status == SIGHUP) {
1651                         log_info("Container is being rebooted.");
1652                         continue;
1653                 } else if (status.si_code == CLD_KILLED ||
1654                            status.si_code == CLD_DUMPED) {
1655
1656                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1657                         r = EXIT_FAILURE;
1658                         break;
1659                 } else {
1660                         log_error("Container failed due to unknown reason.");
1661                         r = EXIT_FAILURE;
1662                         break;
1663                 }
1664         }
1665
1666 finish:
1667         if (saved_attr_valid)
1668                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1669
1670         close_pipe(kmsg_socket_pair);
1671
1672         if (newcg)
1673                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1674
1675         free(arg_directory);
1676         free(arg_machine);
1677         strv_free(arg_controllers);
1678
1679         fdset_free(fds);
1680
1681         return r;
1682 }