chiark / gitweb /
nspawn: introduce the new /machine/ tree in the cgroup tree and move containers there
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/poll.h>
37 #include <sys/epoll.h>
38 #include <termios.h>
39 #include <sys/signalfd.h>
40 #include <grp.h>
41 #include <linux/fs.h>
42 #include <sys/un.h>
43 #include <sys/socket.h>
44
45 #include <systemd/sd-daemon.h>
46
47 #include "log.h"
48 #include "util.h"
49 #include "mkdir.h"
50 #include "macro.h"
51 #include "audit.h"
52 #include "missing.h"
53 #include "cgroup-util.h"
54 #include "strv.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
57 #include "sd-id128.h"
58 #include "dev-setup.h"
59 #include "fdset.h"
60 #include "build.h"
61 #include "fileio.h"
62
/* Numeric group id handed to the devpts instance mounted in
 * mount_all() (its "gid=" option). A definition supplied by the build
 * takes precedence over this fallback. */
#if !defined(TTY_GID)
#  define TTY_GID 5
#endif
66
/* Selects how setup_journal() connects the journal directory of the
 * container (<root>/var/log/journal/<machine-id>) with the one of the
 * host (/var/log/journal/<machine-id>). */
typedef enum LinkJournal {
        /* Do not touch either directory. */
        LINK_NO,

        /* Keep an existing host symlink that points into the guest;
         * otherwise bind mount the host directory into the guest, but
         * only if it already exists on the host. Problems that would
         * be fatal in the explicit modes are silently skipped. */
        LINK_AUTO,

        /* Create the host directory if needed and bind mount it onto
         * the guest directory. */
        LINK_HOST,

        /* Make the host path a symlink to the guest directory. */
        LINK_GUEST
} LinkJournal;
73
74 static char *arg_directory = NULL;
75 static char *arg_user = NULL;
76 static char **arg_controllers = NULL;
77 static char *arg_uuid = NULL;
78 static char *arg_machine = NULL;
79 static bool arg_private_network = false;
80 static bool arg_read_only = false;
81 static bool arg_boot = false;
82 static LinkJournal arg_link_journal = LINK_AUTO;
83 static uint64_t arg_retain =
84         (1ULL << CAP_CHOWN) |
85         (1ULL << CAP_DAC_OVERRIDE) |
86         (1ULL << CAP_DAC_READ_SEARCH) |
87         (1ULL << CAP_FOWNER) |
88         (1ULL << CAP_FSETID) |
89         (1ULL << CAP_IPC_OWNER) |
90         (1ULL << CAP_KILL) |
91         (1ULL << CAP_LEASE) |
92         (1ULL << CAP_LINUX_IMMUTABLE) |
93         (1ULL << CAP_NET_BIND_SERVICE) |
94         (1ULL << CAP_NET_BROADCAST) |
95         (1ULL << CAP_NET_RAW) |
96         (1ULL << CAP_SETGID) |
97         (1ULL << CAP_SETFCAP) |
98         (1ULL << CAP_SETPCAP) |
99         (1ULL << CAP_SETUID) |
100         (1ULL << CAP_SYS_ADMIN) |
101         (1ULL << CAP_SYS_CHROOT) |
102         (1ULL << CAP_SYS_NICE) |
103         (1ULL << CAP_SYS_PTRACE) |
104         (1ULL << CAP_SYS_TTY_CONFIG) |
105         (1ULL << CAP_SYS_RESOURCE) |
106         (1ULL << CAP_SYS_BOOT) |
107         (1ULL << CAP_AUDIT_WRITE) |
108         (1ULL << CAP_AUDIT_CONTROL);
109 static char **arg_bind = NULL;
110 static char **arg_bind_ro = NULL;
111
112 static int help(void) {
113
114         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
115                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
116                "  -h --help                Show this help\n"
117                "     --version             Print version string\n"
118                "  -D --directory=NAME      Root directory for the container\n"
119                "  -b --boot                Boot up full system (i.e. invoke init)\n"
120                "  -u --user=USER           Run the command under specified user or uid\n"
121                "  -C --controllers=LIST    Put the container in specified comma-separated\n"
122                "                           cgroup hierarchies\n"
123                "     --uuid=UUID           Set a specific machine UUID for the container\n"
124                "  -M --machine=NAME        Set the machine name for the container\n"
125                "     --private-network     Disable network in container\n"
126                "     --read-only           Mount the root directory read-only\n"
127                "     --capability=CAP      In addition to the default, retain specified\n"
128                "                           capability\n"
129                "     --link-journal=MODE   Link up guest journal, one of no, auto, guest, host\n"
130                "  -j                       Equivalent to --link-journal=host\n"
131                "     --bind=PATH[:PATH]    Bind mount a file or directory from the host into\n"
132                "                           the container\n"
133                "     --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
134                program_invocation_short_name);
135
136         return 0;
137 }
138
139 static int parse_argv(int argc, char *argv[]) {
140
141         enum {
142                 ARG_VERSION = 0x100,
143                 ARG_PRIVATE_NETWORK,
144                 ARG_UUID,
145                 ARG_READ_ONLY,
146                 ARG_CAPABILITY,
147                 ARG_LINK_JOURNAL,
148                 ARG_BIND,
149                 ARG_BIND_RO
150         };
151
152         static const struct option options[] = {
153                 { "help",            no_argument,       NULL, 'h'                 },
154                 { "version",         no_argument,       NULL, ARG_VERSION         },
155                 { "directory",       required_argument, NULL, 'D'                 },
156                 { "user",            required_argument, NULL, 'u'                 },
157                 { "controllers",     required_argument, NULL, 'C'                 },
158                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
159                 { "boot",            no_argument,       NULL, 'b'                 },
160                 { "uuid",            required_argument, NULL, ARG_UUID            },
161                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
162                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
163                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
164                 { "bind",            required_argument, NULL, ARG_BIND            },
165                 { "bind-ro",         required_argument, NULL, ARG_BIND_RO         },
166                 { "machine",         required_argument, NULL, 'M'                 },
167                 { NULL,              0,                 NULL, 0                   }
168         };
169
170         int c;
171
172         assert(argc >= 0);
173         assert(argv);
174
175         while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
176
177                 switch (c) {
178
179                 case 'h':
180                         help();
181                         return 0;
182
183                 case ARG_VERSION:
184                         puts(PACKAGE_STRING);
185                         puts(SYSTEMD_FEATURES);
186                         return 0;
187
188                 case 'D':
189                         free(arg_directory);
190                         arg_directory = canonicalize_file_name(optarg);
191                         if (!arg_directory) {
192                                 log_error("Failed to canonicalize root directory.");
193                                 return -ENOMEM;
194                         }
195
196                         break;
197
198                 case 'u':
199                         free(arg_user);
200                         arg_user = strdup(optarg);
201                         if (!arg_user)
202                                 return log_oom();
203
204                         break;
205
206                 case 'C':
207                         strv_free(arg_controllers);
208                         arg_controllers = strv_split(optarg, ",");
209                         if (!arg_controllers)
210                                 return log_oom();
211
212                         cg_shorten_controllers(arg_controllers);
213                         break;
214
215                 case ARG_PRIVATE_NETWORK:
216                         arg_private_network = true;
217                         break;
218
219                 case 'b':
220                         arg_boot = true;
221                         break;
222
223                 case ARG_UUID:
224                         arg_uuid = optarg;
225                         break;
226
227                 case 'M':
228                         if (!hostname_is_valid(optarg)) {
229                                 log_error("Invalid machine name: %s", optarg);
230                                 return -EINVAL;
231                         }
232
233                         free(arg_machine);
234                         arg_machine = strdup(optarg);
235                         if (!arg_machine)
236                                 return log_oom();
237
238                         break;
239
240                 case ARG_READ_ONLY:
241                         arg_read_only = true;
242                         break;
243
244                 case ARG_CAPABILITY: {
245                         char *state, *word;
246                         size_t length;
247
248                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
249                                 cap_value_t cap;
250                                 char *t;
251
252                                 t = strndup(word, length);
253                                 if (!t)
254                                         return log_oom();
255
256                                 if (cap_from_name(t, &cap) < 0) {
257                                         log_error("Failed to parse capability %s.", t);
258                                         free(t);
259                                         return -EINVAL;
260                                 }
261
262                                 free(t);
263                                 arg_retain |= 1ULL << (uint64_t) cap;
264                         }
265
266                         break;
267                 }
268
269                 case 'j':
270                         arg_link_journal = LINK_GUEST;
271                         break;
272
273                 case ARG_LINK_JOURNAL:
274                         if (streq(optarg, "auto"))
275                                 arg_link_journal = LINK_AUTO;
276                         else if (streq(optarg, "no"))
277                                 arg_link_journal = LINK_NO;
278                         else if (streq(optarg, "guest"))
279                                 arg_link_journal = LINK_GUEST;
280                         else if (streq(optarg, "host"))
281                                 arg_link_journal = LINK_HOST;
282                         else {
283                                 log_error("Failed to parse link journal mode %s", optarg);
284                                 return -EINVAL;
285                         }
286
287                         break;
288
289                 case ARG_BIND:
290                 case ARG_BIND_RO: {
291                         _cleanup_free_ char *a = NULL, *b = NULL;
292                         char *e;
293                         char ***x;
294                         int r;
295
296                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
297
298                         e = strchr(optarg, ':');
299                         if (e) {
300                                 a = strndup(optarg, e - optarg);
301                                 b = strdup(e + 1);
302                         } else {
303                                 a = strdup(optarg);
304                                 b = strdup(optarg);
305                         }
306
307                         if (!a || !b)
308                                 return log_oom();
309
310                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
311                                 log_error("Invalid bind mount specification: %s", optarg);
312                                 return -EINVAL;
313                         }
314
315                         r = strv_extend(x, a);
316                         if (r < 0)
317                                 return r;
318
319                         r = strv_extend(x, b);
320                         if (r < 0)
321                                 return r;
322
323                         break;
324                 }
325
326                 case '?':
327                         return -EINVAL;
328
329                 default:
330                         log_error("Unknown option code %c", c);
331                         return -EINVAL;
332                 }
333         }
334
335         return 1;
336 }
337
338 static int mount_all(const char *dest) {
339
340         typedef struct MountPoint {
341                 const char *what;
342                 const char *where;
343                 const char *type;
344                 const char *options;
345                 unsigned long flags;
346                 bool fatal;
347         } MountPoint;
348
349         static const MountPoint mount_table[] = {
350                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
351                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
352                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
353                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
354                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
355                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
356                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
357                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
358 #ifdef HAVE_SELINUX
359                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
360                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
361 #endif
362         };
363
364         unsigned k;
365         int r = 0;
366
367         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
368                 char _cleanup_free_ *where = NULL;
369                 int t;
370
371                 where = strjoin(dest, "/", mount_table[k].where, NULL);
372                 if (!where)
373                         return log_oom();
374
375                 t = path_is_mount_point(where, true);
376                 if (t < 0) {
377                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
378
379                         if (r == 0)
380                                 r = t;
381
382                         continue;
383                 }
384
385                 /* Skip this entry if it is not a remount. */
386                 if (mount_table[k].what && t > 0)
387                         continue;
388
389                 mkdir_p(where, 0755);
390
391                 if (mount(mount_table[k].what,
392                           where,
393                           mount_table[k].type,
394                           mount_table[k].flags,
395                           mount_table[k].options) < 0 &&
396                     mount_table[k].fatal) {
397
398                         log_error("mount(%s) failed: %m", where);
399
400                         if (r == 0)
401                                 r = -errno;
402                 }
403         }
404
405         return r;
406 }
407
408 static int mount_binds(const char *dest, char **l, unsigned long flags) {
409         char **x, **y;
410
411         STRV_FOREACH_PAIR(x, y, l) {
412                 _cleanup_free_ char *where = NULL;
413
414                 where = strjoin(dest, "/", *y, NULL);
415                 if (!where)
416                         return log_oom();
417
418                 mkdir_p_label(where, 0755);
419
420                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
421                         log_error("mount(%s) failed: %m", where);
422                         return -errno;
423                 }
424
425                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
426                         log_error("mount(%s) failed: %m", where);
427                         return -errno;
428                 }
429         }
430
431         return 0;
432 }
433
434 static int setup_timezone(const char *dest) {
435         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
436         char *z, *y;
437         int r;
438
439         assert(dest);
440
441         /* Fix the timezone, if possible */
442         r = readlink_malloc("/etc/localtime", &p);
443         if (r < 0) {
444                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
445                 return 0;
446         }
447
448         z = path_startswith(p, "../usr/share/zoneinfo/");
449         if (!z)
450                 z = path_startswith(p, "/usr/share/zoneinfo/");
451         if (!z) {
452                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
453                 return 0;
454         }
455
456         where = strappend(dest, "/etc/localtime");
457         if (!where)
458                 return log_oom();
459
460         r = readlink_malloc(where, &q);
461         if (r >= 0) {
462                 y = path_startswith(q, "../usr/share/zoneinfo/");
463                 if (!y)
464                         y = path_startswith(q, "/usr/share/zoneinfo/");
465
466
467                 /* Already pointing to the right place? Then do nothing .. */
468                 if (y && streq(y, z))
469                         return 0;
470         }
471
472         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
473         if (!check)
474                 return log_oom();
475
476         if (access(check, F_OK) < 0) {
477                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
478                 return 0;
479         }
480
481         what = strappend("../usr/share/zoneinfo/", z);
482         if (!what)
483                 return log_oom();
484
485         unlink(where);
486         if (symlink(what, where) < 0) {
487                 log_error("Failed to correct timezone of container: %m");
488                 return 0;
489         }
490
491         return 0;
492 }
493
494 static int setup_resolv_conf(const char *dest) {
495         char *where;
496
497         assert(dest);
498
499         if (arg_private_network)
500                 return 0;
501
502         /* Fix resolv.conf, if possible */
503         where = strappend(dest, "/etc/resolv.conf");
504         if (!where)
505                 return log_oom();
506
507         /* We don't really care for the results of this really. If it
508          * fails, it fails, but meh... */
509         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
510                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
511
512         free(where);
513
514         return 0;
515 }
516
517 static int setup_boot_id(const char *dest) {
518         char _cleanup_free_ *from = NULL, *to = NULL;
519         sd_id128_t rnd;
520         char as_uuid[37];
521         int r;
522
523         assert(dest);
524
525         /* Generate a new randomized boot ID, so that each boot-up of
526          * the container gets a new one */
527
528         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
529         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
530         if (!from || !to)
531                 return log_oom();
532
533         r = sd_id128_randomize(&rnd);
534         if (r < 0) {
535                 log_error("Failed to generate random boot id: %s", strerror(-r));
536                 return r;
537         }
538
539         snprintf(as_uuid, sizeof(as_uuid),
540                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
541                  SD_ID128_FORMAT_VAL(rnd));
542         char_array_0(as_uuid);
543
544         r = write_string_file(from, as_uuid);
545         if (r < 0) {
546                 log_error("Failed to write boot id: %s", strerror(-r));
547                 return r;
548         }
549
550         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
551                 log_error("Failed to bind mount boot id: %m");
552                 r = -errno;
553         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
554                 log_warning("Failed to make boot id read-only: %m");
555
556         unlink(from);
557         return r;
558 }
559
560 static int copy_devnodes(const char *dest) {
561
562         static const char devnodes[] =
563                 "null\0"
564                 "zero\0"
565                 "full\0"
566                 "random\0"
567                 "urandom\0"
568                 "tty\0";
569
570         const char *d;
571         int r = 0;
572         mode_t _cleanup_umask_ u;
573
574         assert(dest);
575
576         u = umask(0000);
577
578         NULSTR_FOREACH(d, devnodes) {
579                 struct stat st;
580                 char _cleanup_free_ *from = NULL, *to = NULL;
581
582                 asprintf(&from, "/dev/%s", d);
583                 asprintf(&to, "%s/dev/%s", dest, d);
584
585                 if (!from || !to) {
586                         log_oom();
587
588                         if (r == 0)
589                                 r = -ENOMEM;
590
591                         break;
592                 }
593
594                 if (stat(from, &st) < 0) {
595
596                         if (errno != ENOENT) {
597                                 log_error("Failed to stat %s: %m", from);
598                                 if (r == 0)
599                                         r = -errno;
600                         }
601
602                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
603
604                         log_error("%s is not a char or block device, cannot copy", from);
605                         if (r == 0)
606                                 r = -EIO;
607
608                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
609
610                         log_error("mknod(%s) failed: %m", dest);
611                         if (r == 0)
612                                 r = -errno;
613                 }
614         }
615
616         return r;
617 }
618
619 static int setup_ptmx(const char *dest) {
620         _cleanup_free_ char *p = NULL;
621
622         p = strappend(dest, "/dev/ptmx");
623         if (!p)
624                 return log_oom();
625
626         if (symlink("pts/ptmx", p) < 0) {
627                 log_error("Failed to create /dev/ptmx symlink: %m");
628                 return -errno;
629         }
630
631         return 0;
632 }
633
634 static int setup_dev_console(const char *dest, const char *console) {
635         struct stat st;
636         char _cleanup_free_ *to = NULL;
637         int r;
638         mode_t _cleanup_umask_ u;
639
640         assert(dest);
641         assert(console);
642
643         u = umask(0000);
644
645         if (stat(console, &st) < 0) {
646                 log_error("Failed to stat %s: %m", console);
647                 return -errno;
648
649         } else if (!S_ISCHR(st.st_mode)) {
650                 log_error("/dev/console is not a char device");
651                 return -EIO;
652         }
653
654         r = chmod_and_chown(console, 0600, 0, 0);
655         if (r < 0) {
656                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
657                 return r;
658         }
659
660         if (asprintf(&to, "%s/dev/console", dest) < 0)
661                 return log_oom();
662
663         /* We need to bind mount the right tty to /dev/console since
664          * ptys can only exist on pts file systems. To have something
665          * to bind mount things on we create a device node first, that
666          * has the right major/minor (note that the major minor
667          * doesn't actually matter here, since we mount it over
668          * anyway). */
669
670         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
671                 log_error("mknod() for /dev/console failed: %m");
672                 return -errno;
673         }
674
675         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
676                 log_error("Bind mount for /dev/console failed: %m");
677                 return -errno;
678         }
679
680         return 0;
681 }
682
683 static int setup_kmsg(const char *dest, int kmsg_socket) {
684         char _cleanup_free_ *from = NULL, *to = NULL;
685         int r, fd, k;
686         mode_t _cleanup_umask_ u;
687         union {
688                 struct cmsghdr cmsghdr;
689                 uint8_t buf[CMSG_SPACE(sizeof(int))];
690         } control = {};
691         struct msghdr mh = {
692                 .msg_control = &control,
693                 .msg_controllen = sizeof(control),
694         };
695         struct cmsghdr *cmsg;
696
697         assert(dest);
698         assert(kmsg_socket >= 0);
699
700         u = umask(0000);
701
702         /* We create the kmsg FIFO as /dev/kmsg, but immediately
703          * delete it after bind mounting it to /proc/kmsg. While FIFOs
704          * on the reading side behave very similar to /proc/kmsg,
705          * their writing side behaves differently from /dev/kmsg in
706          * that writing blocks when nothing is reading. In order to
707          * avoid any problems with containers deadlocking due to this
708          * we simply make /dev/kmsg unavailable to the container. */
709         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
710             asprintf(&to, "%s/proc/kmsg", dest) < 0)
711                 return log_oom();
712
713         if (mkfifo(from, 0600) < 0) {
714                 log_error("mkfifo() for /dev/kmsg failed: %m");
715                 return -errno;
716         }
717
718         r = chmod_and_chown(from, 0600, 0, 0);
719         if (r < 0) {
720                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
721                 return r;
722         }
723
724         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
725                 log_error("Bind mount for /proc/kmsg failed: %m");
726                 return -errno;
727         }
728
729         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
730         if (fd < 0) {
731                 log_error("Failed to open fifo: %m");
732                 return -errno;
733         }
734
735         cmsg = CMSG_FIRSTHDR(&mh);
736         cmsg->cmsg_level = SOL_SOCKET;
737         cmsg->cmsg_type = SCM_RIGHTS;
738         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
739         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
740
741         mh.msg_controllen = cmsg->cmsg_len;
742
743         /* Store away the fd in the socket, so that it stays open as
744          * long as we run the child */
745         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
746         close_nointr_nofail(fd);
747
748         if (k < 0) {
749                 log_error("Failed to send FIFO fd: %m");
750                 return -errno;
751         }
752
753         /* And now make the FIFO unavailable as /dev/kmsg... */
754         unlink(from);
755         return 0;
756 }
757
758 static int setup_hostname(void) {
759
760         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
761                 return -errno;
762
763         return 0;
764 }
765
766 static int setup_journal(const char *directory) {
767         sd_id128_t machine_id;
768         char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
769         char *id;
770         int r;
771
772         if (arg_link_journal == LINK_NO)
773                 return 0;
774
775         p = strappend(directory, "/etc/machine-id");
776         if (!p)
777                 return log_oom();
778
779         r = read_one_line_file(p, &b);
780         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
781                 return 0;
782         else if (r < 0) {
783                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
784                 return r;
785         }
786
787         id = strstrip(b);
788         if (isempty(id) && arg_link_journal == LINK_AUTO)
789                 return 0;
790
791         /* Verify validity */
792         r = sd_id128_from_string(id, &machine_id);
793         if (r < 0) {
794                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
795                 return r;
796         }
797
798         free(p);
799         p = strappend("/var/log/journal/", id);
800         q = strjoin(directory, "/var/log/journal/", id, NULL);
801         if (!p || !q)
802                 return log_oom();
803
804         if (path_is_mount_point(p, false) > 0) {
805                 if (arg_link_journal != LINK_AUTO) {
806                         log_error("%s: already a mount point, refusing to use for journal", p);
807                         return -EEXIST;
808                 }
809
810                 return 0;
811         }
812
813         if (path_is_mount_point(q, false) > 0) {
814                 if (arg_link_journal != LINK_AUTO) {
815                         log_error("%s: already a mount point, refusing to use for journal", q);
816                         return -EEXIST;
817                 }
818
819                 return 0;
820         }
821
822         r = readlink_and_make_absolute(p, &d);
823         if (r >= 0) {
824                 if ((arg_link_journal == LINK_GUEST ||
825                      arg_link_journal == LINK_AUTO) &&
826                     path_equal(d, q)) {
827
828                         r = mkdir_p(q, 0755);
829                         if (r < 0)
830                                 log_warning("failed to create directory %s: %m", q);
831                         return 0;
832                 }
833
834                 if (unlink(p) < 0) {
835                         log_error("Failed to remove symlink %s: %m", p);
836                         return -errno;
837                 }
838         } else if (r == -EINVAL) {
839
840                 if (arg_link_journal == LINK_GUEST &&
841                     rmdir(p) < 0) {
842
843                         if (errno == ENOTDIR) {
844                                 log_error("%s already exists and is neither a symlink nor a directory", p);
845                                 return r;
846                         } else {
847                                 log_error("Failed to remove %s: %m", p);
848                                 return -errno;
849                         }
850                 }
851         } else if (r != -ENOENT) {
852                 log_error("readlink(%s) failed: %m", p);
853                 return r;
854         }
855
856         if (arg_link_journal == LINK_GUEST) {
857
858                 if (symlink(q, p) < 0) {
859                         log_error("Failed to symlink %s to %s: %m", q, p);
860                         return -errno;
861                 }
862
863                 r = mkdir_p(q, 0755);
864                 if (r < 0)
865                         log_warning("failed to create directory %s: %m", q);
866                 return 0;
867         }
868
869         if (arg_link_journal == LINK_HOST) {
870                 r = mkdir_p(p, 0755);
871                 if (r < 0) {
872                         log_error("Failed to create %s: %m", p);
873                         return r;
874                 }
875
876         } else if (access(p, F_OK) < 0)
877                 return 0;
878
879         if (dir_is_empty(q) == 0) {
880                 log_error("%s not empty.", q);
881                 return -ENOTEMPTY;
882         }
883
884         r = mkdir_p(q, 0755);
885         if (r < 0) {
886                 log_error("Failed to create %s: %m", q);
887                 return r;
888         }
889
890         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
891                 log_error("Failed to bind mount journal from host into guest: %m");
892                 return -errno;
893         }
894
895         return 0;
896 }
897
898 static int setup_cgroup(const char *path) {
899         char **c;
900         int r;
901
902         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, path, 1);
903         if (r < 0) {
904                 log_error("Failed to create cgroup: %s", strerror(-r));
905                 return r;
906         }
907
908         STRV_FOREACH(c, arg_controllers) {
909                 r = cg_create_and_attach(*c, path, 1);
910                 if (r < 0)
911                         log_warning("Failed to create cgroup in controller %s: %s", *c, strerror(-r));
912         }
913
914         return 0;
915 }
916
917 static int drop_capabilities(void) {
918         return capability_bounding_set_drop(~arg_retain, false);
919 }
920
/* Checks whether the directory tree at "path" looks like an OS
 * installation. The existence of <path>/bin/sh serves as the flag
 * file for that.
 *
 * Returns 1 if the flag file is accessible, 0 if it is not, and
 * -ENOMEM if the file name could not be allocated. */
static int is_os_tree(const char *path) {
        char *flag_file = NULL;
        int found;

        if (asprintf(&flag_file, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(flag_file, F_OK) >= 0;
        free(flag_file);

        if (found)
                return 1;

        return 0;
}
934
935 static int process_pty(int master, pid_t pid, sigset_t *mask) {
936
937         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
938         size_t in_buffer_full = 0, out_buffer_full = 0;
939         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
940         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
941         int ep = -1, signal_fd = -1, r;
942         bool tried_orderly_shutdown = false;
943
944         assert(master >= 0);
945         assert(pid > 0);
946         assert(mask);
947
948         fd_nonblock(STDIN_FILENO, 1);
949         fd_nonblock(STDOUT_FILENO, 1);
950         fd_nonblock(master, 1);
951
952         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
953         if (signal_fd < 0) {
954                 log_error("signalfd(): %m");
955                 r = -errno;
956                 goto finish;
957         }
958
959         ep = epoll_create1(EPOLL_CLOEXEC);
960         if (ep < 0) {
961                 log_error("Failed to create epoll: %m");
962                 r = -errno;
963                 goto finish;
964         }
965
966         /* We read from STDIN only if this is actually a TTY,
967          * otherwise we assume non-interactivity. */
968         if (isatty(STDIN_FILENO)) {
969                 zero(stdin_ev);
970                 stdin_ev.events = EPOLLIN|EPOLLET;
971                 stdin_ev.data.fd = STDIN_FILENO;
972
973                 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
974                         log_error("Failed to register STDIN in epoll: %m");
975                         r = -errno;
976                         goto finish;
977                 }
978         }
979
980         zero(stdout_ev);
981         stdout_ev.events = EPOLLOUT|EPOLLET;
982         stdout_ev.data.fd = STDOUT_FILENO;
983
984         zero(master_ev);
985         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
986         master_ev.data.fd = master;
987
988         zero(signal_ev);
989         signal_ev.events = EPOLLIN;
990         signal_ev.data.fd = signal_fd;
991
992         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
993                 if (errno != EPERM) {
994                         log_error("Failed to register stdout in epoll: %m");
995                         r = -errno;
996                         goto finish;
997                 }
998                 /* stdout without epoll support. Likely redirected to regular file. */
999                 stdout_writable = true;
1000         }
1001
1002         if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
1003             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
1004                 log_error("Failed to register fds in epoll: %m");
1005                 r = -errno;
1006                 goto finish;
1007         }
1008
1009         for (;;) {
1010                 struct epoll_event ev[16];
1011                 ssize_t k;
1012                 int i, nfds;
1013
1014                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1015                 if (nfds < 0) {
1016
1017                         if (errno == EINTR || errno == EAGAIN)
1018                                 continue;
1019
1020                         log_error("epoll_wait(): %m");
1021                         r = -errno;
1022                         goto finish;
1023                 }
1024
1025                 assert(nfds >= 1);
1026
1027                 for (i = 0; i < nfds; i++) {
1028                         if (ev[i].data.fd == STDIN_FILENO) {
1029
1030                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1031                                         stdin_readable = true;
1032
1033                         } else if (ev[i].data.fd == STDOUT_FILENO) {
1034
1035                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1036                                         stdout_writable = true;
1037
1038                         } else if (ev[i].data.fd == master) {
1039
1040                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1041                                         master_readable = true;
1042
1043                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1044                                         master_writable = true;
1045
1046                         } else if (ev[i].data.fd == signal_fd) {
1047                                 struct signalfd_siginfo sfsi;
1048                                 ssize_t n;
1049
1050                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
1051                                 if (n != sizeof(sfsi)) {
1052
1053                                         if (n >= 0) {
1054                                                 log_error("Failed to read from signalfd: invalid block size");
1055                                                 r = -EIO;
1056                                                 goto finish;
1057                                         }
1058
1059                                         if (errno != EINTR && errno != EAGAIN) {
1060                                                 log_error("Failed to read from signalfd: %m");
1061                                                 r = -errno;
1062                                                 goto finish;
1063                                         }
1064                                 } else {
1065
1066                                         if (sfsi.ssi_signo == SIGWINCH) {
1067                                                 struct winsize ws;
1068
1069                                                 /* The window size changed, let's forward that. */
1070                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1071                                                         ioctl(master, TIOCSWINSZ, &ws);
1072                                         } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1073
1074                                                 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1075
1076                                                 /* This only works for systemd... */
1077                                                 tried_orderly_shutdown = true;
1078                                                 kill(pid, SIGRTMIN+3);
1079
1080                                         } else {
1081                                                 r = 0;
1082                                                 goto finish;
1083                                         }
1084                                 }
1085                         }
1086                 }
1087
1088                 while ((stdin_readable && in_buffer_full <= 0) ||
1089                        (master_writable && in_buffer_full > 0) ||
1090                        (master_readable && out_buffer_full <= 0) ||
1091                        (stdout_writable && out_buffer_full > 0)) {
1092
1093                         if (stdin_readable && in_buffer_full < LINE_MAX) {
1094
1095                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1096                                 if (k < 0) {
1097
1098                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1099                                                 stdin_readable = false;
1100                                         else {
1101                                                 log_error("read(): %m");
1102                                                 r = -errno;
1103                                                 goto finish;
1104                                         }
1105                                 } else
1106                                         in_buffer_full += (size_t) k;
1107                         }
1108
1109                         if (master_writable && in_buffer_full > 0) {
1110
1111                                 k = write(master, in_buffer, in_buffer_full);
1112                                 if (k < 0) {
1113
1114                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1115                                                 master_writable = false;
1116                                         else {
1117                                                 log_error("write(): %m");
1118                                                 r = -errno;
1119                                                 goto finish;
1120                                         }
1121
1122                                 } else {
1123                                         assert(in_buffer_full >= (size_t) k);
1124                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1125                                         in_buffer_full -= k;
1126                                 }
1127                         }
1128
1129                         if (master_readable && out_buffer_full < LINE_MAX) {
1130
1131                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1132                                 if (k < 0) {
1133
1134                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1135                                                 master_readable = false;
1136                                         else {
1137                                                 log_error("read(): %m");
1138                                                 r = -errno;
1139                                                 goto finish;
1140                                         }
1141                                 }  else
1142                                         out_buffer_full += (size_t) k;
1143                         }
1144
1145                         if (stdout_writable && out_buffer_full > 0) {
1146
1147                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1148                                 if (k < 0) {
1149
1150                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1151                                                 stdout_writable = false;
1152                                         else {
1153                                                 log_error("write(): %m");
1154                                                 r = -errno;
1155                                                 goto finish;
1156                                         }
1157
1158                                 } else {
1159                                         assert(out_buffer_full >= (size_t) k);
1160                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1161                                         out_buffer_full -= k;
1162                                 }
1163                         }
1164                 }
1165         }
1166
1167 finish:
1168         if (ep >= 0)
1169                 close_nointr_nofail(ep);
1170
1171         if (signal_fd >= 0)
1172                 close_nointr_nofail(signal_fd);
1173
1174         return r;
1175 }
1176
1177 int main(int argc, char *argv[]) {
1178         pid_t pid = 0;
1179         int r = EXIT_FAILURE, k;
1180         _cleanup_free_ char *machine_root = NULL, *newcg = NULL;
1181         _cleanup_close_ int master = -1;
1182         int n_fd_passed;
1183         const char *console = NULL;
1184         struct termios saved_attr, raw_attr;
1185         sigset_t mask;
1186         bool saved_attr_valid = false;
1187         struct winsize ws;
1188         int kmsg_socket_pair[2] = { -1, -1 };
1189         FDSet *fds = NULL;
1190
1191         log_parse_environment();
1192         log_open();
1193
1194         r = parse_argv(argc, argv);
1195         if (r <= 0)
1196                 goto finish;
1197
1198         if (arg_directory) {
1199                 char *p;
1200
1201                 p = path_make_absolute_cwd(arg_directory);
1202                 free(arg_directory);
1203                 arg_directory = p;
1204         } else
1205                 arg_directory = get_current_dir_name();
1206
1207         if (!arg_directory) {
1208                 log_error("Failed to determine path");
1209                 goto finish;
1210         }
1211
1212         path_kill_slashes(arg_directory);
1213
1214         if (!arg_machine) {
1215                 arg_machine = strdup(path_get_file_name(arg_directory));
1216                 if (!arg_machine) {
1217                         log_oom();
1218                         goto finish;
1219                 }
1220
1221                 hostname_cleanup(arg_machine);
1222                 if (isempty(arg_machine)) {
1223                         log_error("Failed to determine machine name automatically, please use -M.");
1224                         goto finish;
1225                 }
1226         }
1227
1228         if (geteuid() != 0) {
1229                 log_error("Need to be root.");
1230                 goto finish;
1231         }
1232
1233         if (sd_booted() <= 0) {
1234                 log_error("Not running on a systemd system.");
1235                 goto finish;
1236         }
1237
1238         if (path_equal(arg_directory, "/")) {
1239                 log_error("Spawning container on root directory not supported.");
1240                 goto finish;
1241         }
1242
1243         if (is_os_tree(arg_directory) <= 0) {
1244                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1245                 goto finish;
1246         }
1247
1248         log_close();
1249         n_fd_passed = sd_listen_fds(false);
1250         if (n_fd_passed > 0) {
1251                 k = fdset_new_listen_fds(&fds, false);
1252                 if (k < 0) {
1253                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1254                         goto finish;
1255                 }
1256         }
1257         fdset_close_others(fds);
1258         log_open();
1259
1260         k = cg_get_machine_path(&machine_root);
1261         if (k < 0) {
1262                 log_error("Failed to determine machine cgroup path: %s", strerror(-k));
1263                 goto finish;
1264         }
1265
1266         newcg = strjoin(machine_root, "/", arg_machine, NULL);
1267         if (!newcg) {
1268                 log_error("Failed to allocate cgroup path.");
1269                 goto finish;
1270         }
1271
1272         r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, newcg, false);
1273         if (r <= 0 && r != -ENOENT) {
1274                 log_error("Container already running.");
1275
1276                 free(newcg);
1277                 newcg = NULL;
1278
1279                 goto finish;
1280         }
1281
1282         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1283         if (master < 0) {
1284                 log_error("Failed to acquire pseudo tty: %m");
1285                 goto finish;
1286         }
1287
1288         console = ptsname(master);
1289         if (!console) {
1290                 log_error("Failed to determine tty name: %m");
1291                 goto finish;
1292         }
1293
1294         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1295
1296         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1297                 ioctl(master, TIOCSWINSZ, &ws);
1298
1299         if (unlockpt(master) < 0) {
1300                 log_error("Failed to unlock tty: %m");
1301                 goto finish;
1302         }
1303
1304         if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1305                 saved_attr_valid = true;
1306
1307                 raw_attr = saved_attr;
1308                 cfmakeraw(&raw_attr);
1309                 raw_attr.c_lflag &= ~ECHO;
1310         }
1311
1312         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1313                 log_error("Failed to create kmsg socket pair.");
1314                 goto finish;
1315         }
1316
1317         assert_se(sigemptyset(&mask) == 0);
1318         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1319         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1320
1321         for (;;) {
1322                 siginfo_t status;
1323                 int pipefd[2];
1324
1325                 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1326                         log_error("pipe2(): %m");
1327                         goto finish;
1328                 }
1329
1330                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1331                 if (pid < 0) {
1332                         if (errno == EINVAL)
1333                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1334                         else
1335                                 log_error("clone() failed: %m");
1336
1337                         goto finish;
1338                 }
1339
1340                 if (pid == 0) {
1341                         /* child */
1342                         const char *home = NULL;
1343                         uid_t uid = (uid_t) -1;
1344                         gid_t gid = (gid_t) -1;
1345                         unsigned n_env = 2;
1346                         const char *envp[] = {
1347                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1348                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1349                                 NULL, /* TERM */
1350                                 NULL, /* HOME */
1351                                 NULL, /* USER */
1352                                 NULL, /* LOGNAME */
1353                                 NULL, /* container_uuid */
1354                                 NULL, /* LISTEN_FDS */
1355                                 NULL, /* LISTEN_PID */
1356                                 NULL
1357                         };
1358
1359                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1360                         if (envp[n_env])
1361                                 n_env ++;
1362
1363                         close_nointr_nofail(pipefd[1]);
1364                         fd_wait_for_event(pipefd[0], POLLHUP, -1);
1365                         close_nointr_nofail(pipefd[0]);
1366
1367                         close_nointr_nofail(master);
1368                         master = -1;
1369
1370                         if (saved_attr_valid) {
1371                                 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1372                                         log_error("Failed to set terminal attributes: %m");
1373                                         goto child_fail;
1374                                 }
1375                         }
1376
1377                         close_nointr(STDIN_FILENO);
1378                         close_nointr(STDOUT_FILENO);
1379                         close_nointr(STDERR_FILENO);
1380
1381                         close_nointr_nofail(kmsg_socket_pair[0]);
1382                         kmsg_socket_pair[0] = -1;
1383
1384                         reset_all_signal_handlers();
1385
1386                         assert_se(sigemptyset(&mask) == 0);
1387                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1388
1389                         k = open_terminal(console, O_RDWR);
1390                         if (k != STDIN_FILENO) {
1391                                 if (k >= 0) {
1392                                         close_nointr_nofail(k);
1393                                         k = -EINVAL;
1394                                 }
1395
1396                                 log_error("Failed to open console: %s", strerror(-k));
1397                                 goto child_fail;
1398                         }
1399
1400                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1401                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1402                                 log_error("Failed to duplicate console: %m");
1403                                 goto child_fail;
1404                         }
1405
1406                         if (setsid() < 0) {
1407                                 log_error("setsid() failed: %m");
1408                                 goto child_fail;
1409                         }
1410
1411                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1412                                 log_error("PR_SET_PDEATHSIG failed: %m");
1413                                 goto child_fail;
1414                         }
1415
1416                         if (setup_cgroup(newcg) < 0)
1417                                 goto child_fail;
1418
1419                         /* Mark everything as slave, so that we still
1420                          * receive mounts from the real root, but don't
1421                          * propagate mounts to the real root. */
1422                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1423                                 log_error("MS_SLAVE|MS_REC failed: %m");
1424                                 goto child_fail;
1425                         }
1426
1427                         /* Turn directory into bind mount */
1428                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1429                                 log_error("Failed to make bind mount.");
1430                                 goto child_fail;
1431                         }
1432
1433                         if (arg_read_only)
1434                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1435                                         log_error("Failed to make read-only.");
1436                                         goto child_fail;
1437                                 }
1438
1439                         if (mount_all(arg_directory) < 0)
1440                                 goto child_fail;
1441
1442                         if (copy_devnodes(arg_directory) < 0)
1443                                 goto child_fail;
1444
1445                         if (setup_ptmx(arg_directory) < 0)
1446                                 goto child_fail;
1447
1448                         dev_setup(arg_directory);
1449
1450                         if (setup_dev_console(arg_directory, console) < 0)
1451                                 goto child_fail;
1452
1453                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1454                                 goto child_fail;
1455
1456                         close_nointr_nofail(kmsg_socket_pair[1]);
1457                         kmsg_socket_pair[1] = -1;
1458
1459                         if (setup_boot_id(arg_directory) < 0)
1460                                 goto child_fail;
1461
1462                         if (setup_timezone(arg_directory) < 0)
1463                                 goto child_fail;
1464
1465                         if (setup_resolv_conf(arg_directory) < 0)
1466                                 goto child_fail;
1467
1468                         if (setup_journal(arg_directory) < 0)
1469                                 goto child_fail;
1470
1471                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1472                                 goto child_fail;
1473
1474                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1475                                 goto child_fail;
1476
1477                         if (chdir(arg_directory) < 0) {
1478                                 log_error("chdir(%s) failed: %m", arg_directory);
1479                                 goto child_fail;
1480                         }
1481
1482                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1483                                 log_error("mount(MS_MOVE) failed: %m");
1484                                 goto child_fail;
1485                         }
1486
1487                         if (chroot(".") < 0) {
1488                                 log_error("chroot() failed: %m");
1489                                 goto child_fail;
1490                         }
1491
1492                         if (chdir("/") < 0) {
1493                                 log_error("chdir() failed: %m");
1494                                 goto child_fail;
1495                         }
1496
1497                         umask(0022);
1498
1499                         loopback_setup();
1500
1501                         if (drop_capabilities() < 0) {
1502                                 log_error("drop_capabilities() failed: %m");
1503                                 goto child_fail;
1504                         }
1505
1506                         if (arg_user) {
1507
1508                                 /* Note that this resolves user names
1509                                  * inside the container, and hence
1510                                  * accesses the NSS modules from the
1511                                  * container and not the host. This is
1512                                  * a bit weird... */
1513
1514                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1515                                         log_error("get_user_creds() failed: %m");
1516                                         goto child_fail;
1517                                 }
1518
1519                                 if (mkdir_parents_label(home, 0775) < 0) {
1520                                         log_error("mkdir_parents_label() failed: %m");
1521                                         goto child_fail;
1522                                 }
1523
1524                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1525                                         log_error("mkdir_safe_label() failed: %m");
1526                                         goto child_fail;
1527                                 }
1528
1529                                 if (initgroups((const char*)arg_user, gid) < 0) {
1530                                         log_error("initgroups() failed: %m");
1531                                         goto child_fail;
1532                                 }
1533
1534                                 if (setresgid(gid, gid, gid) < 0) {
1535                                         log_error("setregid() failed: %m");
1536                                         goto child_fail;
1537                                 }
1538
1539                                 if (setresuid(uid, uid, uid) < 0) {
1540                                         log_error("setreuid() failed: %m");
1541                                         goto child_fail;
1542                                 }
1543                         } else {
1544                                 /* Reset everything fully to 0, just in case */
1545
1546                                 if (setgroups(0, NULL) < 0) {
1547                                         log_error("setgroups() failed: %m");
1548                                         goto child_fail;
1549                                 }
1550
1551                                 if (setresgid(0, 0, 0) < 0) {
1552                                         log_error("setregid() failed: %m");
1553                                         goto child_fail;
1554                                 }
1555
1556                                 if (setresuid(0, 0, 0) < 0) {
1557                                         log_error("setreuid() failed: %m");
1558                                         goto child_fail;
1559                                 }
1560                         }
1561
1562                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1563                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1564                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1565                                 log_oom();
1566                                 goto child_fail;
1567                         }
1568
1569                         if (arg_uuid) {
1570                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1571                                         log_oom();
1572                                         goto child_fail;
1573                                 }
1574                         }
1575
1576                         if (fdset_size(fds) > 0) {
1577                                 k = fdset_cloexec(fds, false);
1578                                 if (k < 0) {
1579                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1580                                         goto child_fail;
1581                                 }
1582
1583                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1584                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) 1) < 0)) {
1585                                         log_oom();
1586                                         goto child_fail;
1587                                 }
1588                         }
1589
1590                         setup_hostname();
1591
1592                         if (arg_boot) {
1593                                 char **a;
1594                                 size_t l;
1595
1596                                 /* Automatically search for the init system */
1597
1598                                 l = 1 + argc - optind;
1599                                 a = newa(char*, l + 1);
1600                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1601
1602                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1603                                 execve(a[0], a, (char**) envp);
1604
1605                                 a[0] = (char*) "/lib/systemd/systemd";
1606                                 execve(a[0], a, (char**) envp);
1607
1608                                 a[0] = (char*) "/sbin/init";
1609                                 execve(a[0], a, (char**) envp);
1610                         } else if (argc > optind)
1611                                 execvpe(argv[optind], argv + optind, (char**) envp);
1612                         else {
1613                                 chdir(home ? home : "/root");
1614                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1615                         }
1616
1617                         log_error("execv() failed: %m");
1618
1619                 child_fail:
1620                         _exit(EXIT_FAILURE);
1621                 }
1622
1623                 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1624                 close_nointr_nofail(pipefd[0]);
1625                 close_nointr_nofail(pipefd[1]);
1626
1627                 fdset_free(fds);
1628                 fds = NULL;
1629
1630                 if (process_pty(master, pid, &mask) < 0)
1631                         goto finish;
1632
1633                 if (saved_attr_valid)
1634                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1635
1636                 r = wait_for_terminate(pid, &status);
1637                 if (r < 0) {
1638                         r = EXIT_FAILURE;
1639                         break;
1640                 }
1641
1642                 if (status.si_code == CLD_EXITED) {
1643                         if (status.si_status != 0) {
1644                                 log_error("Container failed with error code %i.", status.si_status);
1645                                 r = status.si_status;
1646                                 break;
1647                         }
1648
1649                         log_debug("Container exited successfully.");
1650                         break;
1651                 } else if (status.si_code == CLD_KILLED &&
1652                            status.si_status == SIGINT) {
1653                         log_info("Container has been shut down.");
1654                         r = 0;
1655                         break;
1656                 } else if (status.si_code == CLD_KILLED &&
1657                            status.si_status == SIGHUP) {
1658                         log_info("Container is being rebooted.");
1659                         continue;
1660                 } else if (status.si_code == CLD_KILLED ||
1661                            status.si_code == CLD_DUMPED) {
1662
1663                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1664                         r = EXIT_FAILURE;
1665                         break;
1666                 } else {
1667                         log_error("Container failed due to unknown reason.");
1668                         r = EXIT_FAILURE;
1669                         break;
1670                 }
1671         }
1672
1673 finish:
1674         if (saved_attr_valid)
1675                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1676
1677         close_pipe(kmsg_socket_pair);
1678
1679         if (newcg)
1680                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1681
1682         free(arg_directory);
1683         free(arg_machine);
1684         strv_free(arg_controllers);
1685
1686         fdset_free(fds);
1687
1688         return r;
1689 }