chiark / gitweb /
move _cleanup_ attribute in front of the type
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/poll.h>
37 #include <sys/epoll.h>
38 #include <termios.h>
39 #include <sys/signalfd.h>
40 #include <grp.h>
41 #include <linux/fs.h>
42 #include <sys/un.h>
43 #include <sys/socket.h>
44
45 #include <systemd/sd-daemon.h>
46
47 #include "log.h"
48 #include "util.h"
49 #include "mkdir.h"
50 #include "macro.h"
51 #include "audit.h"
52 #include "missing.h"
53 #include "cgroup-util.h"
54 #include "strv.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
57 #include "sd-id128.h"
58 #include "dev-setup.h"
59 #include "fdset.h"
60 #include "build.h"
61 #include "fileio.h"
62
63 #ifndef TTY_GID
64 #define TTY_GID 5
65 #endif
66
/* Modes accepted by --link-journal=, controlling how setup_journal()
 * connects the container's journal directory with the host's:
 *
 *   LINK_NO    - leave the journal alone
 *   LINK_AUTO  - default; link only if the prerequisites already exist
 *   LINK_HOST  - journal lives on the host and is bind mounted into the guest
 *   LINK_GUEST - journal lives in the guest, the host gets a symlink to it
 */
enum LinkJournal {
        LINK_NO,
        LINK_AUTO,
        LINK_HOST,
        LINK_GUEST
};

typedef enum LinkJournal LinkJournal;
73
74 static char *arg_directory = NULL;
75 static char *arg_user = NULL;
76 static char **arg_controllers = NULL;
77 static char *arg_uuid = NULL;
78 static char *arg_machine = NULL;
79 static bool arg_private_network = false;
80 static bool arg_read_only = false;
81 static bool arg_boot = false;
82 static LinkJournal arg_link_journal = LINK_AUTO;
83 static uint64_t arg_retain =
84         (1ULL << CAP_CHOWN) |
85         (1ULL << CAP_DAC_OVERRIDE) |
86         (1ULL << CAP_DAC_READ_SEARCH) |
87         (1ULL << CAP_FOWNER) |
88         (1ULL << CAP_FSETID) |
89         (1ULL << CAP_IPC_OWNER) |
90         (1ULL << CAP_KILL) |
91         (1ULL << CAP_LEASE) |
92         (1ULL << CAP_LINUX_IMMUTABLE) |
93         (1ULL << CAP_NET_BIND_SERVICE) |
94         (1ULL << CAP_NET_BROADCAST) |
95         (1ULL << CAP_NET_RAW) |
96         (1ULL << CAP_SETGID) |
97         (1ULL << CAP_SETFCAP) |
98         (1ULL << CAP_SETPCAP) |
99         (1ULL << CAP_SETUID) |
100         (1ULL << CAP_SYS_ADMIN) |
101         (1ULL << CAP_SYS_CHROOT) |
102         (1ULL << CAP_SYS_NICE) |
103         (1ULL << CAP_SYS_PTRACE) |
104         (1ULL << CAP_SYS_TTY_CONFIG) |
105         (1ULL << CAP_SYS_RESOURCE) |
106         (1ULL << CAP_SYS_BOOT) |
107         (1ULL << CAP_AUDIT_WRITE) |
108         (1ULL << CAP_AUDIT_CONTROL);
109 static char **arg_bind = NULL;
110 static char **arg_bind_ro = NULL;
111
112 static int help(void) {
113
114         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
115                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
116                "  -h --help                Show this help\n"
117                "     --version             Print version string\n"
118                "  -D --directory=NAME      Root directory for the container\n"
119                "  -b --boot                Boot up full system (i.e. invoke init)\n"
120                "  -u --user=USER           Run the command under specified user or uid\n"
121                "  -C --controllers=LIST    Put the container in specified comma-separated\n"
122                "                           cgroup hierarchies\n"
123                "     --uuid=UUID           Set a specific machine UUID for the container\n"
124                "  -M --machine=NAME        Set the machine name for the container\n"
125                "     --private-network     Disable network in container\n"
126                "     --read-only           Mount the root directory read-only\n"
127                "     --capability=CAP      In addition to the default, retain specified\n"
128                "                           capability\n"
129                "     --link-journal=MODE   Link up guest journal, one of no, auto, guest, host\n"
130                "  -j                       Equivalent to --link-journal=host\n"
131                "     --bind=PATH[:PATH]    Bind mount a file or directory from the host into\n"
132                "                           the container\n"
133                "     --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
134                program_invocation_short_name);
135
136         return 0;
137 }
138
139 static int parse_argv(int argc, char *argv[]) {
140
141         enum {
142                 ARG_VERSION = 0x100,
143                 ARG_PRIVATE_NETWORK,
144                 ARG_UUID,
145                 ARG_READ_ONLY,
146                 ARG_CAPABILITY,
147                 ARG_LINK_JOURNAL,
148                 ARG_BIND,
149                 ARG_BIND_RO
150         };
151
152         static const struct option options[] = {
153                 { "help",            no_argument,       NULL, 'h'                 },
154                 { "version",         no_argument,       NULL, ARG_VERSION         },
155                 { "directory",       required_argument, NULL, 'D'                 },
156                 { "user",            required_argument, NULL, 'u'                 },
157                 { "controllers",     required_argument, NULL, 'C'                 },
158                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
159                 { "boot",            no_argument,       NULL, 'b'                 },
160                 { "uuid",            required_argument, NULL, ARG_UUID            },
161                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
162                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
163                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
164                 { "bind",            required_argument, NULL, ARG_BIND            },
165                 { "bind-ro",         required_argument, NULL, ARG_BIND_RO         },
166                 { "machine",         required_argument, NULL, 'M'                 },
167                 { NULL,              0,                 NULL, 0                   }
168         };
169
170         int c;
171
172         assert(argc >= 0);
173         assert(argv);
174
175         while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
176
177                 switch (c) {
178
179                 case 'h':
180                         help();
181                         return 0;
182
183                 case ARG_VERSION:
184                         puts(PACKAGE_STRING);
185                         puts(SYSTEMD_FEATURES);
186                         return 0;
187
188                 case 'D':
189                         free(arg_directory);
190                         arg_directory = canonicalize_file_name(optarg);
191                         if (!arg_directory) {
192                                 log_error("Failed to canonicalize root directory.");
193                                 return -ENOMEM;
194                         }
195
196                         break;
197
198                 case 'u':
199                         free(arg_user);
200                         arg_user = strdup(optarg);
201                         if (!arg_user)
202                                 return log_oom();
203
204                         break;
205
206                 case 'C':
207                         strv_free(arg_controllers);
208                         arg_controllers = strv_split(optarg, ",");
209                         if (!arg_controllers)
210                                 return log_oom();
211
212                         cg_shorten_controllers(arg_controllers);
213                         break;
214
215                 case ARG_PRIVATE_NETWORK:
216                         arg_private_network = true;
217                         break;
218
219                 case 'b':
220                         arg_boot = true;
221                         break;
222
223                 case ARG_UUID:
224                         arg_uuid = optarg;
225                         break;
226
227                 case 'M':
228                         if (!hostname_is_valid(optarg)) {
229                                 log_error("Invalid machine name: %s", optarg);
230                                 return -EINVAL;
231                         }
232
233                         free(arg_machine);
234                         arg_machine = strdup(optarg);
235                         if (!arg_machine)
236                                 return log_oom();
237
238                         break;
239
240                 case ARG_READ_ONLY:
241                         arg_read_only = true;
242                         break;
243
244                 case ARG_CAPABILITY: {
245                         char *state, *word;
246                         size_t length;
247
248                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
249                                 cap_value_t cap;
250                                 char *t;
251
252                                 t = strndup(word, length);
253                                 if (!t)
254                                         return log_oom();
255
256                                 if (cap_from_name(t, &cap) < 0) {
257                                         log_error("Failed to parse capability %s.", t);
258                                         free(t);
259                                         return -EINVAL;
260                                 }
261
262                                 free(t);
263                                 arg_retain |= 1ULL << (uint64_t) cap;
264                         }
265
266                         break;
267                 }
268
269                 case 'j':
270                         arg_link_journal = LINK_GUEST;
271                         break;
272
273                 case ARG_LINK_JOURNAL:
274                         if (streq(optarg, "auto"))
275                                 arg_link_journal = LINK_AUTO;
276                         else if (streq(optarg, "no"))
277                                 arg_link_journal = LINK_NO;
278                         else if (streq(optarg, "guest"))
279                                 arg_link_journal = LINK_GUEST;
280                         else if (streq(optarg, "host"))
281                                 arg_link_journal = LINK_HOST;
282                         else {
283                                 log_error("Failed to parse link journal mode %s", optarg);
284                                 return -EINVAL;
285                         }
286
287                         break;
288
289                 case ARG_BIND:
290                 case ARG_BIND_RO: {
291                         _cleanup_free_ char *a = NULL, *b = NULL;
292                         char *e;
293                         char ***x;
294                         int r;
295
296                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
297
298                         e = strchr(optarg, ':');
299                         if (e) {
300                                 a = strndup(optarg, e - optarg);
301                                 b = strdup(e + 1);
302                         } else {
303                                 a = strdup(optarg);
304                                 b = strdup(optarg);
305                         }
306
307                         if (!a || !b)
308                                 return log_oom();
309
310                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
311                                 log_error("Invalid bind mount specification: %s", optarg);
312                                 return -EINVAL;
313                         }
314
315                         r = strv_extend(x, a);
316                         if (r < 0)
317                                 return r;
318
319                         r = strv_extend(x, b);
320                         if (r < 0)
321                                 return r;
322
323                         break;
324                 }
325
326                 case '?':
327                         return -EINVAL;
328
329                 default:
330                         log_error("Unknown option code %c", c);
331                         return -EINVAL;
332                 }
333         }
334
335         return 1;
336 }
337
338 static int mount_all(const char *dest) {
339
340         typedef struct MountPoint {
341                 const char *what;
342                 const char *where;
343                 const char *type;
344                 const char *options;
345                 unsigned long flags;
346                 bool fatal;
347         } MountPoint;
348
349         static const MountPoint mount_table[] = {
350                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
351                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
352                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
353                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
354                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
355                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
356                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
357                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
358 #ifdef HAVE_SELINUX
359                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
360                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
361 #endif
362         };
363
364         unsigned k;
365         int r = 0;
366
367         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
368                 _cleanup_free_ char *where = NULL;
369                 int t;
370
371                 where = strjoin(dest, "/", mount_table[k].where, NULL);
372                 if (!where)
373                         return log_oom();
374
375                 t = path_is_mount_point(where, true);
376                 if (t < 0) {
377                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
378
379                         if (r == 0)
380                                 r = t;
381
382                         continue;
383                 }
384
385                 /* Skip this entry if it is not a remount. */
386                 if (mount_table[k].what && t > 0)
387                         continue;
388
389                 mkdir_p(where, 0755);
390
391                 if (mount(mount_table[k].what,
392                           where,
393                           mount_table[k].type,
394                           mount_table[k].flags,
395                           mount_table[k].options) < 0 &&
396                     mount_table[k].fatal) {
397
398                         log_error("mount(%s) failed: %m", where);
399
400                         if (r == 0)
401                                 r = -errno;
402                 }
403         }
404
405         return r;
406 }
407
408 static int mount_binds(const char *dest, char **l, unsigned long flags) {
409         char **x, **y;
410
411         STRV_FOREACH_PAIR(x, y, l) {
412                 _cleanup_free_ char *where = NULL;
413
414                 where = strjoin(dest, "/", *y, NULL);
415                 if (!where)
416                         return log_oom();
417
418                 mkdir_p_label(where, 0755);
419
420                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
421                         log_error("mount(%s) failed: %m", where);
422                         return -errno;
423                 }
424
425                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
426                         log_error("mount(%s) failed: %m", where);
427                         return -errno;
428                 }
429         }
430
431         return 0;
432 }
433
434 static int setup_timezone(const char *dest) {
435         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
436         char *z, *y;
437         int r;
438
439         assert(dest);
440
441         /* Fix the timezone, if possible */
442         r = readlink_malloc("/etc/localtime", &p);
443         if (r < 0) {
444                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
445                 return 0;
446         }
447
448         z = path_startswith(p, "../usr/share/zoneinfo/");
449         if (!z)
450                 z = path_startswith(p, "/usr/share/zoneinfo/");
451         if (!z) {
452                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
453                 return 0;
454         }
455
456         where = strappend(dest, "/etc/localtime");
457         if (!where)
458                 return log_oom();
459
460         r = readlink_malloc(where, &q);
461         if (r >= 0) {
462                 y = path_startswith(q, "../usr/share/zoneinfo/");
463                 if (!y)
464                         y = path_startswith(q, "/usr/share/zoneinfo/");
465
466
467                 /* Already pointing to the right place? Then do nothing .. */
468                 if (y && streq(y, z))
469                         return 0;
470         }
471
472         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
473         if (!check)
474                 return log_oom();
475
476         if (access(check, F_OK) < 0) {
477                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
478                 return 0;
479         }
480
481         what = strappend("../usr/share/zoneinfo/", z);
482         if (!what)
483                 return log_oom();
484
485         unlink(where);
486         if (symlink(what, where) < 0) {
487                 log_error("Failed to correct timezone of container: %m");
488                 return 0;
489         }
490
491         return 0;
492 }
493
494 static int setup_resolv_conf(const char *dest) {
495         char *where;
496
497         assert(dest);
498
499         if (arg_private_network)
500                 return 0;
501
502         /* Fix resolv.conf, if possible */
503         where = strappend(dest, "/etc/resolv.conf");
504         if (!where)
505                 return log_oom();
506
507         /* We don't really care for the results of this really. If it
508          * fails, it fails, but meh... */
509         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
510                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
511
512         free(where);
513
514         return 0;
515 }
516
517 static int setup_boot_id(const char *dest) {
518         _cleanup_free_ char *from = NULL, *to = NULL;
519         sd_id128_t rnd;
520         char as_uuid[37];
521         int r;
522
523         assert(dest);
524
525         /* Generate a new randomized boot ID, so that each boot-up of
526          * the container gets a new one */
527
528         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
529         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
530         if (!from || !to)
531                 return log_oom();
532
533         r = sd_id128_randomize(&rnd);
534         if (r < 0) {
535                 log_error("Failed to generate random boot id: %s", strerror(-r));
536                 return r;
537         }
538
539         snprintf(as_uuid, sizeof(as_uuid),
540                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
541                  SD_ID128_FORMAT_VAL(rnd));
542         char_array_0(as_uuid);
543
544         r = write_string_file(from, as_uuid);
545         if (r < 0) {
546                 log_error("Failed to write boot id: %s", strerror(-r));
547                 return r;
548         }
549
550         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
551                 log_error("Failed to bind mount boot id: %m");
552                 r = -errno;
553         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
554                 log_warning("Failed to make boot id read-only: %m");
555
556         unlink(from);
557         return r;
558 }
559
560 static int copy_devnodes(const char *dest) {
561
562         static const char devnodes[] =
563                 "null\0"
564                 "zero\0"
565                 "full\0"
566                 "random\0"
567                 "urandom\0"
568                 "tty\0";
569
570         const char *d;
571         int r = 0;
572         _cleanup_umask_ mode_t u;
573
574         assert(dest);
575
576         u = umask(0000);
577
578         NULSTR_FOREACH(d, devnodes) {
579                 struct stat st;
580                 _cleanup_free_ char *from = NULL, *to = NULL;
581
582                 asprintf(&from, "/dev/%s", d);
583                 asprintf(&to, "%s/dev/%s", dest, d);
584
585                 if (!from || !to) {
586                         log_oom();
587
588                         if (r == 0)
589                                 r = -ENOMEM;
590
591                         break;
592                 }
593
594                 if (stat(from, &st) < 0) {
595
596                         if (errno != ENOENT) {
597                                 log_error("Failed to stat %s: %m", from);
598                                 if (r == 0)
599                                         r = -errno;
600                         }
601
602                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
603
604                         log_error("%s is not a char or block device, cannot copy", from);
605                         if (r == 0)
606                                 r = -EIO;
607
608                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
609
610                         log_error("mknod(%s) failed: %m", dest);
611                         if (r == 0)
612                                 r = -errno;
613                 }
614         }
615
616         return r;
617 }
618
619 static int setup_ptmx(const char *dest) {
620         _cleanup_free_ char *p = NULL;
621
622         p = strappend(dest, "/dev/ptmx");
623         if (!p)
624                 return log_oom();
625
626         if (symlink("pts/ptmx", p) < 0) {
627                 log_error("Failed to create /dev/ptmx symlink: %m");
628                 return -errno;
629         }
630
631         return 0;
632 }
633
634 static int setup_dev_console(const char *dest, const char *console) {
635         struct stat st;
636         _cleanup_free_ char *to = NULL;
637         int r;
638         _cleanup_umask_ mode_t u;
639
640         assert(dest);
641         assert(console);
642
643         u = umask(0000);
644
645         if (stat(console, &st) < 0) {
646                 log_error("Failed to stat %s: %m", console);
647                 return -errno;
648
649         } else if (!S_ISCHR(st.st_mode)) {
650                 log_error("/dev/console is not a char device");
651                 return -EIO;
652         }
653
654         r = chmod_and_chown(console, 0600, 0, 0);
655         if (r < 0) {
656                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
657                 return r;
658         }
659
660         if (asprintf(&to, "%s/dev/console", dest) < 0)
661                 return log_oom();
662
663         /* We need to bind mount the right tty to /dev/console since
664          * ptys can only exist on pts file systems. To have something
665          * to bind mount things on we create a device node first, that
666          * has the right major/minor (note that the major minor
667          * doesn't actually matter here, since we mount it over
668          * anyway). */
669
670         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
671                 log_error("mknod() for /dev/console failed: %m");
672                 return -errno;
673         }
674
675         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
676                 log_error("Bind mount for /dev/console failed: %m");
677                 return -errno;
678         }
679
680         return 0;
681 }
682
683 static int setup_kmsg(const char *dest, int kmsg_socket) {
684         _cleanup_free_ char *from = NULL, *to = NULL;
685         int r, fd, k;
686         _cleanup_umask_ mode_t u;
687         union {
688                 struct cmsghdr cmsghdr;
689                 uint8_t buf[CMSG_SPACE(sizeof(int))];
690         } control = {};
691         struct msghdr mh = {
692                 .msg_control = &control,
693                 .msg_controllen = sizeof(control),
694         };
695         struct cmsghdr *cmsg;
696
697         assert(dest);
698         assert(kmsg_socket >= 0);
699
700         u = umask(0000);
701
702         /* We create the kmsg FIFO as /dev/kmsg, but immediately
703          * delete it after bind mounting it to /proc/kmsg. While FIFOs
704          * on the reading side behave very similar to /proc/kmsg,
705          * their writing side behaves differently from /dev/kmsg in
706          * that writing blocks when nothing is reading. In order to
707          * avoid any problems with containers deadlocking due to this
708          * we simply make /dev/kmsg unavailable to the container. */
709         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
710             asprintf(&to, "%s/proc/kmsg", dest) < 0)
711                 return log_oom();
712
713         if (mkfifo(from, 0600) < 0) {
714                 log_error("mkfifo() for /dev/kmsg failed: %m");
715                 return -errno;
716         }
717
718         r = chmod_and_chown(from, 0600, 0, 0);
719         if (r < 0) {
720                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
721                 return r;
722         }
723
724         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
725                 log_error("Bind mount for /proc/kmsg failed: %m");
726                 return -errno;
727         }
728
729         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
730         if (fd < 0) {
731                 log_error("Failed to open fifo: %m");
732                 return -errno;
733         }
734
735         cmsg = CMSG_FIRSTHDR(&mh);
736         cmsg->cmsg_level = SOL_SOCKET;
737         cmsg->cmsg_type = SCM_RIGHTS;
738         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
739         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
740
741         mh.msg_controllen = cmsg->cmsg_len;
742
743         /* Store away the fd in the socket, so that it stays open as
744          * long as we run the child */
745         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
746         close_nointr_nofail(fd);
747
748         if (k < 0) {
749                 log_error("Failed to send FIFO fd: %m");
750                 return -errno;
751         }
752
753         /* And now make the FIFO unavailable as /dev/kmsg... */
754         unlink(from);
755         return 0;
756 }
757
758 static int setup_hostname(void) {
759
760         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
761                 return -errno;
762
763         return 0;
764 }
765
766 static int setup_journal(const char *directory) {
767         sd_id128_t machine_id;
768         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
769         char *id;
770         int r;
771
772         if (arg_link_journal == LINK_NO)
773                 return 0;
774
775         p = strappend(directory, "/etc/machine-id");
776         if (!p)
777                 return log_oom();
778
779         r = read_one_line_file(p, &b);
780         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
781                 return 0;
782         else if (r < 0) {
783                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
784                 return r;
785         }
786
787         id = strstrip(b);
788         if (isempty(id) && arg_link_journal == LINK_AUTO)
789                 return 0;
790
791         /* Verify validity */
792         r = sd_id128_from_string(id, &machine_id);
793         if (r < 0) {
794                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
795                 return r;
796         }
797
798         free(p);
799         p = strappend("/var/log/journal/", id);
800         q = strjoin(directory, "/var/log/journal/", id, NULL);
801         if (!p || !q)
802                 return log_oom();
803
804         if (path_is_mount_point(p, false) > 0) {
805                 if (arg_link_journal != LINK_AUTO) {
806                         log_error("%s: already a mount point, refusing to use for journal", p);
807                         return -EEXIST;
808                 }
809
810                 return 0;
811         }
812
813         if (path_is_mount_point(q, false) > 0) {
814                 if (arg_link_journal != LINK_AUTO) {
815                         log_error("%s: already a mount point, refusing to use for journal", q);
816                         return -EEXIST;
817                 }
818
819                 return 0;
820         }
821
822         r = readlink_and_make_absolute(p, &d);
823         if (r >= 0) {
824                 if ((arg_link_journal == LINK_GUEST ||
825                      arg_link_journal == LINK_AUTO) &&
826                     path_equal(d, q)) {
827
828                         r = mkdir_p(q, 0755);
829                         if (r < 0)
830                                 log_warning("failed to create directory %s: %m", q);
831                         return 0;
832                 }
833
834                 if (unlink(p) < 0) {
835                         log_error("Failed to remove symlink %s: %m", p);
836                         return -errno;
837                 }
838         } else if (r == -EINVAL) {
839
840                 if (arg_link_journal == LINK_GUEST &&
841                     rmdir(p) < 0) {
842
843                         if (errno == ENOTDIR) {
844                                 log_error("%s already exists and is neither a symlink nor a directory", p);
845                                 return r;
846                         } else {
847                                 log_error("Failed to remove %s: %m", p);
848                                 return -errno;
849                         }
850                 }
851         } else if (r != -ENOENT) {
852                 log_error("readlink(%s) failed: %m", p);
853                 return r;
854         }
855
856         if (arg_link_journal == LINK_GUEST) {
857
858                 if (symlink(q, p) < 0) {
859                         log_error("Failed to symlink %s to %s: %m", q, p);
860                         return -errno;
861                 }
862
863                 r = mkdir_p(q, 0755);
864                 if (r < 0)
865                         log_warning("failed to create directory %s: %m", q);
866                 return 0;
867         }
868
869         if (arg_link_journal == LINK_HOST) {
870                 r = mkdir_p(p, 0755);
871                 if (r < 0) {
872                         log_error("Failed to create %s: %m", p);
873                         return r;
874                 }
875
876         } else if (access(p, F_OK) < 0)
877                 return 0;
878
879         if (dir_is_empty(q) == 0) {
880                 log_error("%s not empty.", q);
881                 return -ENOTEMPTY;
882         }
883
884         r = mkdir_p(q, 0755);
885         if (r < 0) {
886                 log_error("Failed to create %s: %m", q);
887                 return r;
888         }
889
890         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
891                 log_error("Failed to bind mount journal from host into guest: %m");
892                 return -errno;
893         }
894
895         return 0;
896 }
897
898 static int setup_cgroup(const char *path) {
899         char **c;
900         int r;
901
902         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, path, 1);
903         if (r < 0) {
904                 log_error("Failed to create cgroup: %s", strerror(-r));
905                 return r;
906         }
907
908         STRV_FOREACH(c, arg_controllers) {
909                 r = cg_create_and_attach(*c, path, 1);
910                 if (r < 0)
911                         log_warning("Failed to create cgroup in controller %s: %s", *c, strerror(-r));
912         }
913
914         return 0;
915 }
916
917 static int drop_capabilities(void) {
918         return capability_bounding_set_drop(~arg_retain, false);
919 }
920
921 static int process_pty(int master, pid_t pid, sigset_t *mask) {
922
923         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
924         size_t in_buffer_full = 0, out_buffer_full = 0;
925         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
926         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
927         int ep = -1, signal_fd = -1, r;
928         bool tried_orderly_shutdown = false;
929
930         assert(master >= 0);
931         assert(pid > 0);
932         assert(mask);
933
934         fd_nonblock(STDIN_FILENO, 1);
935         fd_nonblock(STDOUT_FILENO, 1);
936         fd_nonblock(master, 1);
937
938         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
939         if (signal_fd < 0) {
940                 log_error("signalfd(): %m");
941                 r = -errno;
942                 goto finish;
943         }
944
945         ep = epoll_create1(EPOLL_CLOEXEC);
946         if (ep < 0) {
947                 log_error("Failed to create epoll: %m");
948                 r = -errno;
949                 goto finish;
950         }
951
952         /* We read from STDIN only if this is actually a TTY,
953          * otherwise we assume non-interactivity. */
954         if (isatty(STDIN_FILENO)) {
955                 zero(stdin_ev);
956                 stdin_ev.events = EPOLLIN|EPOLLET;
957                 stdin_ev.data.fd = STDIN_FILENO;
958
959                 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
960                         log_error("Failed to register STDIN in epoll: %m");
961                         r = -errno;
962                         goto finish;
963                 }
964         }
965
966         zero(stdout_ev);
967         stdout_ev.events = EPOLLOUT|EPOLLET;
968         stdout_ev.data.fd = STDOUT_FILENO;
969
970         zero(master_ev);
971         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
972         master_ev.data.fd = master;
973
974         zero(signal_ev);
975         signal_ev.events = EPOLLIN;
976         signal_ev.data.fd = signal_fd;
977
978         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
979                 if (errno != EPERM) {
980                         log_error("Failed to register stdout in epoll: %m");
981                         r = -errno;
982                         goto finish;
983                 }
984                 /* stdout without epoll support. Likely redirected to regular file. */
985                 stdout_writable = true;
986         }
987
988         if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
989             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
990                 log_error("Failed to register fds in epoll: %m");
991                 r = -errno;
992                 goto finish;
993         }
994
995         for (;;) {
996                 struct epoll_event ev[16];
997                 ssize_t k;
998                 int i, nfds;
999
1000                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1001                 if (nfds < 0) {
1002
1003                         if (errno == EINTR || errno == EAGAIN)
1004                                 continue;
1005
1006                         log_error("epoll_wait(): %m");
1007                         r = -errno;
1008                         goto finish;
1009                 }
1010
1011                 assert(nfds >= 1);
1012
1013                 for (i = 0; i < nfds; i++) {
1014                         if (ev[i].data.fd == STDIN_FILENO) {
1015
1016                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1017                                         stdin_readable = true;
1018
1019                         } else if (ev[i].data.fd == STDOUT_FILENO) {
1020
1021                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1022                                         stdout_writable = true;
1023
1024                         } else if (ev[i].data.fd == master) {
1025
1026                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1027                                         master_readable = true;
1028
1029                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1030                                         master_writable = true;
1031
1032                         } else if (ev[i].data.fd == signal_fd) {
1033                                 struct signalfd_siginfo sfsi;
1034                                 ssize_t n;
1035
1036                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
1037                                 if (n != sizeof(sfsi)) {
1038
1039                                         if (n >= 0) {
1040                                                 log_error("Failed to read from signalfd: invalid block size");
1041                                                 r = -EIO;
1042                                                 goto finish;
1043                                         }
1044
1045                                         if (errno != EINTR && errno != EAGAIN) {
1046                                                 log_error("Failed to read from signalfd: %m");
1047                                                 r = -errno;
1048                                                 goto finish;
1049                                         }
1050                                 } else {
1051
1052                                         if (sfsi.ssi_signo == SIGWINCH) {
1053                                                 struct winsize ws;
1054
1055                                                 /* The window size changed, let's forward that. */
1056                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1057                                                         ioctl(master, TIOCSWINSZ, &ws);
1058                                         } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1059
1060                                                 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1061
1062                                                 /* This only works for systemd... */
1063                                                 tried_orderly_shutdown = true;
1064                                                 kill(pid, SIGRTMIN+3);
1065
1066                                         } else {
1067                                                 r = 0;
1068                                                 goto finish;
1069                                         }
1070                                 }
1071                         }
1072                 }
1073
1074                 while ((stdin_readable && in_buffer_full <= 0) ||
1075                        (master_writable && in_buffer_full > 0) ||
1076                        (master_readable && out_buffer_full <= 0) ||
1077                        (stdout_writable && out_buffer_full > 0)) {
1078
1079                         if (stdin_readable && in_buffer_full < LINE_MAX) {
1080
1081                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1082                                 if (k < 0) {
1083
1084                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1085                                                 stdin_readable = false;
1086                                         else {
1087                                                 log_error("read(): %m");
1088                                                 r = -errno;
1089                                                 goto finish;
1090                                         }
1091                                 } else
1092                                         in_buffer_full += (size_t) k;
1093                         }
1094
1095                         if (master_writable && in_buffer_full > 0) {
1096
1097                                 k = write(master, in_buffer, in_buffer_full);
1098                                 if (k < 0) {
1099
1100                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1101                                                 master_writable = false;
1102                                         else {
1103                                                 log_error("write(): %m");
1104                                                 r = -errno;
1105                                                 goto finish;
1106                                         }
1107
1108                                 } else {
1109                                         assert(in_buffer_full >= (size_t) k);
1110                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1111                                         in_buffer_full -= k;
1112                                 }
1113                         }
1114
1115                         if (master_readable && out_buffer_full < LINE_MAX) {
1116
1117                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1118                                 if (k < 0) {
1119
1120                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1121                                                 master_readable = false;
1122                                         else {
1123                                                 log_error("read(): %m");
1124                                                 r = -errno;
1125                                                 goto finish;
1126                                         }
1127                                 }  else
1128                                         out_buffer_full += (size_t) k;
1129                         }
1130
1131                         if (stdout_writable && out_buffer_full > 0) {
1132
1133                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1134                                 if (k < 0) {
1135
1136                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1137                                                 stdout_writable = false;
1138                                         else {
1139                                                 log_error("write(): %m");
1140                                                 r = -errno;
1141                                                 goto finish;
1142                                         }
1143
1144                                 } else {
1145                                         assert(out_buffer_full >= (size_t) k);
1146                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1147                                         out_buffer_full -= k;
1148                                 }
1149                         }
1150                 }
1151         }
1152
1153 finish:
1154         if (ep >= 0)
1155                 close_nointr_nofail(ep);
1156
1157         if (signal_fd >= 0)
1158                 close_nointr_nofail(signal_fd);
1159
1160         return r;
1161 }
1162
1163 int main(int argc, char *argv[]) {
1164         pid_t pid = 0;
1165         int r = EXIT_FAILURE, k;
1166         _cleanup_free_ char *machine_root = NULL, *newcg = NULL;
1167         _cleanup_close_ int master = -1;
1168         int n_fd_passed;
1169         const char *console = NULL;
1170         struct termios saved_attr, raw_attr;
1171         sigset_t mask;
1172         bool saved_attr_valid = false;
1173         struct winsize ws;
1174         int kmsg_socket_pair[2] = { -1, -1 };
1175         FDSet *fds = NULL;
1176
1177         log_parse_environment();
1178         log_open();
1179
1180         r = parse_argv(argc, argv);
1181         if (r <= 0)
1182                 goto finish;
1183
1184         if (arg_directory) {
1185                 char *p;
1186
1187                 p = path_make_absolute_cwd(arg_directory);
1188                 free(arg_directory);
1189                 arg_directory = p;
1190         } else
1191                 arg_directory = get_current_dir_name();
1192
1193         if (!arg_directory) {
1194                 log_error("Failed to determine path");
1195                 goto finish;
1196         }
1197
1198         path_kill_slashes(arg_directory);
1199
1200         if (!arg_machine) {
1201                 arg_machine = strdup(path_get_file_name(arg_directory));
1202                 if (!arg_machine) {
1203                         log_oom();
1204                         goto finish;
1205                 }
1206
1207                 hostname_cleanup(arg_machine);
1208                 if (isempty(arg_machine)) {
1209                         log_error("Failed to determine machine name automatically, please use -M.");
1210                         goto finish;
1211                 }
1212         }
1213
1214         if (geteuid() != 0) {
1215                 log_error("Need to be root.");
1216                 goto finish;
1217         }
1218
1219         if (sd_booted() <= 0) {
1220                 log_error("Not running on a systemd system.");
1221                 goto finish;
1222         }
1223
1224         if (path_equal(arg_directory, "/")) {
1225                 log_error("Spawning container on root directory not supported.");
1226                 goto finish;
1227         }
1228
1229         if (path_is_os_tree(arg_directory) <= 0) {
1230                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1231                 goto finish;
1232         }
1233
1234         log_close();
1235         n_fd_passed = sd_listen_fds(false);
1236         if (n_fd_passed > 0) {
1237                 k = fdset_new_listen_fds(&fds, false);
1238                 if (k < 0) {
1239                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1240                         goto finish;
1241                 }
1242         }
1243         fdset_close_others(fds);
1244         log_open();
1245
1246         k = cg_get_machine_path(&machine_root);
1247         if (k < 0) {
1248                 log_error("Failed to determine machine cgroup path: %s", strerror(-k));
1249                 goto finish;
1250         }
1251
1252         newcg = strjoin(machine_root, "/", arg_machine, NULL);
1253         if (!newcg) {
1254                 log_error("Failed to allocate cgroup path.");
1255                 goto finish;
1256         }
1257
1258         r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, newcg, false);
1259         if (r <= 0 && r != -ENOENT) {
1260                 log_error("Container already running.");
1261
1262                 free(newcg);
1263                 newcg = NULL;
1264
1265                 goto finish;
1266         }
1267
1268         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1269         if (master < 0) {
1270                 log_error("Failed to acquire pseudo tty: %m");
1271                 goto finish;
1272         }
1273
1274         console = ptsname(master);
1275         if (!console) {
1276                 log_error("Failed to determine tty name: %m");
1277                 goto finish;
1278         }
1279
1280         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1281
1282         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1283                 ioctl(master, TIOCSWINSZ, &ws);
1284
1285         if (unlockpt(master) < 0) {
1286                 log_error("Failed to unlock tty: %m");
1287                 goto finish;
1288         }
1289
1290         if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1291                 saved_attr_valid = true;
1292
1293                 raw_attr = saved_attr;
1294                 cfmakeraw(&raw_attr);
1295                 raw_attr.c_lflag &= ~ECHO;
1296         }
1297
1298         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1299                 log_error("Failed to create kmsg socket pair.");
1300                 goto finish;
1301         }
1302
1303         assert_se(sigemptyset(&mask) == 0);
1304         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1305         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1306
1307         for (;;) {
1308                 siginfo_t status;
1309                 int pipefd[2];
1310
1311                 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1312                         log_error("pipe2(): %m");
1313                         goto finish;
1314                 }
1315
1316                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1317                 if (pid < 0) {
1318                         if (errno == EINVAL)
1319                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1320                         else
1321                                 log_error("clone() failed: %m");
1322
1323                         goto finish;
1324                 }
1325
1326                 if (pid == 0) {
1327                         /* child */
1328                         const char *home = NULL;
1329                         uid_t uid = (uid_t) -1;
1330                         gid_t gid = (gid_t) -1;
1331                         unsigned n_env = 2;
1332                         const char *envp[] = {
1333                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1334                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1335                                 NULL, /* TERM */
1336                                 NULL, /* HOME */
1337                                 NULL, /* USER */
1338                                 NULL, /* LOGNAME */
1339                                 NULL, /* container_uuid */
1340                                 NULL, /* LISTEN_FDS */
1341                                 NULL, /* LISTEN_PID */
1342                                 NULL
1343                         };
1344
1345                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1346                         if (envp[n_env])
1347                                 n_env ++;
1348
1349                         close_nointr_nofail(pipefd[1]);
1350                         fd_wait_for_event(pipefd[0], POLLHUP, -1);
1351                         close_nointr_nofail(pipefd[0]);
1352
1353                         close_nointr_nofail(master);
1354                         master = -1;
1355
1356                         if (saved_attr_valid) {
1357                                 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1358                                         log_error("Failed to set terminal attributes: %m");
1359                                         goto child_fail;
1360                                 }
1361                         }
1362
1363                         close_nointr(STDIN_FILENO);
1364                         close_nointr(STDOUT_FILENO);
1365                         close_nointr(STDERR_FILENO);
1366
1367                         close_nointr_nofail(kmsg_socket_pair[0]);
1368                         kmsg_socket_pair[0] = -1;
1369
1370                         reset_all_signal_handlers();
1371
1372                         assert_se(sigemptyset(&mask) == 0);
1373                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1374
1375                         k = open_terminal(console, O_RDWR);
1376                         if (k != STDIN_FILENO) {
1377                                 if (k >= 0) {
1378                                         close_nointr_nofail(k);
1379                                         k = -EINVAL;
1380                                 }
1381
1382                                 log_error("Failed to open console: %s", strerror(-k));
1383                                 goto child_fail;
1384                         }
1385
1386                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1387                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1388                                 log_error("Failed to duplicate console: %m");
1389                                 goto child_fail;
1390                         }
1391
1392                         if (setsid() < 0) {
1393                                 log_error("setsid() failed: %m");
1394                                 goto child_fail;
1395                         }
1396
1397                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1398                                 log_error("PR_SET_PDEATHSIG failed: %m");
1399                                 goto child_fail;
1400                         }
1401
1402                         if (setup_cgroup(newcg) < 0)
1403                                 goto child_fail;
1404
1405                         /* Mark everything as slave, so that we still
1406                          * receive mounts from the real root, but don't
1407                          * propagate mounts to the real root. */
1408                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1409                                 log_error("MS_SLAVE|MS_REC failed: %m");
1410                                 goto child_fail;
1411                         }
1412
1413                         /* Turn directory into bind mount */
1414                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1415                                 log_error("Failed to make bind mount.");
1416                                 goto child_fail;
1417                         }
1418
1419                         if (arg_read_only)
1420                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1421                                         log_error("Failed to make read-only.");
1422                                         goto child_fail;
1423                                 }
1424
1425                         if (mount_all(arg_directory) < 0)
1426                                 goto child_fail;
1427
1428                         if (copy_devnodes(arg_directory) < 0)
1429                                 goto child_fail;
1430
1431                         if (setup_ptmx(arg_directory) < 0)
1432                                 goto child_fail;
1433
1434                         dev_setup(arg_directory);
1435
1436                         if (setup_dev_console(arg_directory, console) < 0)
1437                                 goto child_fail;
1438
1439                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1440                                 goto child_fail;
1441
1442                         close_nointr_nofail(kmsg_socket_pair[1]);
1443                         kmsg_socket_pair[1] = -1;
1444
1445                         if (setup_boot_id(arg_directory) < 0)
1446                                 goto child_fail;
1447
1448                         if (setup_timezone(arg_directory) < 0)
1449                                 goto child_fail;
1450
1451                         if (setup_resolv_conf(arg_directory) < 0)
1452                                 goto child_fail;
1453
1454                         if (setup_journal(arg_directory) < 0)
1455                                 goto child_fail;
1456
1457                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1458                                 goto child_fail;
1459
1460                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1461                                 goto child_fail;
1462
1463                         if (chdir(arg_directory) < 0) {
1464                                 log_error("chdir(%s) failed: %m", arg_directory);
1465                                 goto child_fail;
1466                         }
1467
1468                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1469                                 log_error("mount(MS_MOVE) failed: %m");
1470                                 goto child_fail;
1471                         }
1472
1473                         if (chroot(".") < 0) {
1474                                 log_error("chroot() failed: %m");
1475                                 goto child_fail;
1476                         }
1477
1478                         if (chdir("/") < 0) {
1479                                 log_error("chdir() failed: %m");
1480                                 goto child_fail;
1481                         }
1482
1483                         umask(0022);
1484
1485                         loopback_setup();
1486
1487                         if (drop_capabilities() < 0) {
1488                                 log_error("drop_capabilities() failed: %m");
1489                                 goto child_fail;
1490                         }
1491
1492                         if (arg_user) {
1493
1494                                 /* Note that this resolves user names
1495                                  * inside the container, and hence
1496                                  * accesses the NSS modules from the
1497                                  * container and not the host. This is
1498                                  * a bit weird... */
1499
1500                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1501                                         log_error("get_user_creds() failed: %m");
1502                                         goto child_fail;
1503                                 }
1504
1505                                 if (mkdir_parents_label(home, 0775) < 0) {
1506                                         log_error("mkdir_parents_label() failed: %m");
1507                                         goto child_fail;
1508                                 }
1509
1510                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1511                                         log_error("mkdir_safe_label() failed: %m");
1512                                         goto child_fail;
1513                                 }
1514
1515                                 if (initgroups((const char*)arg_user, gid) < 0) {
1516                                         log_error("initgroups() failed: %m");
1517                                         goto child_fail;
1518                                 }
1519
1520                                 if (setresgid(gid, gid, gid) < 0) {
1521                                         log_error("setregid() failed: %m");
1522                                         goto child_fail;
1523                                 }
1524
1525                                 if (setresuid(uid, uid, uid) < 0) {
1526                                         log_error("setreuid() failed: %m");
1527                                         goto child_fail;
1528                                 }
1529                         } else {
1530                                 /* Reset everything fully to 0, just in case */
1531
1532                                 if (setgroups(0, NULL) < 0) {
1533                                         log_error("setgroups() failed: %m");
1534                                         goto child_fail;
1535                                 }
1536
1537                                 if (setresgid(0, 0, 0) < 0) {
1538                                         log_error("setregid() failed: %m");
1539                                         goto child_fail;
1540                                 }
1541
1542                                 if (setresuid(0, 0, 0) < 0) {
1543                                         log_error("setreuid() failed: %m");
1544                                         goto child_fail;
1545                                 }
1546                         }
1547
1548                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1549                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1550                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1551                                 log_oom();
1552                                 goto child_fail;
1553                         }
1554
1555                         if (arg_uuid) {
1556                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1557                                         log_oom();
1558                                         goto child_fail;
1559                                 }
1560                         }
1561
1562                         if (fdset_size(fds) > 0) {
1563                                 k = fdset_cloexec(fds, false);
1564                                 if (k < 0) {
1565                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1566                                         goto child_fail;
1567                                 }
1568
1569                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1570                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) 1) < 0)) {
1571                                         log_oom();
1572                                         goto child_fail;
1573                                 }
1574                         }
1575
1576                         setup_hostname();
1577
1578                         if (arg_boot) {
1579                                 char **a;
1580                                 size_t l;
1581
1582                                 /* Automatically search for the init system */
1583
1584                                 l = 1 + argc - optind;
1585                                 a = newa(char*, l + 1);
1586                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1587
1588                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1589                                 execve(a[0], a, (char**) envp);
1590
1591                                 a[0] = (char*) "/lib/systemd/systemd";
1592                                 execve(a[0], a, (char**) envp);
1593
1594                                 a[0] = (char*) "/sbin/init";
1595                                 execve(a[0], a, (char**) envp);
1596                         } else if (argc > optind)
1597                                 execvpe(argv[optind], argv + optind, (char**) envp);
1598                         else {
1599                                 chdir(home ? home : "/root");
1600                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1601                         }
1602
1603                         log_error("execv() failed: %m");
1604
1605                 child_fail:
1606                         _exit(EXIT_FAILURE);
1607                 }
1608
1609                 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1610                 close_nointr_nofail(pipefd[0]);
1611                 close_nointr_nofail(pipefd[1]);
1612
1613                 fdset_free(fds);
1614                 fds = NULL;
1615
1616                 if (process_pty(master, pid, &mask) < 0)
1617                         goto finish;
1618
1619                 if (saved_attr_valid)
1620                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1621
1622                 r = wait_for_terminate(pid, &status);
1623                 if (r < 0) {
1624                         r = EXIT_FAILURE;
1625                         break;
1626                 }
1627
1628                 if (status.si_code == CLD_EXITED) {
1629                         if (status.si_status != 0) {
1630                                 log_error("Container failed with error code %i.", status.si_status);
1631                                 r = status.si_status;
1632                                 break;
1633                         }
1634
1635                         log_debug("Container exited successfully.");
1636                         break;
1637                 } else if (status.si_code == CLD_KILLED &&
1638                            status.si_status == SIGINT) {
1639                         log_info("Container has been shut down.");
1640                         r = 0;
1641                         break;
1642                 } else if (status.si_code == CLD_KILLED &&
1643                            status.si_status == SIGHUP) {
1644                         log_info("Container is being rebooted.");
1645                         continue;
1646                 } else if (status.si_code == CLD_KILLED ||
1647                            status.si_code == CLD_DUMPED) {
1648
1649                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1650                         r = EXIT_FAILURE;
1651                         break;
1652                 } else {
1653                         log_error("Container failed due to unknown reason.");
1654                         r = EXIT_FAILURE;
1655                         break;
1656                 }
1657         }
1658
1659 finish:
1660         if (saved_attr_valid)
1661                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1662
1663         close_pipe(kmsg_socket_pair);
1664
1665         if (newcg)
1666                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1667
1668         free(arg_directory);
1669         free(arg_machine);
1670         strv_free(arg_controllers);
1671
1672         fdset_free(fds);
1673
1674         return r;
1675 }