chiark / gitweb /
build-sys: support builds without EAs again
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/poll.h>
37 #include <sys/epoll.h>
38 #include <termios.h>
39 #include <sys/signalfd.h>
40 #include <grp.h>
41 #include <linux/fs.h>
42 #include <sys/un.h>
43 #include <sys/socket.h>
44
45 #ifdef HAVE_XATTR
46 #include <attr/xattr.h>
47 #endif
48
49 #include <systemd/sd-daemon.h>
50
51 #include "log.h"
52 #include "util.h"
53 #include "mkdir.h"
54 #include "macro.h"
55 #include "audit.h"
56 #include "missing.h"
57 #include "cgroup-util.h"
58 #include "strv.h"
59 #include "path-util.h"
60 #include "loopback-setup.h"
61 #include "sd-id128.h"
62 #include "dev-setup.h"
63 #include "fdset.h"
64 #include "build.h"
65 #include "fileio.h"
66
67 #ifndef TTY_GID
68 #define TTY_GID 5
69 #endif
70
/* How the journal directory of the container is tied to the host, as
 * selected with --link-journal= (or -j) on the command line. */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* "no": leave the journal alone */
        LINK_AUTO  = 1,  /* "auto": the compiled-in default */
        LINK_HOST  = 2,  /* "host" */
        LINK_GUEST = 3   /* "guest"; also what -j selects */
} LinkJournal;
77
78 static char *arg_directory = NULL;
79 static char *arg_user = NULL;
80 static char **arg_controllers = NULL;
81 static char *arg_uuid = NULL;
82 static char *arg_machine = NULL;
83 static bool arg_private_network = false;
84 static bool arg_read_only = false;
85 static bool arg_boot = false;
86 static LinkJournal arg_link_journal = LINK_AUTO;
87 static uint64_t arg_retain =
88         (1ULL << CAP_CHOWN) |
89         (1ULL << CAP_DAC_OVERRIDE) |
90         (1ULL << CAP_DAC_READ_SEARCH) |
91         (1ULL << CAP_FOWNER) |
92         (1ULL << CAP_FSETID) |
93         (1ULL << CAP_IPC_OWNER) |
94         (1ULL << CAP_KILL) |
95         (1ULL << CAP_LEASE) |
96         (1ULL << CAP_LINUX_IMMUTABLE) |
97         (1ULL << CAP_NET_BIND_SERVICE) |
98         (1ULL << CAP_NET_BROADCAST) |
99         (1ULL << CAP_NET_RAW) |
100         (1ULL << CAP_SETGID) |
101         (1ULL << CAP_SETFCAP) |
102         (1ULL << CAP_SETPCAP) |
103         (1ULL << CAP_SETUID) |
104         (1ULL << CAP_SYS_ADMIN) |
105         (1ULL << CAP_SYS_CHROOT) |
106         (1ULL << CAP_SYS_NICE) |
107         (1ULL << CAP_SYS_PTRACE) |
108         (1ULL << CAP_SYS_TTY_CONFIG) |
109         (1ULL << CAP_SYS_RESOURCE) |
110         (1ULL << CAP_SYS_BOOT) |
111         (1ULL << CAP_AUDIT_WRITE) |
112         (1ULL << CAP_AUDIT_CONTROL);
113 static char **arg_bind = NULL;
114 static char **arg_bind_ro = NULL;
115
116 static int help(void) {
117
118         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
119                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
120                "  -h --help                Show this help\n"
121                "     --version             Print version string\n"
122                "  -D --directory=NAME      Root directory for the container\n"
123                "  -b --boot                Boot up full system (i.e. invoke init)\n"
124                "  -u --user=USER           Run the command under specified user or uid\n"
125                "  -C --controllers=LIST    Put the container in specified comma-separated\n"
126                "                           cgroup hierarchies\n"
127                "     --uuid=UUID           Set a specific machine UUID for the container\n"
128                "  -M --machine=NAME        Set the machine name for the container\n"
129                "     --private-network     Disable network in container\n"
130                "     --read-only           Mount the root directory read-only\n"
131                "     --capability=CAP      In addition to the default, retain specified\n"
132                "                           capability\n"
133                "     --link-journal=MODE   Link up guest journal, one of no, auto, guest, host\n"
134                "  -j                       Equivalent to --link-journal=host\n"
135                "     --bind=PATH[:PATH]    Bind mount a file or directory from the host into\n"
136                "                           the container\n"
137                "     --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
138                program_invocation_short_name);
139
140         return 0;
141 }
142
143 static int parse_argv(int argc, char *argv[]) {
144
145         enum {
146                 ARG_VERSION = 0x100,
147                 ARG_PRIVATE_NETWORK,
148                 ARG_UUID,
149                 ARG_READ_ONLY,
150                 ARG_CAPABILITY,
151                 ARG_LINK_JOURNAL,
152                 ARG_BIND,
153                 ARG_BIND_RO
154         };
155
156         static const struct option options[] = {
157                 { "help",            no_argument,       NULL, 'h'                 },
158                 { "version",         no_argument,       NULL, ARG_VERSION         },
159                 { "directory",       required_argument, NULL, 'D'                 },
160                 { "user",            required_argument, NULL, 'u'                 },
161                 { "controllers",     required_argument, NULL, 'C'                 },
162                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
163                 { "boot",            no_argument,       NULL, 'b'                 },
164                 { "uuid",            required_argument, NULL, ARG_UUID            },
165                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
166                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
167                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
168                 { "bind",            required_argument, NULL, ARG_BIND            },
169                 { "bind-ro",         required_argument, NULL, ARG_BIND_RO         },
170                 { "machine",         required_argument, NULL, 'M'                 },
171                 { NULL,              0,                 NULL, 0                   }
172         };
173
174         int c;
175
176         assert(argc >= 0);
177         assert(argv);
178
179         while ((c = getopt_long(argc, argv, "+hD:u:C:bM:j", options, NULL)) >= 0) {
180
181                 switch (c) {
182
183                 case 'h':
184                         help();
185                         return 0;
186
187                 case ARG_VERSION:
188                         puts(PACKAGE_STRING);
189                         puts(SYSTEMD_FEATURES);
190                         return 0;
191
192                 case 'D':
193                         free(arg_directory);
194                         arg_directory = canonicalize_file_name(optarg);
195                         if (!arg_directory) {
196                                 log_error("Failed to canonicalize root directory.");
197                                 return -ENOMEM;
198                         }
199
200                         break;
201
202                 case 'u':
203                         free(arg_user);
204                         arg_user = strdup(optarg);
205                         if (!arg_user)
206                                 return log_oom();
207
208                         break;
209
210                 case 'C':
211                         strv_free(arg_controllers);
212                         arg_controllers = strv_split(optarg, ",");
213                         if (!arg_controllers)
214                                 return log_oom();
215
216                         cg_shorten_controllers(arg_controllers);
217                         break;
218
219                 case ARG_PRIVATE_NETWORK:
220                         arg_private_network = true;
221                         break;
222
223                 case 'b':
224                         arg_boot = true;
225                         break;
226
227                 case ARG_UUID:
228                         if (!id128_is_valid(optarg)) {
229                                 log_error("Invalid UUID: %s", optarg);
230                                 return -EINVAL;
231                         }
232
233                         arg_uuid = optarg;
234                         break;
235
236                 case 'M':
237                         if (!hostname_is_valid(optarg)) {
238                                 log_error("Invalid machine name: %s", optarg);
239                                 return -EINVAL;
240                         }
241
242                         free(arg_machine);
243                         arg_machine = strdup(optarg);
244                         if (!arg_machine)
245                                 return log_oom();
246
247                         break;
248
249                 case ARG_READ_ONLY:
250                         arg_read_only = true;
251                         break;
252
253                 case ARG_CAPABILITY: {
254                         char *state, *word;
255                         size_t length;
256
257                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
258                                 cap_value_t cap;
259                                 char *t;
260
261                                 t = strndup(word, length);
262                                 if (!t)
263                                         return log_oom();
264
265                                 if (cap_from_name(t, &cap) < 0) {
266                                         log_error("Failed to parse capability %s.", t);
267                                         free(t);
268                                         return -EINVAL;
269                                 }
270
271                                 free(t);
272                                 arg_retain |= 1ULL << (uint64_t) cap;
273                         }
274
275                         break;
276                 }
277
278                 case 'j':
279                         arg_link_journal = LINK_GUEST;
280                         break;
281
282                 case ARG_LINK_JOURNAL:
283                         if (streq(optarg, "auto"))
284                                 arg_link_journal = LINK_AUTO;
285                         else if (streq(optarg, "no"))
286                                 arg_link_journal = LINK_NO;
287                         else if (streq(optarg, "guest"))
288                                 arg_link_journal = LINK_GUEST;
289                         else if (streq(optarg, "host"))
290                                 arg_link_journal = LINK_HOST;
291                         else {
292                                 log_error("Failed to parse link journal mode %s", optarg);
293                                 return -EINVAL;
294                         }
295
296                         break;
297
298                 case ARG_BIND:
299                 case ARG_BIND_RO: {
300                         _cleanup_free_ char *a = NULL, *b = NULL;
301                         char *e;
302                         char ***x;
303                         int r;
304
305                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
306
307                         e = strchr(optarg, ':');
308                         if (e) {
309                                 a = strndup(optarg, e - optarg);
310                                 b = strdup(e + 1);
311                         } else {
312                                 a = strdup(optarg);
313                                 b = strdup(optarg);
314                         }
315
316                         if (!a || !b)
317                                 return log_oom();
318
319                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
320                                 log_error("Invalid bind mount specification: %s", optarg);
321                                 return -EINVAL;
322                         }
323
324                         r = strv_extend(x, a);
325                         if (r < 0)
326                                 return r;
327
328                         r = strv_extend(x, b);
329                         if (r < 0)
330                                 return r;
331
332                         break;
333                 }
334
335                 case '?':
336                         return -EINVAL;
337
338                 default:
339                         log_error("Unknown option code %c", c);
340                         return -EINVAL;
341                 }
342         }
343
344         return 1;
345 }
346
347 static int mount_all(const char *dest) {
348
349         typedef struct MountPoint {
350                 const char *what;
351                 const char *where;
352                 const char *type;
353                 const char *options;
354                 unsigned long flags;
355                 bool fatal;
356         } MountPoint;
357
358         static const MountPoint mount_table[] = {
359                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
360                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
361                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
362                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
363                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
364                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
365                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
366                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
367 #ifdef HAVE_SELINUX
368                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
369                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
370 #endif
371         };
372
373         unsigned k;
374         int r = 0;
375
376         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
377                 _cleanup_free_ char *where = NULL;
378                 int t;
379
380                 where = strjoin(dest, "/", mount_table[k].where, NULL);
381                 if (!where)
382                         return log_oom();
383
384                 t = path_is_mount_point(where, true);
385                 if (t < 0) {
386                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
387
388                         if (r == 0)
389                                 r = t;
390
391                         continue;
392                 }
393
394                 /* Skip this entry if it is not a remount. */
395                 if (mount_table[k].what && t > 0)
396                         continue;
397
398                 mkdir_p(where, 0755);
399
400                 if (mount(mount_table[k].what,
401                           where,
402                           mount_table[k].type,
403                           mount_table[k].flags,
404                           mount_table[k].options) < 0 &&
405                     mount_table[k].fatal) {
406
407                         log_error("mount(%s) failed: %m", where);
408
409                         if (r == 0)
410                                 r = -errno;
411                 }
412         }
413
414         return r;
415 }
416
417 static int mount_binds(const char *dest, char **l, unsigned long flags) {
418         char **x, **y;
419
420         STRV_FOREACH_PAIR(x, y, l) {
421                 _cleanup_free_ char *where = NULL;
422
423                 where = strjoin(dest, "/", *y, NULL);
424                 if (!where)
425                         return log_oom();
426
427                 mkdir_p_label(where, 0755);
428
429                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
430                         log_error("mount(%s) failed: %m", where);
431                         return -errno;
432                 }
433
434                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
435                         log_error("mount(%s) failed: %m", where);
436                         return -errno;
437                 }
438         }
439
440         return 0;
441 }
442
443 static int setup_timezone(const char *dest) {
444         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
445         char *z, *y;
446         int r;
447
448         assert(dest);
449
450         /* Fix the timezone, if possible */
451         r = readlink_malloc("/etc/localtime", &p);
452         if (r < 0) {
453                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
454                 return 0;
455         }
456
457         z = path_startswith(p, "../usr/share/zoneinfo/");
458         if (!z)
459                 z = path_startswith(p, "/usr/share/zoneinfo/");
460         if (!z) {
461                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
462                 return 0;
463         }
464
465         where = strappend(dest, "/etc/localtime");
466         if (!where)
467                 return log_oom();
468
469         r = readlink_malloc(where, &q);
470         if (r >= 0) {
471                 y = path_startswith(q, "../usr/share/zoneinfo/");
472                 if (!y)
473                         y = path_startswith(q, "/usr/share/zoneinfo/");
474
475
476                 /* Already pointing to the right place? Then do nothing .. */
477                 if (y && streq(y, z))
478                         return 0;
479         }
480
481         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
482         if (!check)
483                 return log_oom();
484
485         if (access(check, F_OK) < 0) {
486                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
487                 return 0;
488         }
489
490         what = strappend("../usr/share/zoneinfo/", z);
491         if (!what)
492                 return log_oom();
493
494         unlink(where);
495         if (symlink(what, where) < 0) {
496                 log_error("Failed to correct timezone of container: %m");
497                 return 0;
498         }
499
500         return 0;
501 }
502
503 static int setup_resolv_conf(const char *dest) {
504         char _cleanup_free_ *where = NULL;
505         _cleanup_close_ int fd = -1;
506
507         assert(dest);
508
509         if (arg_private_network)
510                 return 0;
511
512         /* Fix resolv.conf, if possible */
513         where = strappend(dest, "/etc/resolv.conf");
514         if (!where)
515                 return log_oom();
516
517         fd = open(where, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0644);
518
519         /* We don't really care for the results of this really. If it
520          * fails, it fails, but meh... */
521         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) < 0)
522                 log_warning("Failed to bind mount /etc/resolv.conf: %m");
523         else
524                 if (mount("/etc/resolv.conf", where, "bind",
525                           MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
526                         log_error("Failed to remount /etc/resolv.conf readonly: %m");
527                         return -errno;
528                 }
529
530         return 0;
531 }
532
533 static int setup_boot_id(const char *dest) {
534         _cleanup_free_ char *from = NULL, *to = NULL;
535         sd_id128_t rnd;
536         char as_uuid[37];
537         int r;
538
539         assert(dest);
540
541         /* Generate a new randomized boot ID, so that each boot-up of
542          * the container gets a new one */
543
544         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
545         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
546         if (!from || !to)
547                 return log_oom();
548
549         r = sd_id128_randomize(&rnd);
550         if (r < 0) {
551                 log_error("Failed to generate random boot id: %s", strerror(-r));
552                 return r;
553         }
554
555         snprintf(as_uuid, sizeof(as_uuid),
556                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
557                  SD_ID128_FORMAT_VAL(rnd));
558         char_array_0(as_uuid);
559
560         r = write_string_file(from, as_uuid);
561         if (r < 0) {
562                 log_error("Failed to write boot id: %s", strerror(-r));
563                 return r;
564         }
565
566         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
567                 log_error("Failed to bind mount boot id: %m");
568                 r = -errno;
569         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
570                 log_warning("Failed to make boot id read-only: %m");
571
572         unlink(from);
573         return r;
574 }
575
576 static int copy_devnodes(const char *dest) {
577
578         static const char devnodes[] =
579                 "null\0"
580                 "zero\0"
581                 "full\0"
582                 "random\0"
583                 "urandom\0"
584                 "tty\0";
585
586         const char *d;
587         int r = 0;
588         _cleanup_umask_ mode_t u;
589
590         assert(dest);
591
592         u = umask(0000);
593
594         NULSTR_FOREACH(d, devnodes) {
595                 struct stat st;
596                 _cleanup_free_ char *from = NULL, *to = NULL;
597
598                 asprintf(&from, "/dev/%s", d);
599                 asprintf(&to, "%s/dev/%s", dest, d);
600
601                 if (!from || !to) {
602                         log_oom();
603
604                         if (r == 0)
605                                 r = -ENOMEM;
606
607                         break;
608                 }
609
610                 if (stat(from, &st) < 0) {
611
612                         if (errno != ENOENT) {
613                                 log_error("Failed to stat %s: %m", from);
614                                 if (r == 0)
615                                         r = -errno;
616                         }
617
618                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
619
620                         log_error("%s is not a char or block device, cannot copy", from);
621                         if (r == 0)
622                                 r = -EIO;
623
624                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
625
626                         log_error("mknod(%s) failed: %m", dest);
627                         if (r == 0)
628                                 r = -errno;
629                 }
630         }
631
632         return r;
633 }
634
635 static int setup_ptmx(const char *dest) {
636         _cleanup_free_ char *p = NULL;
637
638         p = strappend(dest, "/dev/ptmx");
639         if (!p)
640                 return log_oom();
641
642         if (symlink("pts/ptmx", p) < 0) {
643                 log_error("Failed to create /dev/ptmx symlink: %m");
644                 return -errno;
645         }
646
647         return 0;
648 }
649
650 static int setup_dev_console(const char *dest, const char *console) {
651         struct stat st;
652         _cleanup_free_ char *to = NULL;
653         int r;
654         _cleanup_umask_ mode_t u;
655
656         assert(dest);
657         assert(console);
658
659         u = umask(0000);
660
661         if (stat(console, &st) < 0) {
662                 log_error("Failed to stat %s: %m", console);
663                 return -errno;
664
665         } else if (!S_ISCHR(st.st_mode)) {
666                 log_error("/dev/console is not a char device");
667                 return -EIO;
668         }
669
670         r = chmod_and_chown(console, 0600, 0, 0);
671         if (r < 0) {
672                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
673                 return r;
674         }
675
676         if (asprintf(&to, "%s/dev/console", dest) < 0)
677                 return log_oom();
678
679         /* We need to bind mount the right tty to /dev/console since
680          * ptys can only exist on pts file systems. To have something
681          * to bind mount things on we create a device node first, that
682          * has the right major/minor (note that the major minor
683          * doesn't actually matter here, since we mount it over
684          * anyway). */
685
686         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
687                 log_error("mknod() for /dev/console failed: %m");
688                 return -errno;
689         }
690
691         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
692                 log_error("Bind mount for /dev/console failed: %m");
693                 return -errno;
694         }
695
696         return 0;
697 }
698
699 static int setup_kmsg(const char *dest, int kmsg_socket) {
700         _cleanup_free_ char *from = NULL, *to = NULL;
701         int r, fd, k;
702         _cleanup_umask_ mode_t u;
703         union {
704                 struct cmsghdr cmsghdr;
705                 uint8_t buf[CMSG_SPACE(sizeof(int))];
706         } control = {};
707         struct msghdr mh = {
708                 .msg_control = &control,
709                 .msg_controllen = sizeof(control),
710         };
711         struct cmsghdr *cmsg;
712
713         assert(dest);
714         assert(kmsg_socket >= 0);
715
716         u = umask(0000);
717
718         /* We create the kmsg FIFO as /dev/kmsg, but immediately
719          * delete it after bind mounting it to /proc/kmsg. While FIFOs
720          * on the reading side behave very similar to /proc/kmsg,
721          * their writing side behaves differently from /dev/kmsg in
722          * that writing blocks when nothing is reading. In order to
723          * avoid any problems with containers deadlocking due to this
724          * we simply make /dev/kmsg unavailable to the container. */
725         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
726             asprintf(&to, "%s/proc/kmsg", dest) < 0)
727                 return log_oom();
728
729         if (mkfifo(from, 0600) < 0) {
730                 log_error("mkfifo() for /dev/kmsg failed: %m");
731                 return -errno;
732         }
733
734         r = chmod_and_chown(from, 0600, 0, 0);
735         if (r < 0) {
736                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
737                 return r;
738         }
739
740         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
741                 log_error("Bind mount for /proc/kmsg failed: %m");
742                 return -errno;
743         }
744
745         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
746         if (fd < 0) {
747                 log_error("Failed to open fifo: %m");
748                 return -errno;
749         }
750
751         cmsg = CMSG_FIRSTHDR(&mh);
752         cmsg->cmsg_level = SOL_SOCKET;
753         cmsg->cmsg_type = SCM_RIGHTS;
754         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
755         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
756
757         mh.msg_controllen = cmsg->cmsg_len;
758
759         /* Store away the fd in the socket, so that it stays open as
760          * long as we run the child */
761         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
762         close_nointr_nofail(fd);
763
764         if (k < 0) {
765                 log_error("Failed to send FIFO fd: %m");
766                 return -errno;
767         }
768
769         /* And now make the FIFO unavailable as /dev/kmsg... */
770         unlink(from);
771         return 0;
772 }
773
774 static int setup_hostname(void) {
775
776         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
777                 return -errno;
778
779         return 0;
780 }
781
782 static int setup_journal(const char *directory) {
783         sd_id128_t machine_id;
784         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
785         char *id;
786         int r;
787
788         if (arg_link_journal == LINK_NO)
789                 return 0;
790
791         p = strappend(directory, "/etc/machine-id");
792         if (!p)
793                 return log_oom();
794
795         r = read_one_line_file(p, &b);
796         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
797                 return 0;
798         else if (r < 0) {
799                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
800                 return r;
801         }
802
803         id = strstrip(b);
804         if (isempty(id) && arg_link_journal == LINK_AUTO)
805                 return 0;
806
807         /* Verify validity */
808         r = sd_id128_from_string(id, &machine_id);
809         if (r < 0) {
810                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
811                 return r;
812         }
813
814         free(p);
815         p = strappend("/var/log/journal/", id);
816         q = strjoin(directory, "/var/log/journal/", id, NULL);
817         if (!p || !q)
818                 return log_oom();
819
820         if (path_is_mount_point(p, false) > 0) {
821                 if (arg_link_journal != LINK_AUTO) {
822                         log_error("%s: already a mount point, refusing to use for journal", p);
823                         return -EEXIST;
824                 }
825
826                 return 0;
827         }
828
829         if (path_is_mount_point(q, false) > 0) {
830                 if (arg_link_journal != LINK_AUTO) {
831                         log_error("%s: already a mount point, refusing to use for journal", q);
832                         return -EEXIST;
833                 }
834
835                 return 0;
836         }
837
838         r = readlink_and_make_absolute(p, &d);
839         if (r >= 0) {
840                 if ((arg_link_journal == LINK_GUEST ||
841                      arg_link_journal == LINK_AUTO) &&
842                     path_equal(d, q)) {
843
844                         r = mkdir_p(q, 0755);
845                         if (r < 0)
846                                 log_warning("failed to create directory %s: %m", q);
847                         return 0;
848                 }
849
850                 if (unlink(p) < 0) {
851                         log_error("Failed to remove symlink %s: %m", p);
852                         return -errno;
853                 }
854         } else if (r == -EINVAL) {
855
856                 if (arg_link_journal == LINK_GUEST &&
857                     rmdir(p) < 0) {
858
859                         if (errno == ENOTDIR) {
860                                 log_error("%s already exists and is neither a symlink nor a directory", p);
861                                 return r;
862                         } else {
863                                 log_error("Failed to remove %s: %m", p);
864                                 return -errno;
865                         }
866                 }
867         } else if (r != -ENOENT) {
868                 log_error("readlink(%s) failed: %m", p);
869                 return r;
870         }
871
872         if (arg_link_journal == LINK_GUEST) {
873
874                 if (symlink(q, p) < 0) {
875                         log_error("Failed to symlink %s to %s: %m", q, p);
876                         return -errno;
877                 }
878
879                 r = mkdir_p(q, 0755);
880                 if (r < 0)
881                         log_warning("failed to create directory %s: %m", q);
882                 return 0;
883         }
884
885         if (arg_link_journal == LINK_HOST) {
886                 r = mkdir_p(p, 0755);
887                 if (r < 0) {
888                         log_error("Failed to create %s: %m", p);
889                         return r;
890                 }
891
892         } else if (access(p, F_OK) < 0)
893                 return 0;
894
895         if (dir_is_empty(q) == 0) {
896                 log_error("%s not empty.", q);
897                 return -ENOTEMPTY;
898         }
899
900         r = mkdir_p(q, 0755);
901         if (r < 0) {
902                 log_error("Failed to create %s: %m", q);
903                 return r;
904         }
905
906         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
907                 log_error("Failed to bind mount journal from host into guest: %m");
908                 return -errno;
909         }
910
911         return 0;
912 }
913
914 static int setup_cgroup(const char *path) {
915         char **c;
916         int r;
917
918         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, path, 1);
919         if (r < 0) {
920                 log_error("Failed to create cgroup: %s", strerror(-r));
921                 return r;
922         }
923
924         STRV_FOREACH(c, arg_controllers) {
925                 r = cg_create_and_attach(*c, path, 1);
926                 if (r < 0)
927                         log_warning("Failed to create cgroup in controller %s: %s", *c, strerror(-r));
928         }
929
930         return 0;
931 }
932
/* Records container metadata as extended attributes on the cgroup directory
 * resolved from 'cgroup' via cg_get_path():
 *
 *   trusted.init_pid        decimal string of 'pid'
 *   trusted.machine_id      'uuid' (skipped when uuid is NULL)
 *   trusted.root_directory  'directory'
 *
 * Each attribute is set with XATTR_CREATE. A failure to set one attribute is
 * logged as a warning and does not prevent the others from being attempted.
 *
 * Returns 0 on success, the negative error from cg_get_path(), or the
 * negative errno of the first setxattr() call that failed. When built
 * without HAVE_XATTR this is a no-op returning 0. */
static int save_attributes(const char *cgroup, pid_t pid, const char *uuid, const char *directory) {
#ifdef HAVE_XATTR
        _cleanup_free_ char *path = NULL;
        char buf[DECIMAL_STR_MAX(pid_t)];
        int r, k;

        assert(cgroup);
        assert(pid >= 0);
        /* 'directory' is what gets passed to strlen() below, so check the
         * parameter itself rather than the global it is usually filled from. */
        assert(directory);

        assert_se(snprintf(buf, sizeof(buf), "%lu", (unsigned long) pid) < (int) sizeof(buf));

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, cgroup, NULL, &path);
        if (r < 0) {
                log_error("Failed to get path: %s", strerror(-r));
                return r;
        }

        /* setxattr() reports failure as -1 with the cause in errno; convert to
         * a negative errno code before logging so the value cannot be lost. */
        r = 0;
        if (setxattr(path, "trusted.init_pid", buf, strlen(buf), XATTR_CREATE) < 0) {
                r = -errno;
                log_warning("Failed to set %s attribute on %s: %m", "trusted.init_pid", path);
        }

        if (uuid) {
                if (setxattr(path, "trusted.machine_id", uuid, strlen(uuid), XATTR_CREATE) < 0) {
                        k = -errno;
                        log_warning("Failed to set %s attribute on %s: %m", "trusted.machine_id", path);
                        if (r == 0)
                                r = k;
                }
        }

        if (setxattr(path, "trusted.root_directory", directory, strlen(directory), XATTR_CREATE) < 0) {
                k = -errno;
                log_warning("Failed to set %s attribute on %s: %m", "trusted.root_directory", path);
                if (r == 0)
                        r = k;
        }

        return r;
#else
        return 0;
#endif
}
975
976 static int drop_capabilities(void) {
977         return capability_bounding_set_drop(~arg_retain, false);
978 }
979
980 static int process_pty(int master, pid_t pid, sigset_t *mask) {
981
982         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
983         size_t in_buffer_full = 0, out_buffer_full = 0;
984         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
985         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
986         int ep = -1, signal_fd = -1, r;
987         bool tried_orderly_shutdown = false;
988
989         assert(master >= 0);
990         assert(pid > 0);
991         assert(mask);
992
993         fd_nonblock(STDIN_FILENO, 1);
994         fd_nonblock(STDOUT_FILENO, 1);
995         fd_nonblock(master, 1);
996
997         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
998         if (signal_fd < 0) {
999                 log_error("signalfd(): %m");
1000                 r = -errno;
1001                 goto finish;
1002         }
1003
1004         ep = epoll_create1(EPOLL_CLOEXEC);
1005         if (ep < 0) {
1006                 log_error("Failed to create epoll: %m");
1007                 r = -errno;
1008                 goto finish;
1009         }
1010
1011         /* We read from STDIN only if this is actually a TTY,
1012          * otherwise we assume non-interactivity. */
1013         if (isatty(STDIN_FILENO)) {
1014                 zero(stdin_ev);
1015                 stdin_ev.events = EPOLLIN|EPOLLET;
1016                 stdin_ev.data.fd = STDIN_FILENO;
1017
1018                 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
1019                         log_error("Failed to register STDIN in epoll: %m");
1020                         r = -errno;
1021                         goto finish;
1022                 }
1023         }
1024
1025         zero(stdout_ev);
1026         stdout_ev.events = EPOLLOUT|EPOLLET;
1027         stdout_ev.data.fd = STDOUT_FILENO;
1028
1029         zero(master_ev);
1030         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
1031         master_ev.data.fd = master;
1032
1033         zero(signal_ev);
1034         signal_ev.events = EPOLLIN;
1035         signal_ev.data.fd = signal_fd;
1036
1037         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
1038                 if (errno != EPERM) {
1039                         log_error("Failed to register stdout in epoll: %m");
1040                         r = -errno;
1041                         goto finish;
1042                 }
1043                 /* stdout without epoll support. Likely redirected to regular file. */
1044                 stdout_writable = true;
1045         }
1046
1047         if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
1048             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
1049                 log_error("Failed to register fds in epoll: %m");
1050                 r = -errno;
1051                 goto finish;
1052         }
1053
1054         for (;;) {
1055                 struct epoll_event ev[16];
1056                 ssize_t k;
1057                 int i, nfds;
1058
1059                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1060                 if (nfds < 0) {
1061
1062                         if (errno == EINTR || errno == EAGAIN)
1063                                 continue;
1064
1065                         log_error("epoll_wait(): %m");
1066                         r = -errno;
1067                         goto finish;
1068                 }
1069
1070                 assert(nfds >= 1);
1071
1072                 for (i = 0; i < nfds; i++) {
1073                         if (ev[i].data.fd == STDIN_FILENO) {
1074
1075                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1076                                         stdin_readable = true;
1077
1078                         } else if (ev[i].data.fd == STDOUT_FILENO) {
1079
1080                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1081                                         stdout_writable = true;
1082
1083                         } else if (ev[i].data.fd == master) {
1084
1085                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1086                                         master_readable = true;
1087
1088                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1089                                         master_writable = true;
1090
1091                         } else if (ev[i].data.fd == signal_fd) {
1092                                 struct signalfd_siginfo sfsi;
1093                                 ssize_t n;
1094
1095                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
1096                                 if (n != sizeof(sfsi)) {
1097
1098                                         if (n >= 0) {
1099                                                 log_error("Failed to read from signalfd: invalid block size");
1100                                                 r = -EIO;
1101                                                 goto finish;
1102                                         }
1103
1104                                         if (errno != EINTR && errno != EAGAIN) {
1105                                                 log_error("Failed to read from signalfd: %m");
1106                                                 r = -errno;
1107                                                 goto finish;
1108                                         }
1109                                 } else {
1110
1111                                         if (sfsi.ssi_signo == SIGWINCH) {
1112                                                 struct winsize ws;
1113
1114                                                 /* The window size changed, let's forward that. */
1115                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1116                                                         ioctl(master, TIOCSWINSZ, &ws);
1117                                         } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1118
1119                                                 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1120
1121                                                 /* This only works for systemd... */
1122                                                 tried_orderly_shutdown = true;
1123                                                 kill(pid, SIGRTMIN+3);
1124
1125                                         } else {
1126                                                 r = 0;
1127                                                 goto finish;
1128                                         }
1129                                 }
1130                         }
1131                 }
1132
1133                 while ((stdin_readable && in_buffer_full <= 0) ||
1134                        (master_writable && in_buffer_full > 0) ||
1135                        (master_readable && out_buffer_full <= 0) ||
1136                        (stdout_writable && out_buffer_full > 0)) {
1137
1138                         if (stdin_readable && in_buffer_full < LINE_MAX) {
1139
1140                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1141                                 if (k < 0) {
1142
1143                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1144                                                 stdin_readable = false;
1145                                         else {
1146                                                 log_error("read(): %m");
1147                                                 r = -errno;
1148                                                 goto finish;
1149                                         }
1150                                 } else
1151                                         in_buffer_full += (size_t) k;
1152                         }
1153
1154                         if (master_writable && in_buffer_full > 0) {
1155
1156                                 k = write(master, in_buffer, in_buffer_full);
1157                                 if (k < 0) {
1158
1159                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1160                                                 master_writable = false;
1161                                         else {
1162                                                 log_error("write(): %m");
1163                                                 r = -errno;
1164                                                 goto finish;
1165                                         }
1166
1167                                 } else {
1168                                         assert(in_buffer_full >= (size_t) k);
1169                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1170                                         in_buffer_full -= k;
1171                                 }
1172                         }
1173
1174                         if (master_readable && out_buffer_full < LINE_MAX) {
1175
1176                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1177                                 if (k < 0) {
1178
1179                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1180                                                 master_readable = false;
1181                                         else {
1182                                                 log_error("read(): %m");
1183                                                 r = -errno;
1184                                                 goto finish;
1185                                         }
1186                                 }  else
1187                                         out_buffer_full += (size_t) k;
1188                         }
1189
1190                         if (stdout_writable && out_buffer_full > 0) {
1191
1192                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1193                                 if (k < 0) {
1194
1195                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1196                                                 stdout_writable = false;
1197                                         else {
1198                                                 log_error("write(): %m");
1199                                                 r = -errno;
1200                                                 goto finish;
1201                                         }
1202
1203                                 } else {
1204                                         assert(out_buffer_full >= (size_t) k);
1205                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1206                                         out_buffer_full -= k;
1207                                 }
1208                         }
1209                 }
1210         }
1211
1212 finish:
1213         if (ep >= 0)
1214                 close_nointr_nofail(ep);
1215
1216         if (signal_fd >= 0)
1217                 close_nointr_nofail(signal_fd);
1218
1219         return r;
1220 }
1221
1222 int main(int argc, char *argv[]) {
1223         pid_t pid = 0;
1224         int r = EXIT_FAILURE, k;
1225         _cleanup_free_ char *newcg = NULL;
1226         _cleanup_close_ int master = -1;
1227         int n_fd_passed;
1228         const char *console = NULL;
1229         struct termios saved_attr, raw_attr;
1230         sigset_t mask;
1231         bool saved_attr_valid = false;
1232         struct winsize ws;
1233         int kmsg_socket_pair[2] = { -1, -1 };
1234         FDSet *fds = NULL;
1235
1236         log_parse_environment();
1237         log_open();
1238
1239         k = parse_argv(argc, argv);
1240         if (k < 0)
1241                 goto finish;
1242         else if (k == 0) {
1243                 r = EXIT_SUCCESS;
1244                 goto finish;
1245         }
1246
1247         if (arg_directory) {
1248                 char *p;
1249
1250                 p = path_make_absolute_cwd(arg_directory);
1251                 free(arg_directory);
1252                 arg_directory = p;
1253         } else
1254                 arg_directory = get_current_dir_name();
1255
1256         if (!arg_directory) {
1257                 log_error("Failed to determine path, please use -D.");
1258                 goto finish;
1259         }
1260
1261         path_kill_slashes(arg_directory);
1262
1263         if (!arg_machine) {
1264                 arg_machine = strdup(path_get_file_name(arg_directory));
1265                 if (!arg_machine) {
1266                         log_oom();
1267                         goto finish;
1268                 }
1269
1270                 hostname_cleanup(arg_machine);
1271                 if (isempty(arg_machine)) {
1272                         log_error("Failed to determine machine name automatically, please use -M.");
1273                         goto finish;
1274                 }
1275         }
1276
1277         if (geteuid() != 0) {
1278                 log_error("Need to be root.");
1279                 goto finish;
1280         }
1281
1282         if (sd_booted() <= 0) {
1283                 log_error("Not running on a systemd system.");
1284                 goto finish;
1285         }
1286
1287         if (path_equal(arg_directory, "/")) {
1288                 log_error("Spawning container on root directory not supported.");
1289                 goto finish;
1290         }
1291
1292         if (path_is_os_tree(arg_directory) <= 0) {
1293                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1294                 goto finish;
1295         }
1296
1297         log_close();
1298         n_fd_passed = sd_listen_fds(false);
1299         if (n_fd_passed > 0) {
1300                 k = fdset_new_listen_fds(&fds, false);
1301                 if (k < 0) {
1302                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1303                         goto finish;
1304                 }
1305         }
1306         fdset_close_others(fds);
1307         log_open();
1308
1309         k = cg_get_machine_path(arg_machine, &newcg);
1310         if (k < 0) {
1311                 log_error("Failed to determine machine cgroup path: %s", strerror(-k));
1312                 goto finish;
1313         }
1314
1315         k = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1316         if (k <= 0 && k != -ENOENT) {
1317                 log_error("Container already running.");
1318
1319                 free(newcg);
1320                 newcg = NULL;
1321
1322                 goto finish;
1323         }
1324
1325         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1326         if (master < 0) {
1327                 log_error("Failed to acquire pseudo tty: %m");
1328                 goto finish;
1329         }
1330
1331         console = ptsname(master);
1332         if (!console) {
1333                 log_error("Failed to determine tty name: %m");
1334                 goto finish;
1335         }
1336
1337         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1338
1339         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1340                 ioctl(master, TIOCSWINSZ, &ws);
1341
1342         if (unlockpt(master) < 0) {
1343                 log_error("Failed to unlock tty: %m");
1344                 goto finish;
1345         }
1346
1347         if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1348                 saved_attr_valid = true;
1349
1350                 raw_attr = saved_attr;
1351                 cfmakeraw(&raw_attr);
1352                 raw_attr.c_lflag &= ~ECHO;
1353         }
1354
1355         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1356                 log_error("Failed to create kmsg socket pair.");
1357                 goto finish;
1358         }
1359
1360         sd_notify(0, "READY=1");
1361
1362         assert_se(sigemptyset(&mask) == 0);
1363         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1364         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1365
1366         for (;;) {
1367                 siginfo_t status;
1368                 int pipefd[2], pipefd2[2];
1369
1370                 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1371                         log_error("pipe2(): %m");
1372                         goto finish;
1373                 }
1374
1375                 if (pipe2(pipefd2, O_NONBLOCK|O_CLOEXEC) < 0) {
1376                         log_error("pipe2(): %m");
1377                         close_pipe(pipefd);
1378                         goto finish;
1379                 }
1380
1381                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1382                 if (pid < 0) {
1383                         if (errno == EINVAL)
1384                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1385                         else
1386                                 log_error("clone() failed: %m");
1387
1388                         goto finish;
1389                 }
1390
1391                 if (pid == 0) {
1392                         /* child */
1393                         const char *home = NULL;
1394                         uid_t uid = (uid_t) -1;
1395                         gid_t gid = (gid_t) -1;
1396                         unsigned n_env = 2;
1397                         const char *envp[] = {
1398                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1399                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1400                                 NULL, /* TERM */
1401                                 NULL, /* HOME */
1402                                 NULL, /* USER */
1403                                 NULL, /* LOGNAME */
1404                                 NULL, /* container_uuid */
1405                                 NULL, /* LISTEN_FDS */
1406                                 NULL, /* LISTEN_PID */
1407                                 NULL
1408                         };
1409
1410                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1411                         if (envp[n_env])
1412                                 n_env ++;
1413
1414                         /* Wait for the parent process to log our PID */
1415                         close_nointr_nofail(pipefd[1]);
1416                         fd_wait_for_event(pipefd[0], POLLHUP, -1);
1417                         close_nointr_nofail(pipefd[0]);
1418
1419                         close_nointr_nofail(master);
1420                         master = -1;
1421
1422                         if (saved_attr_valid) {
1423                                 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1424                                         log_error("Failed to set terminal attributes: %m");
1425                                         goto child_fail;
1426                                 }
1427                         }
1428
1429                         close_nointr(STDIN_FILENO);
1430                         close_nointr(STDOUT_FILENO);
1431                         close_nointr(STDERR_FILENO);
1432
1433                         close_nointr_nofail(kmsg_socket_pair[0]);
1434                         kmsg_socket_pair[0] = -1;
1435
1436                         reset_all_signal_handlers();
1437
1438                         assert_se(sigemptyset(&mask) == 0);
1439                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1440
1441                         k = open_terminal(console, O_RDWR);
1442                         if (k != STDIN_FILENO) {
1443                                 if (k >= 0) {
1444                                         close_nointr_nofail(k);
1445                                         k = -EINVAL;
1446                                 }
1447
1448                                 log_error("Failed to open console: %s", strerror(-k));
1449                                 goto child_fail;
1450                         }
1451
1452                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1453                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1454                                 log_error("Failed to duplicate console: %m");
1455                                 goto child_fail;
1456                         }
1457
1458                         if (setsid() < 0) {
1459                                 log_error("setsid() failed: %m");
1460                                 goto child_fail;
1461                         }
1462
1463                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1464                                 log_error("PR_SET_PDEATHSIG failed: %m");
1465                                 goto child_fail;
1466                         }
1467
1468                         if (setup_cgroup(newcg) < 0)
1469                                 goto child_fail;
1470
1471                         close_pipe(pipefd2);
1472
1473                         /* Mark everything as slave, so that we still
1474                          * receive mounts from the real root, but don't
1475                          * propagate mounts to the real root. */
1476                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1477                                 log_error("MS_SLAVE|MS_REC failed: %m");
1478                                 goto child_fail;
1479                         }
1480
1481                         /* Turn directory into bind mount */
1482                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1483                                 log_error("Failed to make bind mount.");
1484                                 goto child_fail;
1485                         }
1486
1487                         if (arg_read_only)
1488                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1489                                         log_error("Failed to make read-only.");
1490                                         goto child_fail;
1491                                 }
1492
1493                         if (mount_all(arg_directory) < 0)
1494                                 goto child_fail;
1495
1496                         if (copy_devnodes(arg_directory) < 0)
1497                                 goto child_fail;
1498
1499                         if (setup_ptmx(arg_directory) < 0)
1500                                 goto child_fail;
1501
1502                         dev_setup(arg_directory);
1503
1504                         if (setup_dev_console(arg_directory, console) < 0)
1505                                 goto child_fail;
1506
1507                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1508                                 goto child_fail;
1509
1510                         close_nointr_nofail(kmsg_socket_pair[1]);
1511                         kmsg_socket_pair[1] = -1;
1512
1513                         if (setup_boot_id(arg_directory) < 0)
1514                                 goto child_fail;
1515
1516                         if (setup_timezone(arg_directory) < 0)
1517                                 goto child_fail;
1518
1519                         if (setup_resolv_conf(arg_directory) < 0)
1520                                 goto child_fail;
1521
1522                         if (setup_journal(arg_directory) < 0)
1523                                 goto child_fail;
1524
1525                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1526                                 goto child_fail;
1527
1528                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1529                                 goto child_fail;
1530
1531                         if (chdir(arg_directory) < 0) {
1532                                 log_error("chdir(%s) failed: %m", arg_directory);
1533                                 goto child_fail;
1534                         }
1535
1536                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1537                                 log_error("mount(MS_MOVE) failed: %m");
1538                                 goto child_fail;
1539                         }
1540
1541                         if (chroot(".") < 0) {
1542                                 log_error("chroot() failed: %m");
1543                                 goto child_fail;
1544                         }
1545
1546                         if (chdir("/") < 0) {
1547                                 log_error("chdir() failed: %m");
1548                                 goto child_fail;
1549                         }
1550
1551                         umask(0022);
1552
1553                         loopback_setup();
1554
1555                         if (drop_capabilities() < 0) {
1556                                 log_error("drop_capabilities() failed: %m");
1557                                 goto child_fail;
1558                         }
1559
1560                         if (arg_user) {
1561
1562                                 /* Note that this resolves user names
1563                                  * inside the container, and hence
1564                                  * accesses the NSS modules from the
1565                                  * container and not the host. This is
1566                                  * a bit weird... */
1567
1568                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1569                                         log_error("get_user_creds() failed: %m");
1570                                         goto child_fail;
1571                                 }
1572
1573                                 if (mkdir_parents_label(home, 0775) < 0) {
1574                                         log_error("mkdir_parents_label() failed: %m");
1575                                         goto child_fail;
1576                                 }
1577
1578                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1579                                         log_error("mkdir_safe_label() failed: %m");
1580                                         goto child_fail;
1581                                 }
1582
1583                                 if (initgroups((const char*)arg_user, gid) < 0) {
1584                                         log_error("initgroups() failed: %m");
1585                                         goto child_fail;
1586                                 }
1587
1588                                 if (setresgid(gid, gid, gid) < 0) {
1589                                         log_error("setregid() failed: %m");
1590                                         goto child_fail;
1591                                 }
1592
1593                                 if (setresuid(uid, uid, uid) < 0) {
1594                                         log_error("setreuid() failed: %m");
1595                                         goto child_fail;
1596                                 }
1597                         } else {
1598                                 /* Reset everything fully to 0, just in case */
1599
1600                                 if (setgroups(0, NULL) < 0) {
1601                                         log_error("setgroups() failed: %m");
1602                                         goto child_fail;
1603                                 }
1604
1605                                 if (setresgid(0, 0, 0) < 0) {
1606                                         log_error("setregid() failed: %m");
1607                                         goto child_fail;
1608                                 }
1609
1610                                 if (setresuid(0, 0, 0) < 0) {
1611                                         log_error("setreuid() failed: %m");
1612                                         goto child_fail;
1613                                 }
1614                         }
1615
1616                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1617                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1618                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1619                                 log_oom();
1620                                 goto child_fail;
1621                         }
1622
1623                         if (arg_uuid) {
1624                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1625                                         log_oom();
1626                                         goto child_fail;
1627                                 }
1628                         }
1629
1630                         if (fdset_size(fds) > 0) {
1631                                 k = fdset_cloexec(fds, false);
1632                                 if (k < 0) {
1633                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1634                                         goto child_fail;
1635                                 }
1636
1637                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1638                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) 1) < 0)) {
1639                                         log_oom();
1640                                         goto child_fail;
1641                                 }
1642                         }
1643
1644                         setup_hostname();
1645
1646                         if (arg_boot) {
1647                                 char **a;
1648                                 size_t l;
1649
1650                                 /* Automatically search for the init system */
1651
1652                                 l = 1 + argc - optind;
1653                                 a = newa(char*, l + 1);
1654                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1655
1656                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1657                                 execve(a[0], a, (char**) envp);
1658
1659                                 a[0] = (char*) "/lib/systemd/systemd";
1660                                 execve(a[0], a, (char**) envp);
1661
1662                                 a[0] = (char*) "/sbin/init";
1663                                 execve(a[0], a, (char**) envp);
1664                         } else if (argc > optind)
1665                                 execvpe(argv[optind], argv + optind, (char**) envp);
1666                         else {
1667                                 chdir(home ? home : "/root");
1668                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1669                         }
1670
1671                         log_error("execv() failed: %m");
1672
1673                 child_fail:
1674                         _exit(EXIT_FAILURE);
1675                 }
1676
1677                 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1678                 close_nointr_nofail(pipefd[0]);
1679                 close_nointr_nofail(pipefd[1]);
1680
1681                 /* Wait for the child process to establish cgroup hierarchy */
1682                 close_nointr_nofail(pipefd2[1]);
1683                 fd_wait_for_event(pipefd2[0], POLLHUP, -1);
1684                 close_nointr_nofail(pipefd2[0]);
1685
1686                 save_attributes(newcg, pid, arg_uuid, arg_directory);
1687
1688                 fdset_free(fds);
1689                 fds = NULL;
1690
1691                 if (process_pty(master, pid, &mask) < 0)
1692                         goto finish;
1693
1694                 if (saved_attr_valid)
1695                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1696
1697                 k = wait_for_terminate(pid, &status);
1698                 if (k < 0) {
1699                         r = EXIT_FAILURE;
1700                         break;
1701                 }
1702
1703                 if (status.si_code == CLD_EXITED) {
1704                         r = status.si_status;
1705                         if (status.si_status != 0) {
1706                                 log_error("Container failed with error code %i.", status.si_status);
1707                                 break;
1708                         }
1709
1710                         log_debug("Container exited successfully.");
1711                         break;
1712                 } else if (status.si_code == CLD_KILLED &&
1713                            status.si_status == SIGINT) {
1714                         log_info("Container has been shut down.");
1715                         r = 0;
1716                         break;
1717                 } else if (status.si_code == CLD_KILLED &&
1718                            status.si_status == SIGHUP) {
1719                         log_info("Container is being rebooted.");
1720                         continue;
1721                 } else if (status.si_code == CLD_KILLED ||
1722                            status.si_code == CLD_DUMPED) {
1723
1724                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1725                         r = EXIT_FAILURE;
1726                         break;
1727                 } else {
1728                         log_error("Container failed due to unknown reason.");
1729                         r = EXIT_FAILURE;
1730                         break;
1731                 }
1732         }
1733
1734 finish:
1735         if (saved_attr_valid)
1736                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1737
1738         close_pipe(kmsg_socket_pair);
1739
1740         if (newcg)
1741                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1742
1743         free(arg_directory);
1744         free(arg_machine);
1745         strv_free(arg_controllers);
1746
1747         fdset_free(fds);
1748
1749         return r;
1750 }