chiark / gitweb /
a49cbc2238cf142de9adcd6b0929407794e2a3bb
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <attr/xattr.h>
26 #include <sys/types.h>
27 #include <sys/syscall.h>
28 #include <sys/mount.h>
29 #include <sys/wait.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <stdio.h>
33 #include <errno.h>
34 #include <sys/prctl.h>
35 #include <sys/capability.h>
36 #include <getopt.h>
37 #include <sys/poll.h>
38 #include <sys/epoll.h>
39 #include <termios.h>
40 #include <sys/signalfd.h>
41 #include <grp.h>
42 #include <linux/fs.h>
43 #include <sys/un.h>
44 #include <sys/socket.h>
45
46 #include <systemd/sd-daemon.h>
47
48 #include "log.h"
49 #include "util.h"
50 #include "mkdir.h"
51 #include "macro.h"
52 #include "audit.h"
53 #include "missing.h"
54 #include "cgroup-util.h"
55 #include "strv.h"
56 #include "path-util.h"
57 #include "loopback-setup.h"
58 #include "sd-id128.h"
59 #include "dev-setup.h"
60 #include "fdset.h"
61 #include "build.h"
62 #include "fileio.h"
63
64 #ifndef TTY_GID
65 #define TTY_GID 5
66 #endif
67
/* Selects whether and how the container's journal directory is linked
 * with the host's (see --link-journal= and -j). */
typedef enum LinkJournal {
        LINK_NO = 0,    /* --link-journal=no: leave the journal alone */
        LINK_AUTO,      /* --link-journal=auto: the initial setting */
        LINK_HOST,      /* --link-journal=host */
        LINK_GUEST      /* --link-journal=guest, also what -j selects */
} LinkJournal;
74
75 static char *arg_directory = NULL;
76 static char *arg_user = NULL;
77 static char **arg_controllers = NULL;
78 static char *arg_uuid = NULL;
79 static char *arg_machine = NULL;
80 static bool arg_private_network = false;
81 static bool arg_read_only = false;
82 static bool arg_boot = false;
83 static LinkJournal arg_link_journal = LINK_AUTO;
84 static uint64_t arg_retain =
85         (1ULL << CAP_CHOWN) |
86         (1ULL << CAP_DAC_OVERRIDE) |
87         (1ULL << CAP_DAC_READ_SEARCH) |
88         (1ULL << CAP_FOWNER) |
89         (1ULL << CAP_FSETID) |
90         (1ULL << CAP_IPC_OWNER) |
91         (1ULL << CAP_KILL) |
92         (1ULL << CAP_LEASE) |
93         (1ULL << CAP_LINUX_IMMUTABLE) |
94         (1ULL << CAP_NET_BIND_SERVICE) |
95         (1ULL << CAP_NET_BROADCAST) |
96         (1ULL << CAP_NET_RAW) |
97         (1ULL << CAP_SETGID) |
98         (1ULL << CAP_SETFCAP) |
99         (1ULL << CAP_SETPCAP) |
100         (1ULL << CAP_SETUID) |
101         (1ULL << CAP_SYS_ADMIN) |
102         (1ULL << CAP_SYS_CHROOT) |
103         (1ULL << CAP_SYS_NICE) |
104         (1ULL << CAP_SYS_PTRACE) |
105         (1ULL << CAP_SYS_TTY_CONFIG) |
106         (1ULL << CAP_SYS_RESOURCE) |
107         (1ULL << CAP_SYS_BOOT) |
108         (1ULL << CAP_AUDIT_WRITE) |
109         (1ULL << CAP_AUDIT_CONTROL);
110 static char **arg_bind = NULL;
111 static char **arg_bind_ro = NULL;
112
/* Prints the usage summary to stdout. Always returns 0. */
static int help(void) {

        /* Note: -j is described as "guest" because that is the mode
         * parse_argv() actually selects for it (LINK_GUEST). */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                Show this help\n"
               "     --version             Print version string\n"
               "  -D --directory=NAME      Root directory for the container\n"
               "  -b --boot                Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER           Run the command under specified user or uid\n"
               "  -C --controllers=LIST    Put the container in specified comma-separated\n"
               "                           cgroup hierarchies\n"
               "     --uuid=UUID           Set a specific machine UUID for the container\n"
               "  -M --machine=NAME        Set the machine name for the container\n"
               "     --private-network     Disable network in container\n"
               "     --read-only           Mount the root directory read-only\n"
               "     --capability=CAP      In addition to the default, retain specified\n"
               "                           capability\n"
               "     --link-journal=MODE   Link up guest journal, one of no, auto, guest, host\n"
               "  -j                       Equivalent to --link-journal=guest\n"
               "     --bind=PATH[:PATH]    Bind mount a file or directory from the host into\n"
               "                           the container\n"
               "     --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
               program_invocation_short_name);

        return 0;
}
139
140 static int parse_argv(int argc, char *argv[]) {
141
142         enum {
143                 ARG_VERSION = 0x100,
144                 ARG_PRIVATE_NETWORK,
145                 ARG_UUID,
146                 ARG_READ_ONLY,
147                 ARG_CAPABILITY,
148                 ARG_LINK_JOURNAL,
149                 ARG_BIND,
150                 ARG_BIND_RO
151         };
152
153         static const struct option options[] = {
154                 { "help",            no_argument,       NULL, 'h'                 },
155                 { "version",         no_argument,       NULL, ARG_VERSION         },
156                 { "directory",       required_argument, NULL, 'D'                 },
157                 { "user",            required_argument, NULL, 'u'                 },
158                 { "controllers",     required_argument, NULL, 'C'                 },
159                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
160                 { "boot",            no_argument,       NULL, 'b'                 },
161                 { "uuid",            required_argument, NULL, ARG_UUID            },
162                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
163                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
164                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
165                 { "bind",            required_argument, NULL, ARG_BIND            },
166                 { "bind-ro",         required_argument, NULL, ARG_BIND_RO         },
167                 { "machine",         required_argument, NULL, 'M'                 },
168                 { NULL,              0,                 NULL, 0                   }
169         };
170
171         int c;
172
173         assert(argc >= 0);
174         assert(argv);
175
176         while ((c = getopt_long(argc, argv, "+hD:u:C:bM:j", options, NULL)) >= 0) {
177
178                 switch (c) {
179
180                 case 'h':
181                         help();
182                         return 0;
183
184                 case ARG_VERSION:
185                         puts(PACKAGE_STRING);
186                         puts(SYSTEMD_FEATURES);
187                         return 0;
188
189                 case 'D':
190                         free(arg_directory);
191                         arg_directory = canonicalize_file_name(optarg);
192                         if (!arg_directory) {
193                                 log_error("Failed to canonicalize root directory.");
194                                 return -ENOMEM;
195                         }
196
197                         break;
198
199                 case 'u':
200                         free(arg_user);
201                         arg_user = strdup(optarg);
202                         if (!arg_user)
203                                 return log_oom();
204
205                         break;
206
207                 case 'C':
208                         strv_free(arg_controllers);
209                         arg_controllers = strv_split(optarg, ",");
210                         if (!arg_controllers)
211                                 return log_oom();
212
213                         cg_shorten_controllers(arg_controllers);
214                         break;
215
216                 case ARG_PRIVATE_NETWORK:
217                         arg_private_network = true;
218                         break;
219
220                 case 'b':
221                         arg_boot = true;
222                         break;
223
224                 case ARG_UUID:
225                         arg_uuid = optarg;
226                         break;
227
228                 case 'M':
229                         if (!hostname_is_valid(optarg)) {
230                                 log_error("Invalid machine name: %s", optarg);
231                                 return -EINVAL;
232                         }
233
234                         free(arg_machine);
235                         arg_machine = strdup(optarg);
236                         if (!arg_machine)
237                                 return log_oom();
238
239                         break;
240
241                 case ARG_READ_ONLY:
242                         arg_read_only = true;
243                         break;
244
245                 case ARG_CAPABILITY: {
246                         char *state, *word;
247                         size_t length;
248
249                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
250                                 cap_value_t cap;
251                                 char *t;
252
253                                 t = strndup(word, length);
254                                 if (!t)
255                                         return log_oom();
256
257                                 if (cap_from_name(t, &cap) < 0) {
258                                         log_error("Failed to parse capability %s.", t);
259                                         free(t);
260                                         return -EINVAL;
261                                 }
262
263                                 free(t);
264                                 arg_retain |= 1ULL << (uint64_t) cap;
265                         }
266
267                         break;
268                 }
269
270                 case 'j':
271                         arg_link_journal = LINK_GUEST;
272                         break;
273
274                 case ARG_LINK_JOURNAL:
275                         if (streq(optarg, "auto"))
276                                 arg_link_journal = LINK_AUTO;
277                         else if (streq(optarg, "no"))
278                                 arg_link_journal = LINK_NO;
279                         else if (streq(optarg, "guest"))
280                                 arg_link_journal = LINK_GUEST;
281                         else if (streq(optarg, "host"))
282                                 arg_link_journal = LINK_HOST;
283                         else {
284                                 log_error("Failed to parse link journal mode %s", optarg);
285                                 return -EINVAL;
286                         }
287
288                         break;
289
290                 case ARG_BIND:
291                 case ARG_BIND_RO: {
292                         _cleanup_free_ char *a = NULL, *b = NULL;
293                         char *e;
294                         char ***x;
295                         int r;
296
297                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
298
299                         e = strchr(optarg, ':');
300                         if (e) {
301                                 a = strndup(optarg, e - optarg);
302                                 b = strdup(e + 1);
303                         } else {
304                                 a = strdup(optarg);
305                                 b = strdup(optarg);
306                         }
307
308                         if (!a || !b)
309                                 return log_oom();
310
311                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
312                                 log_error("Invalid bind mount specification: %s", optarg);
313                                 return -EINVAL;
314                         }
315
316                         r = strv_extend(x, a);
317                         if (r < 0)
318                                 return r;
319
320                         r = strv_extend(x, b);
321                         if (r < 0)
322                                 return r;
323
324                         break;
325                 }
326
327                 case '?':
328                         return -EINVAL;
329
330                 default:
331                         log_error("Unknown option code %c", c);
332                         return -EINVAL;
333                 }
334         }
335
336         return 1;
337 }
338
339 static int mount_all(const char *dest) {
340
341         typedef struct MountPoint {
342                 const char *what;
343                 const char *where;
344                 const char *type;
345                 const char *options;
346                 unsigned long flags;
347                 bool fatal;
348         } MountPoint;
349
350         static const MountPoint mount_table[] = {
351                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
352                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
353                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
354                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
355                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
356                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
357                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
358                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
359 #ifdef HAVE_SELINUX
360                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
361                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
362 #endif
363         };
364
365         unsigned k;
366         int r = 0;
367
368         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
369                 _cleanup_free_ char *where = NULL;
370                 int t;
371
372                 where = strjoin(dest, "/", mount_table[k].where, NULL);
373                 if (!where)
374                         return log_oom();
375
376                 t = path_is_mount_point(where, true);
377                 if (t < 0) {
378                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
379
380                         if (r == 0)
381                                 r = t;
382
383                         continue;
384                 }
385
386                 /* Skip this entry if it is not a remount. */
387                 if (mount_table[k].what && t > 0)
388                         continue;
389
390                 mkdir_p(where, 0755);
391
392                 if (mount(mount_table[k].what,
393                           where,
394                           mount_table[k].type,
395                           mount_table[k].flags,
396                           mount_table[k].options) < 0 &&
397                     mount_table[k].fatal) {
398
399                         log_error("mount(%s) failed: %m", where);
400
401                         if (r == 0)
402                                 r = -errno;
403                 }
404         }
405
406         return r;
407 }
408
409 static int mount_binds(const char *dest, char **l, unsigned long flags) {
410         char **x, **y;
411
412         STRV_FOREACH_PAIR(x, y, l) {
413                 _cleanup_free_ char *where = NULL;
414
415                 where = strjoin(dest, "/", *y, NULL);
416                 if (!where)
417                         return log_oom();
418
419                 mkdir_p_label(where, 0755);
420
421                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
422                         log_error("mount(%s) failed: %m", where);
423                         return -errno;
424                 }
425
426                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
427                         log_error("mount(%s) failed: %m", where);
428                         return -errno;
429                 }
430         }
431
432         return 0;
433 }
434
435 static int setup_timezone(const char *dest) {
436         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
437         char *z, *y;
438         int r;
439
440         assert(dest);
441
442         /* Fix the timezone, if possible */
443         r = readlink_malloc("/etc/localtime", &p);
444         if (r < 0) {
445                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
446                 return 0;
447         }
448
449         z = path_startswith(p, "../usr/share/zoneinfo/");
450         if (!z)
451                 z = path_startswith(p, "/usr/share/zoneinfo/");
452         if (!z) {
453                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
454                 return 0;
455         }
456
457         where = strappend(dest, "/etc/localtime");
458         if (!where)
459                 return log_oom();
460
461         r = readlink_malloc(where, &q);
462         if (r >= 0) {
463                 y = path_startswith(q, "../usr/share/zoneinfo/");
464                 if (!y)
465                         y = path_startswith(q, "/usr/share/zoneinfo/");
466
467
468                 /* Already pointing to the right place? Then do nothing .. */
469                 if (y && streq(y, z))
470                         return 0;
471         }
472
473         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
474         if (!check)
475                 return log_oom();
476
477         if (access(check, F_OK) < 0) {
478                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
479                 return 0;
480         }
481
482         what = strappend("../usr/share/zoneinfo/", z);
483         if (!what)
484                 return log_oom();
485
486         unlink(where);
487         if (symlink(what, where) < 0) {
488                 log_error("Failed to correct timezone of container: %m");
489                 return 0;
490         }
491
492         return 0;
493 }
494
495 static int setup_resolv_conf(const char *dest) {
496         char _cleanup_free_ *where = NULL;
497         _cleanup_close_ int fd = -1;
498
499         assert(dest);
500
501         if (arg_private_network)
502                 return 0;
503
504         /* Fix resolv.conf, if possible */
505         where = strappend(dest, "/etc/resolv.conf");
506         if (!where)
507                 return log_oom();
508
509         fd = open(where, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0644);
510
511         /* We don't really care for the results of this really. If it
512          * fails, it fails, but meh... */
513         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) < 0)
514                 log_warning("Failed to bind mount /etc/resolv.conf: %m");
515         else
516                 if (mount("/etc/resolv.conf", where, "bind",
517                           MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
518                         log_error("Failed to remount /etc/resolv.conf readonly: %m");
519                         return -errno;
520                 }
521
522         return 0;
523 }
524
525 static int setup_boot_id(const char *dest) {
526         _cleanup_free_ char *from = NULL, *to = NULL;
527         sd_id128_t rnd;
528         char as_uuid[37];
529         int r;
530
531         assert(dest);
532
533         /* Generate a new randomized boot ID, so that each boot-up of
534          * the container gets a new one */
535
536         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
537         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
538         if (!from || !to)
539                 return log_oom();
540
541         r = sd_id128_randomize(&rnd);
542         if (r < 0) {
543                 log_error("Failed to generate random boot id: %s", strerror(-r));
544                 return r;
545         }
546
547         snprintf(as_uuid, sizeof(as_uuid),
548                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
549                  SD_ID128_FORMAT_VAL(rnd));
550         char_array_0(as_uuid);
551
552         r = write_string_file(from, as_uuid);
553         if (r < 0) {
554                 log_error("Failed to write boot id: %s", strerror(-r));
555                 return r;
556         }
557
558         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
559                 log_error("Failed to bind mount boot id: %m");
560                 r = -errno;
561         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
562                 log_warning("Failed to make boot id read-only: %m");
563
564         unlink(from);
565         return r;
566 }
567
568 static int copy_devnodes(const char *dest) {
569
570         static const char devnodes[] =
571                 "null\0"
572                 "zero\0"
573                 "full\0"
574                 "random\0"
575                 "urandom\0"
576                 "tty\0";
577
578         const char *d;
579         int r = 0;
580         _cleanup_umask_ mode_t u;
581
582         assert(dest);
583
584         u = umask(0000);
585
586         NULSTR_FOREACH(d, devnodes) {
587                 struct stat st;
588                 _cleanup_free_ char *from = NULL, *to = NULL;
589
590                 asprintf(&from, "/dev/%s", d);
591                 asprintf(&to, "%s/dev/%s", dest, d);
592
593                 if (!from || !to) {
594                         log_oom();
595
596                         if (r == 0)
597                                 r = -ENOMEM;
598
599                         break;
600                 }
601
602                 if (stat(from, &st) < 0) {
603
604                         if (errno != ENOENT) {
605                                 log_error("Failed to stat %s: %m", from);
606                                 if (r == 0)
607                                         r = -errno;
608                         }
609
610                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
611
612                         log_error("%s is not a char or block device, cannot copy", from);
613                         if (r == 0)
614                                 r = -EIO;
615
616                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
617
618                         log_error("mknod(%s) failed: %m", dest);
619                         if (r == 0)
620                                 r = -errno;
621                 }
622         }
623
624         return r;
625 }
626
627 static int setup_ptmx(const char *dest) {
628         _cleanup_free_ char *p = NULL;
629
630         p = strappend(dest, "/dev/ptmx");
631         if (!p)
632                 return log_oom();
633
634         if (symlink("pts/ptmx", p) < 0) {
635                 log_error("Failed to create /dev/ptmx symlink: %m");
636                 return -errno;
637         }
638
639         return 0;
640 }
641
642 static int setup_dev_console(const char *dest, const char *console) {
643         struct stat st;
644         _cleanup_free_ char *to = NULL;
645         int r;
646         _cleanup_umask_ mode_t u;
647
648         assert(dest);
649         assert(console);
650
651         u = umask(0000);
652
653         if (stat(console, &st) < 0) {
654                 log_error("Failed to stat %s: %m", console);
655                 return -errno;
656
657         } else if (!S_ISCHR(st.st_mode)) {
658                 log_error("/dev/console is not a char device");
659                 return -EIO;
660         }
661
662         r = chmod_and_chown(console, 0600, 0, 0);
663         if (r < 0) {
664                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
665                 return r;
666         }
667
668         if (asprintf(&to, "%s/dev/console", dest) < 0)
669                 return log_oom();
670
671         /* We need to bind mount the right tty to /dev/console since
672          * ptys can only exist on pts file systems. To have something
673          * to bind mount things on we create a device node first, that
674          * has the right major/minor (note that the major minor
675          * doesn't actually matter here, since we mount it over
676          * anyway). */
677
678         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
679                 log_error("mknod() for /dev/console failed: %m");
680                 return -errno;
681         }
682
683         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
684                 log_error("Bind mount for /dev/console failed: %m");
685                 return -errno;
686         }
687
688         return 0;
689 }
690
691 static int setup_kmsg(const char *dest, int kmsg_socket) {
692         _cleanup_free_ char *from = NULL, *to = NULL;
693         int r, fd, k;
694         _cleanup_umask_ mode_t u;
695         union {
696                 struct cmsghdr cmsghdr;
697                 uint8_t buf[CMSG_SPACE(sizeof(int))];
698         } control = {};
699         struct msghdr mh = {
700                 .msg_control = &control,
701                 .msg_controllen = sizeof(control),
702         };
703         struct cmsghdr *cmsg;
704
705         assert(dest);
706         assert(kmsg_socket >= 0);
707
708         u = umask(0000);
709
710         /* We create the kmsg FIFO as /dev/kmsg, but immediately
711          * delete it after bind mounting it to /proc/kmsg. While FIFOs
712          * on the reading side behave very similar to /proc/kmsg,
713          * their writing side behaves differently from /dev/kmsg in
714          * that writing blocks when nothing is reading. In order to
715          * avoid any problems with containers deadlocking due to this
716          * we simply make /dev/kmsg unavailable to the container. */
717         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
718             asprintf(&to, "%s/proc/kmsg", dest) < 0)
719                 return log_oom();
720
721         if (mkfifo(from, 0600) < 0) {
722                 log_error("mkfifo() for /dev/kmsg failed: %m");
723                 return -errno;
724         }
725
726         r = chmod_and_chown(from, 0600, 0, 0);
727         if (r < 0) {
728                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
729                 return r;
730         }
731
732         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
733                 log_error("Bind mount for /proc/kmsg failed: %m");
734                 return -errno;
735         }
736
737         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
738         if (fd < 0) {
739                 log_error("Failed to open fifo: %m");
740                 return -errno;
741         }
742
743         cmsg = CMSG_FIRSTHDR(&mh);
744         cmsg->cmsg_level = SOL_SOCKET;
745         cmsg->cmsg_type = SCM_RIGHTS;
746         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
747         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
748
749         mh.msg_controllen = cmsg->cmsg_len;
750
751         /* Store away the fd in the socket, so that it stays open as
752          * long as we run the child */
753         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
754         close_nointr_nofail(fd);
755
756         if (k < 0) {
757                 log_error("Failed to send FIFO fd: %m");
758                 return -errno;
759         }
760
761         /* And now make the FIFO unavailable as /dev/kmsg... */
762         unlink(from);
763         return 0;
764 }
765
766 static int setup_hostname(void) {
767
768         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
769                 return -errno;
770
771         return 0;
772 }
773
774 static int setup_journal(const char *directory) {
775         sd_id128_t machine_id;
776         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
777         char *id;
778         int r;
779
780         if (arg_link_journal == LINK_NO)
781                 return 0;
782
783         p = strappend(directory, "/etc/machine-id");
784         if (!p)
785                 return log_oom();
786
787         r = read_one_line_file(p, &b);
788         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
789                 return 0;
790         else if (r < 0) {
791                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
792                 return r;
793         }
794
795         id = strstrip(b);
796         if (isempty(id) && arg_link_journal == LINK_AUTO)
797                 return 0;
798
799         /* Verify validity */
800         r = sd_id128_from_string(id, &machine_id);
801         if (r < 0) {
802                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
803                 return r;
804         }
805
806         free(p);
807         p = strappend("/var/log/journal/", id);
808         q = strjoin(directory, "/var/log/journal/", id, NULL);
809         if (!p || !q)
810                 return log_oom();
811
812         if (path_is_mount_point(p, false) > 0) {
813                 if (arg_link_journal != LINK_AUTO) {
814                         log_error("%s: already a mount point, refusing to use for journal", p);
815                         return -EEXIST;
816                 }
817
818                 return 0;
819         }
820
821         if (path_is_mount_point(q, false) > 0) {
822                 if (arg_link_journal != LINK_AUTO) {
823                         log_error("%s: already a mount point, refusing to use for journal", q);
824                         return -EEXIST;
825                 }
826
827                 return 0;
828         }
829
830         r = readlink_and_make_absolute(p, &d);
831         if (r >= 0) {
832                 if ((arg_link_journal == LINK_GUEST ||
833                      arg_link_journal == LINK_AUTO) &&
834                     path_equal(d, q)) {
835
836                         r = mkdir_p(q, 0755);
837                         if (r < 0)
838                                 log_warning("failed to create directory %s: %m", q);
839                         return 0;
840                 }
841
842                 if (unlink(p) < 0) {
843                         log_error("Failed to remove symlink %s: %m", p);
844                         return -errno;
845                 }
846         } else if (r == -EINVAL) {
847
848                 if (arg_link_journal == LINK_GUEST &&
849                     rmdir(p) < 0) {
850
851                         if (errno == ENOTDIR) {
852                                 log_error("%s already exists and is neither a symlink nor a directory", p);
853                                 return r;
854                         } else {
855                                 log_error("Failed to remove %s: %m", p);
856                                 return -errno;
857                         }
858                 }
859         } else if (r != -ENOENT) {
860                 log_error("readlink(%s) failed: %m", p);
861                 return r;
862         }
863
864         if (arg_link_journal == LINK_GUEST) {
865
866                 if (symlink(q, p) < 0) {
867                         log_error("Failed to symlink %s to %s: %m", q, p);
868                         return -errno;
869                 }
870
871                 r = mkdir_p(q, 0755);
872                 if (r < 0)
873                         log_warning("failed to create directory %s: %m", q);
874                 return 0;
875         }
876
877         if (arg_link_journal == LINK_HOST) {
878                 r = mkdir_p(p, 0755);
879                 if (r < 0) {
880                         log_error("Failed to create %s: %m", p);
881                         return r;
882                 }
883
884         } else if (access(p, F_OK) < 0)
885                 return 0;
886
887         if (dir_is_empty(q) == 0) {
888                 log_error("%s not empty.", q);
889                 return -ENOTEMPTY;
890         }
891
892         r = mkdir_p(q, 0755);
893         if (r < 0) {
894                 log_error("Failed to create %s: %m", q);
895                 return r;
896         }
897
898         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
899                 log_error("Failed to bind mount journal from host into guest: %m");
900                 return -errno;
901         }
902
903         return 0;
904 }
905
906 static int setup_cgroup(const char *path) {
907         char **c;
908         int r;
909
910         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, path, 1);
911         if (r < 0) {
912                 log_error("Failed to create cgroup: %s", strerror(-r));
913                 return r;
914         }
915
916         STRV_FOREACH(c, arg_controllers) {
917                 r = cg_create_and_attach(*c, path, 1);
918                 if (r < 0)
919                         log_warning("Failed to create cgroup in controller %s: %s", *c, strerror(-r));
920         }
921
922         return 0;
923 }
924
925 static int save_attributes(const char *cgroup, pid_t pid, const char *uuid, const char *directory) {
926         _cleanup_free_ char *path = NULL;
927         char buf[DECIMAL_STR_MAX(pid_t)];
928         int r = 0, k;
929
930         assert(cgroup);
931         assert(pid >= 0);
932         assert(arg_directory);
933
934 #ifdef HAVE_XATTR
935         assert_se(snprintf(buf, sizeof(buf), "%lu", (unsigned long) pid) < (int) sizeof(buf));
936
937         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, cgroup, NULL, &path);
938         if (r < 0) {
939                 log_error("Failed to get path: %s", strerror(-r));
940                 return r;
941         }
942
943         r = setxattr(path, "trusted.init_pid", buf, strlen(buf), XATTR_CREATE);
944         if (r < 0)
945                 log_warning("Failed to set %s attribute on %s: %m", "trusted.init_pid", path);
946
947         if (uuid) {
948                 k = setxattr(path, "trusted.machine_id", uuid, strlen(uuid), XATTR_CREATE);
949                 if (k < 0) {
950                         log_warning("Failed to set %s attribute on %s: %m", "trusted.machine_id", path);
951                         if (r == 0)
952                                 r = k;
953                 }
954         }
955
956         k = setxattr(path, "trusted.root_directory", directory, strlen(directory), XATTR_CREATE);
957         if (k < 0) {
958                 log_warning("Failed to set %s attribute on %s: %m", "trusted.root_directory", path);
959                 if (r == 0)
960                         r = k;
961         }
962 #endif
963         return r;
964 }
965
966 static int drop_capabilities(void) {
967         return capability_bounding_set_drop(~arg_retain, false);
968 }
969
970 static int process_pty(int master, pid_t pid, sigset_t *mask) {
971
972         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
973         size_t in_buffer_full = 0, out_buffer_full = 0;
974         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
975         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
976         int ep = -1, signal_fd = -1, r;
977         bool tried_orderly_shutdown = false;
978
979         assert(master >= 0);
980         assert(pid > 0);
981         assert(mask);
982
983         fd_nonblock(STDIN_FILENO, 1);
984         fd_nonblock(STDOUT_FILENO, 1);
985         fd_nonblock(master, 1);
986
987         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
988         if (signal_fd < 0) {
989                 log_error("signalfd(): %m");
990                 r = -errno;
991                 goto finish;
992         }
993
994         ep = epoll_create1(EPOLL_CLOEXEC);
995         if (ep < 0) {
996                 log_error("Failed to create epoll: %m");
997                 r = -errno;
998                 goto finish;
999         }
1000
1001         /* We read from STDIN only if this is actually a TTY,
1002          * otherwise we assume non-interactivity. */
1003         if (isatty(STDIN_FILENO)) {
1004                 zero(stdin_ev);
1005                 stdin_ev.events = EPOLLIN|EPOLLET;
1006                 stdin_ev.data.fd = STDIN_FILENO;
1007
1008                 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
1009                         log_error("Failed to register STDIN in epoll: %m");
1010                         r = -errno;
1011                         goto finish;
1012                 }
1013         }
1014
1015         zero(stdout_ev);
1016         stdout_ev.events = EPOLLOUT|EPOLLET;
1017         stdout_ev.data.fd = STDOUT_FILENO;
1018
1019         zero(master_ev);
1020         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
1021         master_ev.data.fd = master;
1022
1023         zero(signal_ev);
1024         signal_ev.events = EPOLLIN;
1025         signal_ev.data.fd = signal_fd;
1026
1027         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
1028                 if (errno != EPERM) {
1029                         log_error("Failed to register stdout in epoll: %m");
1030                         r = -errno;
1031                         goto finish;
1032                 }
1033                 /* stdout without epoll support. Likely redirected to regular file. */
1034                 stdout_writable = true;
1035         }
1036
1037         if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
1038             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
1039                 log_error("Failed to register fds in epoll: %m");
1040                 r = -errno;
1041                 goto finish;
1042         }
1043
1044         for (;;) {
1045                 struct epoll_event ev[16];
1046                 ssize_t k;
1047                 int i, nfds;
1048
1049                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1050                 if (nfds < 0) {
1051
1052                         if (errno == EINTR || errno == EAGAIN)
1053                                 continue;
1054
1055                         log_error("epoll_wait(): %m");
1056                         r = -errno;
1057                         goto finish;
1058                 }
1059
1060                 assert(nfds >= 1);
1061
1062                 for (i = 0; i < nfds; i++) {
1063                         if (ev[i].data.fd == STDIN_FILENO) {
1064
1065                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1066                                         stdin_readable = true;
1067
1068                         } else if (ev[i].data.fd == STDOUT_FILENO) {
1069
1070                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1071                                         stdout_writable = true;
1072
1073                         } else if (ev[i].data.fd == master) {
1074
1075                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1076                                         master_readable = true;
1077
1078                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1079                                         master_writable = true;
1080
1081                         } else if (ev[i].data.fd == signal_fd) {
1082                                 struct signalfd_siginfo sfsi;
1083                                 ssize_t n;
1084
1085                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
1086                                 if (n != sizeof(sfsi)) {
1087
1088                                         if (n >= 0) {
1089                                                 log_error("Failed to read from signalfd: invalid block size");
1090                                                 r = -EIO;
1091                                                 goto finish;
1092                                         }
1093
1094                                         if (errno != EINTR && errno != EAGAIN) {
1095                                                 log_error("Failed to read from signalfd: %m");
1096                                                 r = -errno;
1097                                                 goto finish;
1098                                         }
1099                                 } else {
1100
1101                                         if (sfsi.ssi_signo == SIGWINCH) {
1102                                                 struct winsize ws;
1103
1104                                                 /* The window size changed, let's forward that. */
1105                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1106                                                         ioctl(master, TIOCSWINSZ, &ws);
1107                                         } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1108
1109                                                 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1110
1111                                                 /* This only works for systemd... */
1112                                                 tried_orderly_shutdown = true;
1113                                                 kill(pid, SIGRTMIN+3);
1114
1115                                         } else {
1116                                                 r = 0;
1117                                                 goto finish;
1118                                         }
1119                                 }
1120                         }
1121                 }
1122
1123                 while ((stdin_readable && in_buffer_full <= 0) ||
1124                        (master_writable && in_buffer_full > 0) ||
1125                        (master_readable && out_buffer_full <= 0) ||
1126                        (stdout_writable && out_buffer_full > 0)) {
1127
1128                         if (stdin_readable && in_buffer_full < LINE_MAX) {
1129
1130                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1131                                 if (k < 0) {
1132
1133                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1134                                                 stdin_readable = false;
1135                                         else {
1136                                                 log_error("read(): %m");
1137                                                 r = -errno;
1138                                                 goto finish;
1139                                         }
1140                                 } else
1141                                         in_buffer_full += (size_t) k;
1142                         }
1143
1144                         if (master_writable && in_buffer_full > 0) {
1145
1146                                 k = write(master, in_buffer, in_buffer_full);
1147                                 if (k < 0) {
1148
1149                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1150                                                 master_writable = false;
1151                                         else {
1152                                                 log_error("write(): %m");
1153                                                 r = -errno;
1154                                                 goto finish;
1155                                         }
1156
1157                                 } else {
1158                                         assert(in_buffer_full >= (size_t) k);
1159                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1160                                         in_buffer_full -= k;
1161                                 }
1162                         }
1163
1164                         if (master_readable && out_buffer_full < LINE_MAX) {
1165
1166                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1167                                 if (k < 0) {
1168
1169                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1170                                                 master_readable = false;
1171                                         else {
1172                                                 log_error("read(): %m");
1173                                                 r = -errno;
1174                                                 goto finish;
1175                                         }
1176                                 }  else
1177                                         out_buffer_full += (size_t) k;
1178                         }
1179
1180                         if (stdout_writable && out_buffer_full > 0) {
1181
1182                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1183                                 if (k < 0) {
1184
1185                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1186                                                 stdout_writable = false;
1187                                         else {
1188                                                 log_error("write(): %m");
1189                                                 r = -errno;
1190                                                 goto finish;
1191                                         }
1192
1193                                 } else {
1194                                         assert(out_buffer_full >= (size_t) k);
1195                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1196                                         out_buffer_full -= k;
1197                                 }
1198                         }
1199                 }
1200         }
1201
1202 finish:
1203         if (ep >= 0)
1204                 close_nointr_nofail(ep);
1205
1206         if (signal_fd >= 0)
1207                 close_nointr_nofail(signal_fd);
1208
1209         return r;
1210 }
1211
1212 int main(int argc, char *argv[]) {
1213         pid_t pid = 0;
1214         int r = EXIT_FAILURE, k;
1215         _cleanup_free_ char *machine_root = NULL, *name = NULL, *escaped = NULL, *newcg = NULL;
1216         _cleanup_close_ int master = -1;
1217         int n_fd_passed;
1218         const char *console = NULL;
1219         struct termios saved_attr, raw_attr;
1220         sigset_t mask;
1221         bool saved_attr_valid = false;
1222         struct winsize ws;
1223         int kmsg_socket_pair[2] = { -1, -1 };
1224         FDSet *fds = NULL;
1225
1226         log_parse_environment();
1227         log_open();
1228
1229         r = parse_argv(argc, argv);
1230         if (r <= 0)
1231                 goto finish;
1232
1233         if (arg_directory) {
1234                 char *p;
1235
1236                 p = path_make_absolute_cwd(arg_directory);
1237                 free(arg_directory);
1238                 arg_directory = p;
1239         } else
1240                 arg_directory = get_current_dir_name();
1241
1242         if (!arg_directory) {
1243                 log_error("Failed to determine path, please use -D.");
1244                 goto finish;
1245         }
1246
1247         path_kill_slashes(arg_directory);
1248
1249         if (!arg_machine) {
1250                 arg_machine = strdup(path_get_file_name(arg_directory));
1251                 if (!arg_machine) {
1252                         log_oom();
1253                         goto finish;
1254                 }
1255
1256                 hostname_cleanup(arg_machine);
1257                 if (isempty(arg_machine)) {
1258                         log_error("Failed to determine machine name automatically, please use -M.");
1259                         goto finish;
1260                 }
1261         }
1262
1263         if (geteuid() != 0) {
1264                 log_error("Need to be root.");
1265                 goto finish;
1266         }
1267
1268         if (sd_booted() <= 0) {
1269                 log_error("Not running on a systemd system.");
1270                 goto finish;
1271         }
1272
1273         if (path_equal(arg_directory, "/")) {
1274                 log_error("Spawning container on root directory not supported.");
1275                 goto finish;
1276         }
1277
1278         if (path_is_os_tree(arg_directory) <= 0) {
1279                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1280                 goto finish;
1281         }
1282
1283         log_close();
1284         n_fd_passed = sd_listen_fds(false);
1285         if (n_fd_passed > 0) {
1286                 k = fdset_new_listen_fds(&fds, false);
1287                 if (k < 0) {
1288                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1289                         goto finish;
1290                 }
1291         }
1292         fdset_close_others(fds);
1293         log_open();
1294
1295         k = cg_get_machine_path(&machine_root);
1296         if (k < 0) {
1297                 log_error("Failed to determine machine cgroup path: %s", strerror(-k));
1298                 goto finish;
1299         }
1300
1301         name = strappend(arg_machine, ".nspawn");
1302         if (!name) {
1303                 log_oom();
1304                 goto finish;
1305         }
1306
1307         escaped = cg_escape(name);
1308         if (!escaped) {
1309                 log_oom();
1310                 goto finish;
1311         }
1312
1313         newcg = strjoin(machine_root, "/", escaped, NULL);
1314         if (!newcg) {
1315                 log_oom();
1316                 goto finish;
1317         }
1318
1319         r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, newcg, false);
1320         if (r <= 0 && r != -ENOENT) {
1321                 log_error("Container already running.");
1322
1323                 free(newcg);
1324                 newcg = NULL;
1325
1326                 goto finish;
1327         }
1328
1329         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1330         if (master < 0) {
1331                 log_error("Failed to acquire pseudo tty: %m");
1332                 goto finish;
1333         }
1334
1335         console = ptsname(master);
1336         if (!console) {
1337                 log_error("Failed to determine tty name: %m");
1338                 goto finish;
1339         }
1340
1341         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1342
1343         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1344                 ioctl(master, TIOCSWINSZ, &ws);
1345
1346         if (unlockpt(master) < 0) {
1347                 log_error("Failed to unlock tty: %m");
1348                 goto finish;
1349         }
1350
1351         if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1352                 saved_attr_valid = true;
1353
1354                 raw_attr = saved_attr;
1355                 cfmakeraw(&raw_attr);
1356                 raw_attr.c_lflag &= ~ECHO;
1357         }
1358
1359         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1360                 log_error("Failed to create kmsg socket pair.");
1361                 goto finish;
1362         }
1363
1364         assert_se(sigemptyset(&mask) == 0);
1365         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1366         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1367
1368         for (;;) {
1369                 siginfo_t status;
1370                 int pipefd[2], pipefd2[2];
1371
1372                 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1373                         log_error("pipe2(): %m");
1374                         goto finish;
1375                 }
1376
1377                 if (pipe2(pipefd2, O_NONBLOCK|O_CLOEXEC) < 0) {
1378                         log_error("pipe2(): %m");
1379                         close_pipe(pipefd);
1380                         goto finish;
1381                 }
1382
1383                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1384                 if (pid < 0) {
1385                         if (errno == EINVAL)
1386                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1387                         else
1388                                 log_error("clone() failed: %m");
1389
1390                         goto finish;
1391                 }
1392
1393                 if (pid == 0) {
1394                         /* child */
1395                         const char *home = NULL;
1396                         uid_t uid = (uid_t) -1;
1397                         gid_t gid = (gid_t) -1;
1398                         unsigned n_env = 2;
1399                         const char *envp[] = {
1400                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1401                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1402                                 NULL, /* TERM */
1403                                 NULL, /* HOME */
1404                                 NULL, /* USER */
1405                                 NULL, /* LOGNAME */
1406                                 NULL, /* container_uuid */
1407                                 NULL, /* LISTEN_FDS */
1408                                 NULL, /* LISTEN_PID */
1409                                 NULL
1410                         };
1411
1412                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1413                         if (envp[n_env])
1414                                 n_env ++;
1415
1416                         /* Wait for the parent process to log our PID */
1417                         close_nointr_nofail(pipefd[1]);
1418                         fd_wait_for_event(pipefd[0], POLLHUP, -1);
1419                         close_nointr_nofail(pipefd[0]);
1420
1421                         close_nointr_nofail(master);
1422                         master = -1;
1423
1424                         if (saved_attr_valid) {
1425                                 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1426                                         log_error("Failed to set terminal attributes: %m");
1427                                         goto child_fail;
1428                                 }
1429                         }
1430
1431                         close_nointr(STDIN_FILENO);
1432                         close_nointr(STDOUT_FILENO);
1433                         close_nointr(STDERR_FILENO);
1434
1435                         close_nointr_nofail(kmsg_socket_pair[0]);
1436                         kmsg_socket_pair[0] = -1;
1437
1438                         reset_all_signal_handlers();
1439
1440                         assert_se(sigemptyset(&mask) == 0);
1441                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1442
1443                         k = open_terminal(console, O_RDWR);
1444                         if (k != STDIN_FILENO) {
1445                                 if (k >= 0) {
1446                                         close_nointr_nofail(k);
1447                                         k = -EINVAL;
1448                                 }
1449
1450                                 log_error("Failed to open console: %s", strerror(-k));
1451                                 goto child_fail;
1452                         }
1453
1454                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1455                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1456                                 log_error("Failed to duplicate console: %m");
1457                                 goto child_fail;
1458                         }
1459
1460                         if (setsid() < 0) {
1461                                 log_error("setsid() failed: %m");
1462                                 goto child_fail;
1463                         }
1464
1465                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1466                                 log_error("PR_SET_PDEATHSIG failed: %m");
1467                                 goto child_fail;
1468                         }
1469
1470                         if (setup_cgroup(newcg) < 0)
1471                                 goto child_fail;
1472
1473                         close_pipe(pipefd2);
1474
1475                         /* Mark everything as slave, so that we still
1476                          * receive mounts from the real root, but don't
1477                          * propagate mounts to the real root. */
1478                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1479                                 log_error("MS_SLAVE|MS_REC failed: %m");
1480                                 goto child_fail;
1481                         }
1482
1483                         /* Turn directory into bind mount */
1484                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1485                                 log_error("Failed to make bind mount.");
1486                                 goto child_fail;
1487                         }
1488
1489                         if (arg_read_only)
1490                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1491                                         log_error("Failed to make read-only.");
1492                                         goto child_fail;
1493                                 }
1494
1495                         if (mount_all(arg_directory) < 0)
1496                                 goto child_fail;
1497
1498                         if (copy_devnodes(arg_directory) < 0)
1499                                 goto child_fail;
1500
1501                         if (setup_ptmx(arg_directory) < 0)
1502                                 goto child_fail;
1503
1504                         dev_setup(arg_directory);
1505
1506                         if (setup_dev_console(arg_directory, console) < 0)
1507                                 goto child_fail;
1508
1509                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1510                                 goto child_fail;
1511
1512                         close_nointr_nofail(kmsg_socket_pair[1]);
1513                         kmsg_socket_pair[1] = -1;
1514
1515                         if (setup_boot_id(arg_directory) < 0)
1516                                 goto child_fail;
1517
1518                         if (setup_timezone(arg_directory) < 0)
1519                                 goto child_fail;
1520
1521                         if (setup_resolv_conf(arg_directory) < 0)
1522                                 goto child_fail;
1523
1524                         if (setup_journal(arg_directory) < 0)
1525                                 goto child_fail;
1526
1527                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1528                                 goto child_fail;
1529
1530                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1531                                 goto child_fail;
1532
1533                         if (chdir(arg_directory) < 0) {
1534                                 log_error("chdir(%s) failed: %m", arg_directory);
1535                                 goto child_fail;
1536                         }
1537
1538                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1539                                 log_error("mount(MS_MOVE) failed: %m");
1540                                 goto child_fail;
1541                         }
1542
1543                         if (chroot(".") < 0) {
1544                                 log_error("chroot() failed: %m");
1545                                 goto child_fail;
1546                         }
1547
1548                         if (chdir("/") < 0) {
1549                                 log_error("chdir() failed: %m");
1550                                 goto child_fail;
1551                         }
1552
1553                         umask(0022);
1554
1555                         loopback_setup();
1556
1557                         if (drop_capabilities() < 0) {
1558                                 log_error("drop_capabilities() failed: %m");
1559                                 goto child_fail;
1560                         }
1561
1562                         if (arg_user) {
1563
1564                                 /* Note that this resolves user names
1565                                  * inside the container, and hence
1566                                  * accesses the NSS modules from the
1567                                  * container and not the host. This is
1568                                  * a bit weird... */
1569
1570                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1571                                         log_error("get_user_creds() failed: %m");
1572                                         goto child_fail;
1573                                 }
1574
1575                                 if (mkdir_parents_label(home, 0775) < 0) {
1576                                         log_error("mkdir_parents_label() failed: %m");
1577                                         goto child_fail;
1578                                 }
1579
1580                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1581                                         log_error("mkdir_safe_label() failed: %m");
1582                                         goto child_fail;
1583                                 }
1584
1585                                 if (initgroups((const char*)arg_user, gid) < 0) {
1586                                         log_error("initgroups() failed: %m");
1587                                         goto child_fail;
1588                                 }
1589
1590                                 if (setresgid(gid, gid, gid) < 0) {
1591                                         log_error("setregid() failed: %m");
1592                                         goto child_fail;
1593                                 }
1594
1595                                 if (setresuid(uid, uid, uid) < 0) {
1596                                         log_error("setreuid() failed: %m");
1597                                         goto child_fail;
1598                                 }
1599                         } else {
1600                                 /* Reset everything fully to 0, just in case */
1601
1602                                 if (setgroups(0, NULL) < 0) {
1603                                         log_error("setgroups() failed: %m");
1604                                         goto child_fail;
1605                                 }
1606
1607                                 if (setresgid(0, 0, 0) < 0) {
1608                                         log_error("setregid() failed: %m");
1609                                         goto child_fail;
1610                                 }
1611
1612                                 if (setresuid(0, 0, 0) < 0) {
1613                                         log_error("setreuid() failed: %m");
1614                                         goto child_fail;
1615                                 }
1616                         }
1617
1618                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1619                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1620                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1621                                 log_oom();
1622                                 goto child_fail;
1623                         }
1624
1625                         if (arg_uuid) {
1626                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1627                                         log_oom();
1628                                         goto child_fail;
1629                                 }
1630                         }
1631
1632                         if (fdset_size(fds) > 0) {
1633                                 k = fdset_cloexec(fds, false);
1634                                 if (k < 0) {
1635                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1636                                         goto child_fail;
1637                                 }
1638
1639                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1640                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) 1) < 0)) {
1641                                         log_oom();
1642                                         goto child_fail;
1643                                 }
1644                         }
1645
1646                         setup_hostname();
1647
1648                         if (arg_boot) {
1649                                 char **a;
1650                                 size_t l;
1651
1652                                 /* Automatically search for the init system */
1653
1654                                 l = 1 + argc - optind;
1655                                 a = newa(char*, l + 1);
1656                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1657
1658                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1659                                 execve(a[0], a, (char**) envp);
1660
1661                                 a[0] = (char*) "/lib/systemd/systemd";
1662                                 execve(a[0], a, (char**) envp);
1663
1664                                 a[0] = (char*) "/sbin/init";
1665                                 execve(a[0], a, (char**) envp);
1666                         } else if (argc > optind)
1667                                 execvpe(argv[optind], argv + optind, (char**) envp);
1668                         else {
1669                                 chdir(home ? home : "/root");
1670                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1671                         }
1672
1673                         log_error("execv() failed: %m");
1674
1675                 child_fail:
1676                         _exit(EXIT_FAILURE);
1677                 }
1678
1679                 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1680                 close_nointr_nofail(pipefd[0]);
1681                 close_nointr_nofail(pipefd[1]);
1682
1683                 /* Wait for the child process to establish cgroup hierarchy */
1684                 close_nointr_nofail(pipefd2[1]);
1685                 fd_wait_for_event(pipefd2[0], POLLHUP, -1);
1686                 close_nointr_nofail(pipefd2[0]);
1687
1688                 save_attributes(newcg, pid, arg_uuid, arg_directory);
1689
1690                 fdset_free(fds);
1691                 fds = NULL;
1692
1693                 if (process_pty(master, pid, &mask) < 0)
1694                         goto finish;
1695
1696                 if (saved_attr_valid)
1697                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1698
1699                 r = wait_for_terminate(pid, &status);
1700                 if (r < 0) {
1701                         r = EXIT_FAILURE;
1702                         break;
1703                 }
1704
1705                 if (status.si_code == CLD_EXITED) {
1706                         if (status.si_status != 0) {
1707                                 log_error("Container failed with error code %i.", status.si_status);
1708                                 r = status.si_status;
1709                                 break;
1710                         }
1711
1712                         log_debug("Container exited successfully.");
1713                         break;
1714                 } else if (status.si_code == CLD_KILLED &&
1715                            status.si_status == SIGINT) {
1716                         log_info("Container has been shut down.");
1717                         r = 0;
1718                         break;
1719                 } else if (status.si_code == CLD_KILLED &&
1720                            status.si_status == SIGHUP) {
1721                         log_info("Container is being rebooted.");
1722                         continue;
1723                 } else if (status.si_code == CLD_KILLED ||
1724                            status.si_code == CLD_DUMPED) {
1725
1726                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1727                         r = EXIT_FAILURE;
1728                         break;
1729                 } else {
1730                         log_error("Container failed due to unknown reason.");
1731                         r = EXIT_FAILURE;
1732                         break;
1733                 }
1734         }
1735
1736 finish:
1737         if (saved_attr_valid)
1738                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1739
1740         close_pipe(kmsg_socket_pair);
1741
1742         if (newcg)
1743                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1744
1745         free(arg_directory);
1746         free(arg_machine);
1747         strv_free(arg_controllers);
1748
1749         fdset_free(fds);
1750
1751         return r;
1752 }