chiark / gitweb /
15e48739b9a3a015a890ba0707a0e53e907315ec
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/poll.h>
37 #include <sys/epoll.h>
38 #include <termios.h>
39 #include <sys/signalfd.h>
40 #include <grp.h>
41 #include <linux/fs.h>
42 #include <sys/un.h>
43 #include <sys/socket.h>
44 #include <linux/netlink.h>
45
46 #include <systemd/sd-daemon.h>
47 #include <systemd/sd-bus.h>
48
49 #include "log.h"
50 #include "util.h"
51 #include "mkdir.h"
52 #include "macro.h"
53 #include "audit.h"
54 #include "missing.h"
55 #include "cgroup-util.h"
56 #include "strv.h"
57 #include "path-util.h"
58 #include "loopback-setup.h"
59 #include "sd-id128.h"
60 #include "dev-setup.h"
61 #include "fdset.h"
62 #include "build.h"
63 #include "fileio.h"
64 #include "bus-internal.h"
65 #include "bus-message.h"
66
/* Fallback group ID used for the "gid=" option of the container's devpts
 * mount when the build system does not define TTY_GID itself. */
#if !defined(TTY_GID)
#  define TTY_GID 5
#endif
70
/* Journal linking mode, selected on the command line with --link-journal=
 * (or -j). The comment next to each value is the keyword parse_argv() maps
 * to it. */
typedef enum LinkJournal {
        LINK_NO,        /* "no" */
        LINK_AUTO,      /* "auto", the initial value of arg_link_journal */
        LINK_HOST,      /* "host" */
        LINK_GUEST,     /* "guest", also selected by -j */
} LinkJournal;
77
78 static char *arg_directory = NULL;
79 static char *arg_user = NULL;
80 static sd_id128_t arg_uuid = {};
81 static char *arg_machine = NULL;
82 static const char *arg_slice = NULL;
83 static bool arg_private_network = false;
84 static bool arg_read_only = false;
85 static bool arg_boot = false;
86 static LinkJournal arg_link_journal = LINK_AUTO;
87 static uint64_t arg_retain =
88         (1ULL << CAP_CHOWN) |
89         (1ULL << CAP_DAC_OVERRIDE) |
90         (1ULL << CAP_DAC_READ_SEARCH) |
91         (1ULL << CAP_FOWNER) |
92         (1ULL << CAP_FSETID) |
93         (1ULL << CAP_IPC_OWNER) |
94         (1ULL << CAP_KILL) |
95         (1ULL << CAP_LEASE) |
96         (1ULL << CAP_LINUX_IMMUTABLE) |
97         (1ULL << CAP_NET_BIND_SERVICE) |
98         (1ULL << CAP_NET_BROADCAST) |
99         (1ULL << CAP_NET_RAW) |
100         (1ULL << CAP_SETGID) |
101         (1ULL << CAP_SETFCAP) |
102         (1ULL << CAP_SETPCAP) |
103         (1ULL << CAP_SETUID) |
104         (1ULL << CAP_SYS_ADMIN) |
105         (1ULL << CAP_SYS_CHROOT) |
106         (1ULL << CAP_SYS_NICE) |
107         (1ULL << CAP_SYS_PTRACE) |
108         (1ULL << CAP_SYS_TTY_CONFIG) |
109         (1ULL << CAP_SYS_RESOURCE) |
110         (1ULL << CAP_SYS_BOOT) |
111         (1ULL << CAP_AUDIT_WRITE) |
112         (1ULL << CAP_AUDIT_CONTROL);
113 static char **arg_bind = NULL;
114 static char **arg_bind_ro = NULL;
115
/* Print the command line usage summary to stdout.
 *
 * Always returns 0.
 *
 * Note: -j is documented as "--link-journal=guest" because that is what
 * parse_argv() actually selects (LINK_GUEST) for this switch. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                Show this help\n"
               "     --version             Print version string\n"
               "  -D --directory=NAME      Root directory for the container\n"
               "  -b --boot                Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER           Run the command under specified user or uid\n"
               "     --uuid=UUID           Set a specific machine UUID for the container\n"
               "  -M --machine=NAME        Set the machine name for the container\n"
               "  -S --slice=SLICE         Place the container in the specified slice\n"
               "     --private-network     Disable network in container\n"
               "     --read-only           Mount the root directory read-only\n"
               "     --capability=CAP      In addition to the default, retain specified\n"
               "                           capability\n"
               "     --link-journal=MODE   Link up guest journal, one of no, auto, guest, host\n"
               "  -j                       Equivalent to --link-journal=guest\n"
               "     --bind=PATH[:PATH]    Bind mount a file or directory from the host into\n"
               "                           the container\n"
               "     --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
               program_invocation_short_name);

        return 0;
}
141
142 static int parse_argv(int argc, char *argv[]) {
143
144         enum {
145                 ARG_VERSION = 0x100,
146                 ARG_PRIVATE_NETWORK,
147                 ARG_UUID,
148                 ARG_READ_ONLY,
149                 ARG_CAPABILITY,
150                 ARG_LINK_JOURNAL,
151                 ARG_BIND,
152                 ARG_BIND_RO
153         };
154
155         static const struct option options[] = {
156                 { "help",            no_argument,       NULL, 'h'                 },
157                 { "version",         no_argument,       NULL, ARG_VERSION         },
158                 { "directory",       required_argument, NULL, 'D'                 },
159                 { "user",            required_argument, NULL, 'u'                 },
160                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
161                 { "boot",            no_argument,       NULL, 'b'                 },
162                 { "uuid",            required_argument, NULL, ARG_UUID            },
163                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
164                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
165                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
166                 { "bind",            required_argument, NULL, ARG_BIND            },
167                 { "bind-ro",         required_argument, NULL, ARG_BIND_RO         },
168                 { "machine",         required_argument, NULL, 'M'                 },
169                 { "slice",           required_argument, NULL, 'S'                 },
170                 { NULL,              0,                 NULL, 0                   }
171         };
172
173         int c, r;
174
175         assert(argc >= 0);
176         assert(argv);
177
178         while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
179
180                 switch (c) {
181
182                 case 'h':
183                         help();
184                         return 0;
185
186                 case ARG_VERSION:
187                         puts(PACKAGE_STRING);
188                         puts(SYSTEMD_FEATURES);
189                         return 0;
190
191                 case 'D':
192                         free(arg_directory);
193                         arg_directory = canonicalize_file_name(optarg);
194                         if (!arg_directory) {
195                                 log_error("Failed to canonicalize root directory.");
196                                 return -ENOMEM;
197                         }
198
199                         break;
200
201                 case 'u':
202                         free(arg_user);
203                         arg_user = strdup(optarg);
204                         if (!arg_user)
205                                 return log_oom();
206
207                         break;
208
209                 case ARG_PRIVATE_NETWORK:
210                         arg_private_network = true;
211                         break;
212
213                 case 'b':
214                         arg_boot = true;
215                         break;
216
217                 case ARG_UUID:
218                         r = sd_id128_from_string(optarg, &arg_uuid);
219                         if (r < 0) {
220                                 log_error("Invalid UUID: %s", optarg);
221                                 return r;
222                         }
223                         break;
224
225                 case 'S':
226                         arg_slice = strdup(optarg);
227                         break;
228
229                 case 'M':
230                         if (!hostname_is_valid(optarg)) {
231                                 log_error("Invalid machine name: %s", optarg);
232                                 return -EINVAL;
233                         }
234
235                         free(arg_machine);
236                         arg_machine = strdup(optarg);
237                         if (!arg_machine)
238                                 return log_oom();
239
240                         break;
241
242                 case ARG_READ_ONLY:
243                         arg_read_only = true;
244                         break;
245
246                 case ARG_CAPABILITY: {
247                         char *state, *word;
248                         size_t length;
249
250                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
251                                 cap_value_t cap;
252                                 char *t;
253
254                                 t = strndup(word, length);
255                                 if (!t)
256                                         return log_oom();
257
258                                 if (cap_from_name(t, &cap) < 0) {
259                                         log_error("Failed to parse capability %s.", t);
260                                         free(t);
261                                         return -EINVAL;
262                                 }
263
264                                 free(t);
265                                 arg_retain |= 1ULL << (uint64_t) cap;
266                         }
267
268                         break;
269                 }
270
271                 case 'j':
272                         arg_link_journal = LINK_GUEST;
273                         break;
274
275                 case ARG_LINK_JOURNAL:
276                         if (streq(optarg, "auto"))
277                                 arg_link_journal = LINK_AUTO;
278                         else if (streq(optarg, "no"))
279                                 arg_link_journal = LINK_NO;
280                         else if (streq(optarg, "guest"))
281                                 arg_link_journal = LINK_GUEST;
282                         else if (streq(optarg, "host"))
283                                 arg_link_journal = LINK_HOST;
284                         else {
285                                 log_error("Failed to parse link journal mode %s", optarg);
286                                 return -EINVAL;
287                         }
288
289                         break;
290
291                 case ARG_BIND:
292                 case ARG_BIND_RO: {
293                         _cleanup_free_ char *a = NULL, *b = NULL;
294                         char *e;
295                         char ***x;
296
297                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
298
299                         e = strchr(optarg, ':');
300                         if (e) {
301                                 a = strndup(optarg, e - optarg);
302                                 b = strdup(e + 1);
303                         } else {
304                                 a = strdup(optarg);
305                                 b = strdup(optarg);
306                         }
307
308                         if (!a || !b)
309                                 return log_oom();
310
311                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
312                                 log_error("Invalid bind mount specification: %s", optarg);
313                                 return -EINVAL;
314                         }
315
316                         r = strv_extend(x, a);
317                         if (r < 0)
318                                 return r;
319
320                         r = strv_extend(x, b);
321                         if (r < 0)
322                                 return r;
323
324                         break;
325                 }
326
327                 case '?':
328                         return -EINVAL;
329
330                 default:
331                         log_error("Unknown option code %c", c);
332                         return -EINVAL;
333                 }
334         }
335
336         return 1;
337 }
338
339 static int mount_all(const char *dest) {
340
341         typedef struct MountPoint {
342                 const char *what;
343                 const char *where;
344                 const char *type;
345                 const char *options;
346                 unsigned long flags;
347                 bool fatal;
348         } MountPoint;
349
350         static const MountPoint mount_table[] = {
351                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
352                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
353                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
354                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
355                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
356                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
357                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
358                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
359 #ifdef HAVE_SELINUX
360                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
361                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
362 #endif
363         };
364
365         unsigned k;
366         int r = 0;
367
368         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
369                 _cleanup_free_ char *where = NULL;
370                 int t;
371
372                 where = strjoin(dest, "/", mount_table[k].where, NULL);
373                 if (!where)
374                         return log_oom();
375
376                 t = path_is_mount_point(where, true);
377                 if (t < 0) {
378                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
379
380                         if (r == 0)
381                                 r = t;
382
383                         continue;
384                 }
385
386                 /* Skip this entry if it is not a remount. */
387                 if (mount_table[k].what && t > 0)
388                         continue;
389
390                 mkdir_p(where, 0755);
391
392                 if (mount(mount_table[k].what,
393                           where,
394                           mount_table[k].type,
395                           mount_table[k].flags,
396                           mount_table[k].options) < 0 &&
397                     mount_table[k].fatal) {
398
399                         log_error("mount(%s) failed: %m", where);
400
401                         if (r == 0)
402                                 r = -errno;
403                 }
404         }
405
406         return r;
407 }
408
409 static int mount_binds(const char *dest, char **l, unsigned long flags) {
410         char **x, **y;
411
412         STRV_FOREACH_PAIR(x, y, l) {
413                 _cleanup_free_ char *where = NULL;
414                 struct stat source_st, dest_st;
415
416                 if (stat(*x, &source_st) < 0) {
417                         log_error("failed to stat %s: %m", *x);
418                         return -errno;
419                 }
420
421                 where = strjoin(dest, "/", *y, NULL);
422                 if (!where)
423                         return log_oom();
424
425                 if (stat(where, &dest_st) == 0) {
426                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
427                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
428                                                 *x, where);
429                                 return -EINVAL;
430                         }
431                 } else {
432                         /* Create the mount point, but be conservative -- refuse to create block
433                          * and char devices. */
434                         if (S_ISDIR(source_st.st_mode))
435                                 mkdir_p_label(where, 0755);
436                         else if (S_ISFIFO(source_st.st_mode))
437                                 mkfifo(where, 0644);
438                         else if (S_ISSOCK(source_st.st_mode))
439                                 mknod(where, 0644 | S_IFSOCK, 0);
440                         else if (S_ISREG(source_st.st_mode))
441                                 touch(where);
442                         else {
443                                 log_error("Refusing to create mountpoint for file: %s", *x);
444                                 return -ENOTSUP;
445                         }
446                 }
447
448                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
449                         log_error("mount(%s) failed: %m", where);
450                         return -errno;
451                 }
452
453                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
454                         log_error("mount(%s) failed: %m", where);
455                         return -errno;
456                 }
457         }
458
459         return 0;
460 }
461
462 static int setup_timezone(const char *dest) {
463         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
464         char *z, *y;
465         int r;
466
467         assert(dest);
468
469         /* Fix the timezone, if possible */
470         r = readlink_malloc("/etc/localtime", &p);
471         if (r < 0) {
472                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
473                 return 0;
474         }
475
476         z = path_startswith(p, "../usr/share/zoneinfo/");
477         if (!z)
478                 z = path_startswith(p, "/usr/share/zoneinfo/");
479         if (!z) {
480                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
481                 return 0;
482         }
483
484         where = strappend(dest, "/etc/localtime");
485         if (!where)
486                 return log_oom();
487
488         r = readlink_malloc(where, &q);
489         if (r >= 0) {
490                 y = path_startswith(q, "../usr/share/zoneinfo/");
491                 if (!y)
492                         y = path_startswith(q, "/usr/share/zoneinfo/");
493
494
495                 /* Already pointing to the right place? Then do nothing .. */
496                 if (y && streq(y, z))
497                         return 0;
498         }
499
500         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
501         if (!check)
502                 return log_oom();
503
504         if (access(check, F_OK) < 0) {
505                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
506                 return 0;
507         }
508
509         what = strappend("../usr/share/zoneinfo/", z);
510         if (!what)
511                 return log_oom();
512
513         unlink(where);
514         if (symlink(what, where) < 0) {
515                 log_error("Failed to correct timezone of container: %m");
516                 return 0;
517         }
518
519         return 0;
520 }
521
522 static int setup_resolv_conf(const char *dest) {
523         char _cleanup_free_ *where = NULL;
524
525         assert(dest);
526
527         if (arg_private_network)
528                 return 0;
529
530         /* Fix resolv.conf, if possible */
531         where = strappend(dest, "/etc/resolv.conf");
532         if (!where)
533                 return log_oom();
534
535         /* We don't really care for the results of this really. If it
536          * fails, it fails, but meh... */
537         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
538
539         return 0;
540 }
541
542 static int setup_boot_id(const char *dest) {
543         _cleanup_free_ char *from = NULL, *to = NULL;
544         sd_id128_t rnd;
545         char as_uuid[37];
546         int r;
547
548         assert(dest);
549
550         /* Generate a new randomized boot ID, so that each boot-up of
551          * the container gets a new one */
552
553         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
554         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
555         if (!from || !to)
556                 return log_oom();
557
558         r = sd_id128_randomize(&rnd);
559         if (r < 0) {
560                 log_error("Failed to generate random boot id: %s", strerror(-r));
561                 return r;
562         }
563
564         snprintf(as_uuid, sizeof(as_uuid),
565                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
566                  SD_ID128_FORMAT_VAL(rnd));
567         char_array_0(as_uuid);
568
569         r = write_string_file(from, as_uuid);
570         if (r < 0) {
571                 log_error("Failed to write boot id: %s", strerror(-r));
572                 return r;
573         }
574
575         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
576                 log_error("Failed to bind mount boot id: %m");
577                 r = -errno;
578         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
579                 log_warning("Failed to make boot id read-only: %m");
580
581         unlink(from);
582         return r;
583 }
584
585 static int copy_devnodes(const char *dest) {
586
587         static const char devnodes[] =
588                 "null\0"
589                 "zero\0"
590                 "full\0"
591                 "random\0"
592                 "urandom\0"
593                 "tty\0";
594
595         const char *d;
596         int r = 0;
597         _cleanup_umask_ mode_t u;
598
599         assert(dest);
600
601         u = umask(0000);
602
603         NULSTR_FOREACH(d, devnodes) {
604                 struct stat st;
605                 _cleanup_free_ char *from = NULL, *to = NULL;
606
607                 asprintf(&from, "/dev/%s", d);
608                 asprintf(&to, "%s/dev/%s", dest, d);
609
610                 if (!from || !to) {
611                         log_oom();
612
613                         if (r == 0)
614                                 r = -ENOMEM;
615
616                         break;
617                 }
618
619                 if (stat(from, &st) < 0) {
620
621                         if (errno != ENOENT) {
622                                 log_error("Failed to stat %s: %m", from);
623                                 if (r == 0)
624                                         r = -errno;
625                         }
626
627                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
628
629                         log_error("%s is not a char or block device, cannot copy", from);
630                         if (r == 0)
631                                 r = -EIO;
632
633                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
634
635                         log_error("mknod(%s) failed: %m", dest);
636                         if (r == 0)
637                                 r = -errno;
638                 }
639         }
640
641         return r;
642 }
643
644 static int setup_ptmx(const char *dest) {
645         _cleanup_free_ char *p = NULL;
646
647         p = strappend(dest, "/dev/ptmx");
648         if (!p)
649                 return log_oom();
650
651         if (symlink("pts/ptmx", p) < 0) {
652                 log_error("Failed to create /dev/ptmx symlink: %m");
653                 return -errno;
654         }
655
656         return 0;
657 }
658
659 static int setup_dev_console(const char *dest, const char *console) {
660         struct stat st;
661         _cleanup_free_ char *to = NULL;
662         int r;
663         _cleanup_umask_ mode_t u;
664
665         assert(dest);
666         assert(console);
667
668         u = umask(0000);
669
670         if (stat(console, &st) < 0) {
671                 log_error("Failed to stat %s: %m", console);
672                 return -errno;
673
674         } else if (!S_ISCHR(st.st_mode)) {
675                 log_error("/dev/console is not a char device");
676                 return -EIO;
677         }
678
679         r = chmod_and_chown(console, 0600, 0, 0);
680         if (r < 0) {
681                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
682                 return r;
683         }
684
685         if (asprintf(&to, "%s/dev/console", dest) < 0)
686                 return log_oom();
687
688         /* We need to bind mount the right tty to /dev/console since
689          * ptys can only exist on pts file systems. To have something
690          * to bind mount things on we create a device node first, that
691          * has the right major/minor (note that the major minor
692          * doesn't actually matter here, since we mount it over
693          * anyway). */
694
695         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
696                 log_error("mknod() for /dev/console failed: %m");
697                 return -errno;
698         }
699
700         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
701                 log_error("Bind mount for /dev/console failed: %m");
702                 return -errno;
703         }
704
705         return 0;
706 }
707
708 static int setup_kmsg(const char *dest, int kmsg_socket) {
709         _cleanup_free_ char *from = NULL, *to = NULL;
710         int r, fd, k;
711         _cleanup_umask_ mode_t u;
712         union {
713                 struct cmsghdr cmsghdr;
714                 uint8_t buf[CMSG_SPACE(sizeof(int))];
715         } control = {};
716         struct msghdr mh = {
717                 .msg_control = &control,
718                 .msg_controllen = sizeof(control),
719         };
720         struct cmsghdr *cmsg;
721
722         assert(dest);
723         assert(kmsg_socket >= 0);
724
725         u = umask(0000);
726
727         /* We create the kmsg FIFO as /dev/kmsg, but immediately
728          * delete it after bind mounting it to /proc/kmsg. While FIFOs
729          * on the reading side behave very similar to /proc/kmsg,
730          * their writing side behaves differently from /dev/kmsg in
731          * that writing blocks when nothing is reading. In order to
732          * avoid any problems with containers deadlocking due to this
733          * we simply make /dev/kmsg unavailable to the container. */
734         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
735             asprintf(&to, "%s/proc/kmsg", dest) < 0)
736                 return log_oom();
737
738         if (mkfifo(from, 0600) < 0) {
739                 log_error("mkfifo() for /dev/kmsg failed: %m");
740                 return -errno;
741         }
742
743         r = chmod_and_chown(from, 0600, 0, 0);
744         if (r < 0) {
745                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
746                 return r;
747         }
748
749         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
750                 log_error("Bind mount for /proc/kmsg failed: %m");
751                 return -errno;
752         }
753
754         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
755         if (fd < 0) {
756                 log_error("Failed to open fifo: %m");
757                 return -errno;
758         }
759
760         cmsg = CMSG_FIRSTHDR(&mh);
761         cmsg->cmsg_level = SOL_SOCKET;
762         cmsg->cmsg_type = SCM_RIGHTS;
763         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
764         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
765
766         mh.msg_controllen = cmsg->cmsg_len;
767
768         /* Store away the fd in the socket, so that it stays open as
769          * long as we run the child */
770         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
771         close_nointr_nofail(fd);
772
773         if (k < 0) {
774                 log_error("Failed to send FIFO fd: %m");
775                 return -errno;
776         }
777
778         /* And now make the FIFO unavailable as /dev/kmsg... */
779         unlink(from);
780         return 0;
781 }
782
783 static int setup_hostname(void) {
784
785         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
786                 return -errno;
787
788         return 0;
789 }
790
791 static int setup_journal(const char *directory) {
792         sd_id128_t machine_id;
793         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
794         char *id;
795         int r;
796
797         if (arg_link_journal == LINK_NO)
798                 return 0;
799
800         p = strappend(directory, "/etc/machine-id");
801         if (!p)
802                 return log_oom();
803
804         r = read_one_line_file(p, &b);
805         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
806                 return 0;
807         else if (r < 0) {
808                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
809                 return r;
810         }
811
812         id = strstrip(b);
813         if (isempty(id) && arg_link_journal == LINK_AUTO)
814                 return 0;
815
816         /* Verify validity */
817         r = sd_id128_from_string(id, &machine_id);
818         if (r < 0) {
819                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
820                 return r;
821         }
822
823         free(p);
824         p = strappend("/var/log/journal/", id);
825         q = strjoin(directory, "/var/log/journal/", id, NULL);
826         if (!p || !q)
827                 return log_oom();
828
829         if (path_is_mount_point(p, false) > 0) {
830                 if (arg_link_journal != LINK_AUTO) {
831                         log_error("%s: already a mount point, refusing to use for journal", p);
832                         return -EEXIST;
833                 }
834
835                 return 0;
836         }
837
838         if (path_is_mount_point(q, false) > 0) {
839                 if (arg_link_journal != LINK_AUTO) {
840                         log_error("%s: already a mount point, refusing to use for journal", q);
841                         return -EEXIST;
842                 }
843
844                 return 0;
845         }
846
847         r = readlink_and_make_absolute(p, &d);
848         if (r >= 0) {
849                 if ((arg_link_journal == LINK_GUEST ||
850                      arg_link_journal == LINK_AUTO) &&
851                     path_equal(d, q)) {
852
853                         r = mkdir_p(q, 0755);
854                         if (r < 0)
855                                 log_warning("failed to create directory %s: %m", q);
856                         return 0;
857                 }
858
859                 if (unlink(p) < 0) {
860                         log_error("Failed to remove symlink %s: %m", p);
861                         return -errno;
862                 }
863         } else if (r == -EINVAL) {
864
865                 if (arg_link_journal == LINK_GUEST &&
866                     rmdir(p) < 0) {
867
868                         if (errno == ENOTDIR) {
869                                 log_error("%s already exists and is neither a symlink nor a directory", p);
870                                 return r;
871                         } else {
872                                 log_error("Failed to remove %s: %m", p);
873                                 return -errno;
874                         }
875                 }
876         } else if (r != -ENOENT) {
877                 log_error("readlink(%s) failed: %m", p);
878                 return r;
879         }
880
881         if (arg_link_journal == LINK_GUEST) {
882
883                 if (symlink(q, p) < 0) {
884                         log_error("Failed to symlink %s to %s: %m", q, p);
885                         return -errno;
886                 }
887
888                 r = mkdir_p(q, 0755);
889                 if (r < 0)
890                         log_warning("failed to create directory %s: %m", q);
891                 return 0;
892         }
893
894         if (arg_link_journal == LINK_HOST) {
895                 r = mkdir_p(p, 0755);
896                 if (r < 0) {
897                         log_error("Failed to create %s: %m", p);
898                         return r;
899                 }
900
901         } else if (access(p, F_OK) < 0)
902                 return 0;
903
904         if (dir_is_empty(q) == 0) {
905                 log_error("%s not empty.", q);
906                 return -ENOTEMPTY;
907         }
908
909         r = mkdir_p(q, 0755);
910         if (r < 0) {
911                 log_error("Failed to create %s: %m", q);
912                 return r;
913         }
914
915         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
916                 log_error("Failed to bind mount journal from host into guest: %m");
917                 return -errno;
918         }
919
920         return 0;
921 }
922
923 static int drop_capabilities(void) {
924         return capability_bounding_set_drop(~arg_retain, false);
925 }
926
927 static int process_pty(int master, pid_t pid, sigset_t *mask) {
928
929         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
930         size_t in_buffer_full = 0, out_buffer_full = 0;
931         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
932         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
933         int ep = -1, signal_fd = -1, r;
934         bool tried_orderly_shutdown = false;
935
936         assert(master >= 0);
937         assert(pid > 0);
938         assert(mask);
939
940         fd_nonblock(STDIN_FILENO, 1);
941         fd_nonblock(STDOUT_FILENO, 1);
942         fd_nonblock(master, 1);
943
944         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
945         if (signal_fd < 0) {
946                 log_error("signalfd(): %m");
947                 r = -errno;
948                 goto finish;
949         }
950
951         ep = epoll_create1(EPOLL_CLOEXEC);
952         if (ep < 0) {
953                 log_error("Failed to create epoll: %m");
954                 r = -errno;
955                 goto finish;
956         }
957
958         /* We read from STDIN only if this is actually a TTY,
959          * otherwise we assume non-interactivity. */
960         if (isatty(STDIN_FILENO)) {
961                 zero(stdin_ev);
962                 stdin_ev.events = EPOLLIN|EPOLLET;
963                 stdin_ev.data.fd = STDIN_FILENO;
964
965                 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
966                         log_error("Failed to register STDIN in epoll: %m");
967                         r = -errno;
968                         goto finish;
969                 }
970         }
971
972         zero(stdout_ev);
973         stdout_ev.events = EPOLLOUT|EPOLLET;
974         stdout_ev.data.fd = STDOUT_FILENO;
975
976         zero(master_ev);
977         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
978         master_ev.data.fd = master;
979
980         zero(signal_ev);
981         signal_ev.events = EPOLLIN;
982         signal_ev.data.fd = signal_fd;
983
984         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
985                 if (errno != EPERM) {
986                         log_error("Failed to register stdout in epoll: %m");
987                         r = -errno;
988                         goto finish;
989                 }
990                 /* stdout without epoll support. Likely redirected to regular file. */
991                 stdout_writable = true;
992         }
993
994         if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
995             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
996                 log_error("Failed to register fds in epoll: %m");
997                 r = -errno;
998                 goto finish;
999         }
1000
1001         for (;;) {
1002                 struct epoll_event ev[16];
1003                 ssize_t k;
1004                 int i, nfds;
1005
1006                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1007                 if (nfds < 0) {
1008
1009                         if (errno == EINTR || errno == EAGAIN)
1010                                 continue;
1011
1012                         log_error("epoll_wait(): %m");
1013                         r = -errno;
1014                         goto finish;
1015                 }
1016
1017                 assert(nfds >= 1);
1018
1019                 for (i = 0; i < nfds; i++) {
1020                         if (ev[i].data.fd == STDIN_FILENO) {
1021
1022                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1023                                         stdin_readable = true;
1024
1025                         } else if (ev[i].data.fd == STDOUT_FILENO) {
1026
1027                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1028                                         stdout_writable = true;
1029
1030                         } else if (ev[i].data.fd == master) {
1031
1032                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1033                                         master_readable = true;
1034
1035                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1036                                         master_writable = true;
1037
1038                         } else if (ev[i].data.fd == signal_fd) {
1039                                 struct signalfd_siginfo sfsi;
1040                                 ssize_t n;
1041
1042                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
1043                                 if (n != sizeof(sfsi)) {
1044
1045                                         if (n >= 0) {
1046                                                 log_error("Failed to read from signalfd: invalid block size");
1047                                                 r = -EIO;
1048                                                 goto finish;
1049                                         }
1050
1051                                         if (errno != EINTR && errno != EAGAIN) {
1052                                                 log_error("Failed to read from signalfd: %m");
1053                                                 r = -errno;
1054                                                 goto finish;
1055                                         }
1056                                 } else {
1057
1058                                         if (sfsi.ssi_signo == SIGWINCH) {
1059                                                 struct winsize ws;
1060
1061                                                 /* The window size changed, let's forward that. */
1062                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1063                                                         ioctl(master, TIOCSWINSZ, &ws);
1064                                         } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1065
1066                                                 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1067
1068                                                 /* This only works for systemd... */
1069                                                 tried_orderly_shutdown = true;
1070                                                 kill(pid, SIGRTMIN+3);
1071
1072                                         } else {
1073                                                 r = 0;
1074                                                 goto finish;
1075                                         }
1076                                 }
1077                         }
1078                 }
1079
1080                 while ((stdin_readable && in_buffer_full <= 0) ||
1081                        (master_writable && in_buffer_full > 0) ||
1082                        (master_readable && out_buffer_full <= 0) ||
1083                        (stdout_writable && out_buffer_full > 0)) {
1084
1085                         if (stdin_readable && in_buffer_full < LINE_MAX) {
1086
1087                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1088                                 if (k < 0) {
1089
1090                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1091                                                 stdin_readable = false;
1092                                         else {
1093                                                 log_error("read(): %m");
1094                                                 r = -errno;
1095                                                 goto finish;
1096                                         }
1097                                 } else
1098                                         in_buffer_full += (size_t) k;
1099                         }
1100
1101                         if (master_writable && in_buffer_full > 0) {
1102
1103                                 k = write(master, in_buffer, in_buffer_full);
1104                                 if (k < 0) {
1105
1106                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1107                                                 master_writable = false;
1108                                         else {
1109                                                 log_error("write(): %m");
1110                                                 r = -errno;
1111                                                 goto finish;
1112                                         }
1113
1114                                 } else {
1115                                         assert(in_buffer_full >= (size_t) k);
1116                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1117                                         in_buffer_full -= k;
1118                                 }
1119                         }
1120
1121                         if (master_readable && out_buffer_full < LINE_MAX) {
1122
1123                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1124                                 if (k < 0) {
1125
1126                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1127                                                 master_readable = false;
1128                                         else {
1129                                                 log_error("read(): %m");
1130                                                 r = -errno;
1131                                                 goto finish;
1132                                         }
1133                                 }  else
1134                                         out_buffer_full += (size_t) k;
1135                         }
1136
1137                         if (stdout_writable && out_buffer_full > 0) {
1138
1139                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1140                                 if (k < 0) {
1141
1142                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1143                                                 stdout_writable = false;
1144                                         else {
1145                                                 log_error("write(): %m");
1146                                                 r = -errno;
1147                                                 goto finish;
1148                                         }
1149
1150                                 } else {
1151                                         assert(out_buffer_full >= (size_t) k);
1152                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1153                                         out_buffer_full -= k;
1154                                 }
1155                         }
1156                 }
1157         }
1158
1159 finish:
1160         if (ep >= 0)
1161                 close_nointr_nofail(ep);
1162
1163         if (signal_fd >= 0)
1164                 close_nointr_nofail(signal_fd);
1165
1166         return r;
1167 }
1168
1169 static int register_machine(void) {
1170         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1171         _cleanup_bus_unref_ sd_bus *bus = NULL;
1172         int r;
1173
1174         r = sd_bus_open_system(&bus);
1175         if (r < 0) {
1176                 log_error("Failed to open system bus: %s", strerror(-r));
1177                 return r;
1178         }
1179
1180         r = sd_bus_call_method(
1181                         bus,
1182                         "org.freedesktop.machine1",
1183                         "/org/freedesktop/machine1",
1184                         "org.freedesktop.machine1.Manager",
1185                         "CreateMachine",
1186                         &error,
1187                         NULL,
1188                         "sayssusa(sv)",
1189                         arg_machine,
1190                         SD_BUS_APPEND_ID128(arg_uuid),
1191                         "nspawn",
1192                         "container",
1193                         (uint32_t) 0,
1194                         strempty(arg_directory),
1195                         1, "Slice", "s", strempty(arg_slice));
1196         if (r < 0) {
1197                 log_error("Failed to register machine: %s", error.message ? error.message : strerror(-r));
1198                 return r;
1199         }
1200
1201         return 0;
1202 }
1203
/* Probes for kernel audit support: returns true if a NETLINK_AUDIT socket
 * can be created, false otherwise. The probe socket is closed again right
 * away. */
static bool audit_enabled(void) {
        int probe;

        probe = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
        if (probe < 0)
                return false;

        close_nointr_nofail(probe);
        return true;
}
1214
1215 int main(int argc, char *argv[]) {
1216         pid_t pid = 0;
1217         int r = EXIT_FAILURE, k;
1218         _cleanup_close_ int master = -1;
1219         int n_fd_passed;
1220         const char *console = NULL;
1221         struct termios saved_attr, raw_attr;
1222         sigset_t mask;
1223         bool saved_attr_valid = false;
1224         struct winsize ws;
1225         int kmsg_socket_pair[2] = { -1, -1 };
1226         _cleanup_fdset_free_ FDSet *fds = NULL;
1227
1228         log_parse_environment();
1229         log_open();
1230
1231         k = parse_argv(argc, argv);
1232         if (k < 0)
1233                 goto finish;
1234         else if (k == 0) {
1235                 r = EXIT_SUCCESS;
1236                 goto finish;
1237         }
1238
1239         if (arg_directory) {
1240                 char *p;
1241
1242                 p = path_make_absolute_cwd(arg_directory);
1243                 free(arg_directory);
1244                 arg_directory = p;
1245         } else
1246                 arg_directory = get_current_dir_name();
1247
1248         if (!arg_directory) {
1249                 log_error("Failed to determine path, please use -D.");
1250                 goto finish;
1251         }
1252
1253         path_kill_slashes(arg_directory);
1254
1255         if (!arg_machine) {
1256                 arg_machine = strdup(path_get_file_name(arg_directory));
1257                 if (!arg_machine) {
1258                         log_oom();
1259                         goto finish;
1260                 }
1261
1262                 hostname_cleanup(arg_machine, false);
1263                 if (isempty(arg_machine)) {
1264                         log_error("Failed to determine machine name automatically, please use -M.");
1265                         goto finish;
1266                 }
1267         }
1268
1269         if (geteuid() != 0) {
1270                 log_error("Need to be root.");
1271                 goto finish;
1272         }
1273
1274         if (sd_booted() <= 0) {
1275                 log_error("Not running on a systemd system.");
1276                 goto finish;
1277         }
1278
1279         if (arg_boot && audit_enabled()) {
1280                 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1281                             "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1282                             "line before using systemd-nspawn. Sleeping for 5s...\n");
1283                 sleep(5);
1284         }
1285
1286         if (path_equal(arg_directory, "/")) {
1287                 log_error("Spawning container on root directory not supported.");
1288                 goto finish;
1289         }
1290
1291         if (path_is_os_tree(arg_directory) <= 0) {
1292                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1293                 goto finish;
1294         }
1295
1296         log_close();
1297         n_fd_passed = sd_listen_fds(false);
1298         if (n_fd_passed > 0) {
1299                 k = fdset_new_listen_fds(&fds, false);
1300                 if (k < 0) {
1301                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1302                         goto finish;
1303                 }
1304         }
1305         fdset_close_others(fds);
1306         log_open();
1307
1308         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1309         if (master < 0) {
1310                 log_error("Failed to acquire pseudo tty: %m");
1311                 goto finish;
1312         }
1313
1314         console = ptsname(master);
1315         if (!console) {
1316                 log_error("Failed to determine tty name: %m");
1317                 goto finish;
1318         }
1319
1320         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1321
1322         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1323                 ioctl(master, TIOCSWINSZ, &ws);
1324
1325         if (unlockpt(master) < 0) {
1326                 log_error("Failed to unlock tty: %m");
1327                 goto finish;
1328         }
1329
1330         if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1331                 saved_attr_valid = true;
1332
1333                 raw_attr = saved_attr;
1334                 cfmakeraw(&raw_attr);
1335                 raw_attr.c_lflag &= ~ECHO;
1336         }
1337
1338         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1339                 log_error("Failed to create kmsg socket pair.");
1340                 goto finish;
1341         }
1342
1343         sd_notify(0, "READY=1");
1344
1345         assert_se(sigemptyset(&mask) == 0);
1346         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1347         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1348
1349         for (;;) {
1350                 siginfo_t status;
1351                 int pipefd[2], pipefd2[2];
1352
1353                 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1354                         log_error("pipe2(): %m");
1355                         goto finish;
1356                 }
1357
1358                 if (pipe2(pipefd2, O_NONBLOCK|O_CLOEXEC) < 0) {
1359                         log_error("pipe2(): %m");
1360                         close_pipe(pipefd);
1361                         goto finish;
1362                 }
1363
1364                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1365                 if (pid < 0) {
1366                         if (errno == EINVAL)
1367                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1368                         else
1369                                 log_error("clone() failed: %m");
1370
1371                         goto finish;
1372                 }
1373
1374                 if (pid == 0) {
1375                         /* child */
1376                         const char *home = NULL;
1377                         uid_t uid = (uid_t) -1;
1378                         gid_t gid = (gid_t) -1;
1379                         unsigned n_env = 2;
1380                         const char *envp[] = {
1381                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1382                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1383                                 NULL, /* TERM */
1384                                 NULL, /* HOME */
1385                                 NULL, /* USER */
1386                                 NULL, /* LOGNAME */
1387                                 NULL, /* container_uuid */
1388                                 NULL, /* LISTEN_FDS */
1389                                 NULL, /* LISTEN_PID */
1390                                 NULL
1391                         };
1392
1393                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1394                         if (envp[n_env])
1395                                 n_env ++;
1396
1397                         /* Wait for the parent process to log our PID */
1398                         close_nointr_nofail(pipefd[1]);
1399                         fd_wait_for_event(pipefd[0], POLLHUP, -1);
1400                         close_nointr_nofail(pipefd[0]);
1401
1402                         close_nointr_nofail(master);
1403                         master = -1;
1404
1405                         if (saved_attr_valid) {
1406                                 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1407                                         log_error("Failed to set terminal attributes: %m");
1408                                         goto child_fail;
1409                                 }
1410                         }
1411
1412                         close_nointr(STDIN_FILENO);
1413                         close_nointr(STDOUT_FILENO);
1414                         close_nointr(STDERR_FILENO);
1415
1416                         close_nointr_nofail(kmsg_socket_pair[0]);
1417                         kmsg_socket_pair[0] = -1;
1418
1419                         reset_all_signal_handlers();
1420
1421                         assert_se(sigemptyset(&mask) == 0);
1422                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1423
1424                         k = open_terminal(console, O_RDWR);
1425                         if (k != STDIN_FILENO) {
1426                                 if (k >= 0) {
1427                                         close_nointr_nofail(k);
1428                                         k = -EINVAL;
1429                                 }
1430
1431                                 log_error("Failed to open console: %s", strerror(-k));
1432                                 goto child_fail;
1433                         }
1434
1435                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1436                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1437                                 log_error("Failed to duplicate console: %m");
1438                                 goto child_fail;
1439                         }
1440
1441                         if (setsid() < 0) {
1442                                 log_error("setsid() failed: %m");
1443                                 goto child_fail;
1444                         }
1445
1446                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1447                                 log_error("PR_SET_PDEATHSIG failed: %m");
1448                                 goto child_fail;
1449                         }
1450
1451                         close_pipe(pipefd2);
1452
1453                         r = register_machine();
1454                         if (r < 0)
1455                                 goto finish;
1456
1457                         /* Mark everything as slave, so that we still
1458                          * receive mounts from the real root, but don't
1459                          * propagate mounts to the real root. */
1460                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1461                                 log_error("MS_SLAVE|MS_REC failed: %m");
1462                                 goto child_fail;
1463                         }
1464
1465                         /* Turn directory into bind mount */
1466                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1467                                 log_error("Failed to make bind mount.");
1468                                 goto child_fail;
1469                         }
1470
1471                         if (arg_read_only)
1472                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1473                                         log_error("Failed to make read-only.");
1474                                         goto child_fail;
1475                                 }
1476
1477                         if (mount_all(arg_directory) < 0)
1478                                 goto child_fail;
1479
1480                         if (copy_devnodes(arg_directory) < 0)
1481                                 goto child_fail;
1482
1483                         if (setup_ptmx(arg_directory) < 0)
1484                                 goto child_fail;
1485
1486                         dev_setup(arg_directory);
1487
1488                         if (setup_dev_console(arg_directory, console) < 0)
1489                                 goto child_fail;
1490
1491                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1492                                 goto child_fail;
1493
1494                         close_nointr_nofail(kmsg_socket_pair[1]);
1495                         kmsg_socket_pair[1] = -1;
1496
1497                         if (setup_boot_id(arg_directory) < 0)
1498                                 goto child_fail;
1499
1500                         if (setup_timezone(arg_directory) < 0)
1501                                 goto child_fail;
1502
1503                         if (setup_resolv_conf(arg_directory) < 0)
1504                                 goto child_fail;
1505
1506                         if (setup_journal(arg_directory) < 0)
1507                                 goto child_fail;
1508
1509                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1510                                 goto child_fail;
1511
1512                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1513                                 goto child_fail;
1514
1515                         if (chdir(arg_directory) < 0) {
1516                                 log_error("chdir(%s) failed: %m", arg_directory);
1517                                 goto child_fail;
1518                         }
1519
1520                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1521                                 log_error("mount(MS_MOVE) failed: %m");
1522                                 goto child_fail;
1523                         }
1524
1525                         if (chroot(".") < 0) {
1526                                 log_error("chroot() failed: %m");
1527                                 goto child_fail;
1528                         }
1529
1530                         if (chdir("/") < 0) {
1531                                 log_error("chdir() failed: %m");
1532                                 goto child_fail;
1533                         }
1534
1535                         umask(0022);
1536
1537                         loopback_setup();
1538
1539                         if (drop_capabilities() < 0) {
1540                                 log_error("drop_capabilities() failed: %m");
1541                                 goto child_fail;
1542                         }
1543
1544                         if (arg_user) {
1545
1546                                 /* Note that this resolves user names
1547                                  * inside the container, and hence
1548                                  * accesses the NSS modules from the
1549                                  * container and not the host. This is
1550                                  * a bit weird... */
1551
1552                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1553                                         log_error("get_user_creds() failed: %m");
1554                                         goto child_fail;
1555                                 }
1556
1557                                 if (mkdir_parents_label(home, 0775) < 0) {
1558                                         log_error("mkdir_parents_label() failed: %m");
1559                                         goto child_fail;
1560                                 }
1561
1562                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1563                                         log_error("mkdir_safe_label() failed: %m");
1564                                         goto child_fail;
1565                                 }
1566
1567                                 if (initgroups((const char*)arg_user, gid) < 0) {
1568                                         log_error("initgroups() failed: %m");
1569                                         goto child_fail;
1570                                 }
1571
1572                                 if (setresgid(gid, gid, gid) < 0) {
1573                                         log_error("setregid() failed: %m");
1574                                         goto child_fail;
1575                                 }
1576
1577                                 if (setresuid(uid, uid, uid) < 0) {
1578                                         log_error("setreuid() failed: %m");
1579                                         goto child_fail;
1580                                 }
1581                         } else {
1582                                 /* Reset everything fully to 0, just in case */
1583
1584                                 if (setgroups(0, NULL) < 0) {
1585                                         log_error("setgroups() failed: %m");
1586                                         goto child_fail;
1587                                 }
1588
1589                                 if (setresgid(0, 0, 0) < 0) {
1590                                         log_error("setregid() failed: %m");
1591                                         goto child_fail;
1592                                 }
1593
1594                                 if (setresuid(0, 0, 0) < 0) {
1595                                         log_error("setreuid() failed: %m");
1596                                         goto child_fail;
1597                                 }
1598                         }
1599
1600                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1601                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1602                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1603                                 log_oom();
1604                                 goto child_fail;
1605                         }
1606
1607                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1608                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1609                                         log_oom();
1610                                         goto child_fail;
1611                                 }
1612                         }
1613
1614                         if (fdset_size(fds) > 0) {
1615                                 k = fdset_cloexec(fds, false);
1616                                 if (k < 0) {
1617                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1618                                         goto child_fail;
1619                                 }
1620
1621                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1622                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1623                                         log_oom();
1624                                         goto child_fail;
1625                                 }
1626                         }
1627
1628                         setup_hostname();
1629
1630                         if (arg_boot) {
1631                                 char **a;
1632                                 size_t l;
1633
1634                                 /* Automatically search for the init system */
1635
1636                                 l = 1 + argc - optind;
1637                                 a = newa(char*, l + 1);
1638                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1639
1640                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1641                                 execve(a[0], a, (char**) envp);
1642
1643                                 a[0] = (char*) "/lib/systemd/systemd";
1644                                 execve(a[0], a, (char**) envp);
1645
1646                                 a[0] = (char*) "/sbin/init";
1647                                 execve(a[0], a, (char**) envp);
1648                         } else if (argc > optind)
1649                                 execvpe(argv[optind], argv + optind, (char**) envp);
1650                         else {
1651                                 chdir(home ? home : "/root");
1652                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1653                         }
1654
1655                         log_error("execv() failed: %m");
1656
1657                 child_fail:
1658                         _exit(EXIT_FAILURE);
1659                 }
1660
1661                 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1662                 close_nointr_nofail(pipefd[0]);
1663                 close_nointr_nofail(pipefd[1]);
1664
1665                 /* Wait for the child process to establish cgroup hierarchy */
1666                 close_nointr_nofail(pipefd2[1]);
1667                 fd_wait_for_event(pipefd2[0], POLLHUP, -1);
1668                 close_nointr_nofail(pipefd2[0]);
1669
1670                 fdset_free(fds);
1671                 fds = NULL;
1672
1673                 if (process_pty(master, pid, &mask) < 0)
1674                         goto finish;
1675
1676                 if (saved_attr_valid)
1677                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1678
1679                 k = wait_for_terminate(pid, &status);
1680                 if (k < 0) {
1681                         r = EXIT_FAILURE;
1682                         break;
1683                 }
1684
1685                 if (status.si_code == CLD_EXITED) {
1686                         r = status.si_status;
1687                         if (status.si_status != 0) {
1688                                 log_error("Container failed with error code %i.", status.si_status);
1689                                 break;
1690                         }
1691
1692                         log_debug("Container exited successfully.");
1693                         break;
1694                 } else if (status.si_code == CLD_KILLED &&
1695                            status.si_status == SIGINT) {
1696                         log_info("Container has been shut down.");
1697                         r = 0;
1698                         break;
1699                 } else if (status.si_code == CLD_KILLED &&
1700                            status.si_status == SIGHUP) {
1701                         log_info("Container is being rebooted.");
1702                         continue;
1703                 } else if (status.si_code == CLD_KILLED ||
1704                            status.si_code == CLD_DUMPED) {
1705
1706                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1707                         r = EXIT_FAILURE;
1708                         break;
1709                 } else {
1710                         log_error("Container failed due to unknown reason.");
1711                         r = EXIT_FAILURE;
1712                         break;
1713                 }
1714         }
1715
1716 finish:
1717         if (saved_attr_valid)
1718                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1719
1720         close_pipe(kmsg_socket_pair);
1721
1722         if (pid > 0)
1723                 kill(pid, SIGKILL);
1724
1725         free(arg_directory);
1726         free(arg_machine);
1727
1728         return r;
1729 }