chiark / gitweb /
timedated: use libsystemd-bus instead of libdbus for bus communication
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/poll.h>
37 #include <sys/epoll.h>
38 #include <termios.h>
39 #include <sys/signalfd.h>
40 #include <grp.h>
41 #include <linux/fs.h>
42 #include <sys/un.h>
43 #include <sys/socket.h>
44 #include <linux/netlink.h>
45
46 #include <systemd/sd-daemon.h>
47 #include <systemd/sd-bus.h>
48
49 #include "log.h"
50 #include "util.h"
51 #include "mkdir.h"
52 #include "macro.h"
53 #include "audit.h"
54 #include "missing.h"
55 #include "cgroup-util.h"
56 #include "strv.h"
57 #include "path-util.h"
58 #include "loopback-setup.h"
59 #include "sd-id128.h"
60 #include "dev-setup.h"
61 #include "fdset.h"
62 #include "build.h"
63 #include "fileio.h"
64 #include "bus-util.h"
65
/* Numeric group id passed as "gid=" to the container's devpts mount
 * (see mount_all()). Can be overridden at build time. */
#if !defined(TTY_GID)
#define TTY_GID 5
#endif
69
/* Journal linking mode, selected with --link-journal= (or -j). */
typedef enum LinkJournal {
        LINK_NO    = 0, /* --link-journal=no */
        LINK_AUTO  = 1, /* --link-journal=auto; the default */
        LINK_HOST  = 2, /* --link-journal=host */
        LINK_GUEST = 3  /* --link-journal=guest; also selected by -j */
} LinkJournal;
76
77 static char *arg_directory = NULL;
78 static char *arg_user = NULL;
79 static sd_id128_t arg_uuid = {};
80 static char *arg_machine = NULL;
81 static const char *arg_slice = NULL;
82 static bool arg_private_network = false;
83 static bool arg_read_only = false;
84 static bool arg_boot = false;
85 static LinkJournal arg_link_journal = LINK_AUTO;
86 static uint64_t arg_retain =
87         (1ULL << CAP_CHOWN) |
88         (1ULL << CAP_DAC_OVERRIDE) |
89         (1ULL << CAP_DAC_READ_SEARCH) |
90         (1ULL << CAP_FOWNER) |
91         (1ULL << CAP_FSETID) |
92         (1ULL << CAP_IPC_OWNER) |
93         (1ULL << CAP_KILL) |
94         (1ULL << CAP_LEASE) |
95         (1ULL << CAP_LINUX_IMMUTABLE) |
96         (1ULL << CAP_NET_BIND_SERVICE) |
97         (1ULL << CAP_NET_BROADCAST) |
98         (1ULL << CAP_NET_RAW) |
99         (1ULL << CAP_SETGID) |
100         (1ULL << CAP_SETFCAP) |
101         (1ULL << CAP_SETPCAP) |
102         (1ULL << CAP_SETUID) |
103         (1ULL << CAP_SYS_ADMIN) |
104         (1ULL << CAP_SYS_CHROOT) |
105         (1ULL << CAP_SYS_NICE) |
106         (1ULL << CAP_SYS_PTRACE) |
107         (1ULL << CAP_SYS_TTY_CONFIG) |
108         (1ULL << CAP_SYS_RESOURCE) |
109         (1ULL << CAP_SYS_BOOT) |
110         (1ULL << CAP_AUDIT_WRITE) |
111         (1ULL << CAP_AUDIT_CONTROL);
112 static char **arg_bind = NULL;
113 static char **arg_bind_ro = NULL;
114
115 static int help(void) {
116
117         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
118                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
119                "  -h --help                Show this help\n"
120                "     --version             Print version string\n"
121                "  -D --directory=NAME      Root directory for the container\n"
122                "  -b --boot                Boot up full system (i.e. invoke init)\n"
123                "  -u --user=USER           Run the command under specified user or uid\n"
124                "     --uuid=UUID           Set a specific machine UUID for the container\n"
125                "  -M --machine=NAME        Set the machine name for the container\n"
126                "  -S --slice=SLICE         Place the container in the specified slice\n"
127                "     --private-network     Disable network in container\n"
128                "     --read-only           Mount the root directory read-only\n"
129                "     --capability=CAP      In addition to the default, retain specified\n"
130                "                           capability\n"
131                "     --link-journal=MODE   Link up guest journal, one of no, auto, guest, host\n"
132                "  -j                       Equivalent to --link-journal=host\n"
133                "     --bind=PATH[:PATH]    Bind mount a file or directory from the host into\n"
134                "                           the container\n"
135                "     --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
136                program_invocation_short_name);
137
138         return 0;
139 }
140
141 static int parse_argv(int argc, char *argv[]) {
142
143         enum {
144                 ARG_VERSION = 0x100,
145                 ARG_PRIVATE_NETWORK,
146                 ARG_UUID,
147                 ARG_READ_ONLY,
148                 ARG_CAPABILITY,
149                 ARG_LINK_JOURNAL,
150                 ARG_BIND,
151                 ARG_BIND_RO
152         };
153
154         static const struct option options[] = {
155                 { "help",            no_argument,       NULL, 'h'                 },
156                 { "version",         no_argument,       NULL, ARG_VERSION         },
157                 { "directory",       required_argument, NULL, 'D'                 },
158                 { "user",            required_argument, NULL, 'u'                 },
159                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
160                 { "boot",            no_argument,       NULL, 'b'                 },
161                 { "uuid",            required_argument, NULL, ARG_UUID            },
162                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
163                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
164                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
165                 { "bind",            required_argument, NULL, ARG_BIND            },
166                 { "bind-ro",         required_argument, NULL, ARG_BIND_RO         },
167                 { "machine",         required_argument, NULL, 'M'                 },
168                 { "slice",           required_argument, NULL, 'S'                 },
169                 { NULL,              0,                 NULL, 0                   }
170         };
171
172         int c, r;
173
174         assert(argc >= 0);
175         assert(argv);
176
177         while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
178
179                 switch (c) {
180
181                 case 'h':
182                         help();
183                         return 0;
184
185                 case ARG_VERSION:
186                         puts(PACKAGE_STRING);
187                         puts(SYSTEMD_FEATURES);
188                         return 0;
189
190                 case 'D':
191                         free(arg_directory);
192                         arg_directory = canonicalize_file_name(optarg);
193                         if (!arg_directory) {
194                                 log_error("Failed to canonicalize root directory.");
195                                 return -ENOMEM;
196                         }
197
198                         break;
199
200                 case 'u':
201                         free(arg_user);
202                         arg_user = strdup(optarg);
203                         if (!arg_user)
204                                 return log_oom();
205
206                         break;
207
208                 case ARG_PRIVATE_NETWORK:
209                         arg_private_network = true;
210                         break;
211
212                 case 'b':
213                         arg_boot = true;
214                         break;
215
216                 case ARG_UUID:
217                         r = sd_id128_from_string(optarg, &arg_uuid);
218                         if (r < 0) {
219                                 log_error("Invalid UUID: %s", optarg);
220                                 return r;
221                         }
222                         break;
223
224                 case 'S':
225                         arg_slice = strdup(optarg);
226                         break;
227
228                 case 'M':
229                         if (!hostname_is_valid(optarg)) {
230                                 log_error("Invalid machine name: %s", optarg);
231                                 return -EINVAL;
232                         }
233
234                         free(arg_machine);
235                         arg_machine = strdup(optarg);
236                         if (!arg_machine)
237                                 return log_oom();
238
239                         break;
240
241                 case ARG_READ_ONLY:
242                         arg_read_only = true;
243                         break;
244
245                 case ARG_CAPABILITY: {
246                         char *state, *word;
247                         size_t length;
248
249                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
250                                 cap_value_t cap;
251                                 char *t;
252
253                                 t = strndup(word, length);
254                                 if (!t)
255                                         return log_oom();
256
257                                 if (cap_from_name(t, &cap) < 0) {
258                                         log_error("Failed to parse capability %s.", t);
259                                         free(t);
260                                         return -EINVAL;
261                                 }
262
263                                 free(t);
264                                 arg_retain |= 1ULL << (uint64_t) cap;
265                         }
266
267                         break;
268                 }
269
270                 case 'j':
271                         arg_link_journal = LINK_GUEST;
272                         break;
273
274                 case ARG_LINK_JOURNAL:
275                         if (streq(optarg, "auto"))
276                                 arg_link_journal = LINK_AUTO;
277                         else if (streq(optarg, "no"))
278                                 arg_link_journal = LINK_NO;
279                         else if (streq(optarg, "guest"))
280                                 arg_link_journal = LINK_GUEST;
281                         else if (streq(optarg, "host"))
282                                 arg_link_journal = LINK_HOST;
283                         else {
284                                 log_error("Failed to parse link journal mode %s", optarg);
285                                 return -EINVAL;
286                         }
287
288                         break;
289
290                 case ARG_BIND:
291                 case ARG_BIND_RO: {
292                         _cleanup_free_ char *a = NULL, *b = NULL;
293                         char *e;
294                         char ***x;
295
296                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
297
298                         e = strchr(optarg, ':');
299                         if (e) {
300                                 a = strndup(optarg, e - optarg);
301                                 b = strdup(e + 1);
302                         } else {
303                                 a = strdup(optarg);
304                                 b = strdup(optarg);
305                         }
306
307                         if (!a || !b)
308                                 return log_oom();
309
310                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
311                                 log_error("Invalid bind mount specification: %s", optarg);
312                                 return -EINVAL;
313                         }
314
315                         r = strv_extend(x, a);
316                         if (r < 0)
317                                 return r;
318
319                         r = strv_extend(x, b);
320                         if (r < 0)
321                                 return r;
322
323                         break;
324                 }
325
326                 case '?':
327                         return -EINVAL;
328
329                 default:
330                         log_error("Unknown option code %c", c);
331                         return -EINVAL;
332                 }
333         }
334
335         return 1;
336 }
337
338 static int mount_all(const char *dest) {
339
340         typedef struct MountPoint {
341                 const char *what;
342                 const char *where;
343                 const char *type;
344                 const char *options;
345                 unsigned long flags;
346                 bool fatal;
347         } MountPoint;
348
349         static const MountPoint mount_table[] = {
350                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
351                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
352                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
353                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
354                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
355                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
356                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
357                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
358 #ifdef HAVE_SELINUX
359                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
360                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
361 #endif
362         };
363
364         unsigned k;
365         int r = 0;
366
367         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
368                 _cleanup_free_ char *where = NULL;
369                 int t;
370
371                 where = strjoin(dest, "/", mount_table[k].where, NULL);
372                 if (!where)
373                         return log_oom();
374
375                 t = path_is_mount_point(where, true);
376                 if (t < 0) {
377                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
378
379                         if (r == 0)
380                                 r = t;
381
382                         continue;
383                 }
384
385                 /* Skip this entry if it is not a remount. */
386                 if (mount_table[k].what && t > 0)
387                         continue;
388
389                 mkdir_p(where, 0755);
390
391                 if (mount(mount_table[k].what,
392                           where,
393                           mount_table[k].type,
394                           mount_table[k].flags,
395                           mount_table[k].options) < 0 &&
396                     mount_table[k].fatal) {
397
398                         log_error("mount(%s) failed: %m", where);
399
400                         if (r == 0)
401                                 r = -errno;
402                 }
403         }
404
405         return r;
406 }
407
408 static int mount_binds(const char *dest, char **l, unsigned long flags) {
409         char **x, **y;
410
411         STRV_FOREACH_PAIR(x, y, l) {
412                 _cleanup_free_ char *where = NULL;
413                 struct stat source_st, dest_st;
414
415                 if (stat(*x, &source_st) < 0) {
416                         log_error("failed to stat %s: %m", *x);
417                         return -errno;
418                 }
419
420                 where = strjoin(dest, "/", *y, NULL);
421                 if (!where)
422                         return log_oom();
423
424                 if (stat(where, &dest_st) == 0) {
425                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
426                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
427                                                 *x, where);
428                                 return -EINVAL;
429                         }
430                 } else {
431                         /* Create the mount point, but be conservative -- refuse to create block
432                          * and char devices. */
433                         if (S_ISDIR(source_st.st_mode))
434                                 mkdir_p_label(where, 0755);
435                         else if (S_ISFIFO(source_st.st_mode))
436                                 mkfifo(where, 0644);
437                         else if (S_ISSOCK(source_st.st_mode))
438                                 mknod(where, 0644 | S_IFSOCK, 0);
439                         else if (S_ISREG(source_st.st_mode))
440                                 touch(where);
441                         else {
442                                 log_error("Refusing to create mountpoint for file: %s", *x);
443                                 return -ENOTSUP;
444                         }
445                 }
446
447                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
448                         log_error("mount(%s) failed: %m", where);
449                         return -errno;
450                 }
451
452                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
453                         log_error("mount(%s) failed: %m", where);
454                         return -errno;
455                 }
456         }
457
458         return 0;
459 }
460
461 static int setup_timezone(const char *dest) {
462         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
463         char *z, *y;
464         int r;
465
466         assert(dest);
467
468         /* Fix the timezone, if possible */
469         r = readlink_malloc("/etc/localtime", &p);
470         if (r < 0) {
471                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
472                 return 0;
473         }
474
475         z = path_startswith(p, "../usr/share/zoneinfo/");
476         if (!z)
477                 z = path_startswith(p, "/usr/share/zoneinfo/");
478         if (!z) {
479                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
480                 return 0;
481         }
482
483         where = strappend(dest, "/etc/localtime");
484         if (!where)
485                 return log_oom();
486
487         r = readlink_malloc(where, &q);
488         if (r >= 0) {
489                 y = path_startswith(q, "../usr/share/zoneinfo/");
490                 if (!y)
491                         y = path_startswith(q, "/usr/share/zoneinfo/");
492
493
494                 /* Already pointing to the right place? Then do nothing .. */
495                 if (y && streq(y, z))
496                         return 0;
497         }
498
499         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
500         if (!check)
501                 return log_oom();
502
503         if (access(check, F_OK) < 0) {
504                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
505                 return 0;
506         }
507
508         what = strappend("../usr/share/zoneinfo/", z);
509         if (!what)
510                 return log_oom();
511
512         unlink(where);
513         if (symlink(what, where) < 0) {
514                 log_error("Failed to correct timezone of container: %m");
515                 return 0;
516         }
517
518         return 0;
519 }
520
521 static int setup_resolv_conf(const char *dest) {
522         char _cleanup_free_ *where = NULL;
523
524         assert(dest);
525
526         if (arg_private_network)
527                 return 0;
528
529         /* Fix resolv.conf, if possible */
530         where = strappend(dest, "/etc/resolv.conf");
531         if (!where)
532                 return log_oom();
533
534         /* We don't really care for the results of this really. If it
535          * fails, it fails, but meh... */
536         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
537
538         return 0;
539 }
540
541 static int setup_boot_id(const char *dest) {
542         _cleanup_free_ char *from = NULL, *to = NULL;
543         sd_id128_t rnd;
544         char as_uuid[37];
545         int r;
546
547         assert(dest);
548
549         /* Generate a new randomized boot ID, so that each boot-up of
550          * the container gets a new one */
551
552         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
553         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
554         if (!from || !to)
555                 return log_oom();
556
557         r = sd_id128_randomize(&rnd);
558         if (r < 0) {
559                 log_error("Failed to generate random boot id: %s", strerror(-r));
560                 return r;
561         }
562
563         snprintf(as_uuid, sizeof(as_uuid),
564                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
565                  SD_ID128_FORMAT_VAL(rnd));
566         char_array_0(as_uuid);
567
568         r = write_string_file(from, as_uuid);
569         if (r < 0) {
570                 log_error("Failed to write boot id: %s", strerror(-r));
571                 return r;
572         }
573
574         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
575                 log_error("Failed to bind mount boot id: %m");
576                 r = -errno;
577         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
578                 log_warning("Failed to make boot id read-only: %m");
579
580         unlink(from);
581         return r;
582 }
583
584 static int copy_devnodes(const char *dest) {
585
586         static const char devnodes[] =
587                 "null\0"
588                 "zero\0"
589                 "full\0"
590                 "random\0"
591                 "urandom\0"
592                 "tty\0";
593
594         const char *d;
595         int r = 0;
596         _cleanup_umask_ mode_t u;
597
598         assert(dest);
599
600         u = umask(0000);
601
602         NULSTR_FOREACH(d, devnodes) {
603                 struct stat st;
604                 _cleanup_free_ char *from = NULL, *to = NULL;
605
606                 asprintf(&from, "/dev/%s", d);
607                 asprintf(&to, "%s/dev/%s", dest, d);
608
609                 if (!from || !to) {
610                         log_oom();
611
612                         if (r == 0)
613                                 r = -ENOMEM;
614
615                         break;
616                 }
617
618                 if (stat(from, &st) < 0) {
619
620                         if (errno != ENOENT) {
621                                 log_error("Failed to stat %s: %m", from);
622                                 if (r == 0)
623                                         r = -errno;
624                         }
625
626                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
627
628                         log_error("%s is not a char or block device, cannot copy", from);
629                         if (r == 0)
630                                 r = -EIO;
631
632                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
633
634                         log_error("mknod(%s) failed: %m", dest);
635                         if (r == 0)
636                                 r = -errno;
637                 }
638         }
639
640         return r;
641 }
642
643 static int setup_ptmx(const char *dest) {
644         _cleanup_free_ char *p = NULL;
645
646         p = strappend(dest, "/dev/ptmx");
647         if (!p)
648                 return log_oom();
649
650         if (symlink("pts/ptmx", p) < 0) {
651                 log_error("Failed to create /dev/ptmx symlink: %m");
652                 return -errno;
653         }
654
655         return 0;
656 }
657
658 static int setup_dev_console(const char *dest, const char *console) {
659         struct stat st;
660         _cleanup_free_ char *to = NULL;
661         int r;
662         _cleanup_umask_ mode_t u;
663
664         assert(dest);
665         assert(console);
666
667         u = umask(0000);
668
669         if (stat(console, &st) < 0) {
670                 log_error("Failed to stat %s: %m", console);
671                 return -errno;
672
673         } else if (!S_ISCHR(st.st_mode)) {
674                 log_error("/dev/console is not a char device");
675                 return -EIO;
676         }
677
678         r = chmod_and_chown(console, 0600, 0, 0);
679         if (r < 0) {
680                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
681                 return r;
682         }
683
684         if (asprintf(&to, "%s/dev/console", dest) < 0)
685                 return log_oom();
686
687         /* We need to bind mount the right tty to /dev/console since
688          * ptys can only exist on pts file systems. To have something
689          * to bind mount things on we create a device node first, that
690          * has the right major/minor (note that the major minor
691          * doesn't actually matter here, since we mount it over
692          * anyway). */
693
694         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
695                 log_error("mknod() for /dev/console failed: %m");
696                 return -errno;
697         }
698
699         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
700                 log_error("Bind mount for /dev/console failed: %m");
701                 return -errno;
702         }
703
704         return 0;
705 }
706
707 static int setup_kmsg(const char *dest, int kmsg_socket) {
708         _cleanup_free_ char *from = NULL, *to = NULL;
709         int r, fd, k;
710         _cleanup_umask_ mode_t u;
711         union {
712                 struct cmsghdr cmsghdr;
713                 uint8_t buf[CMSG_SPACE(sizeof(int))];
714         } control = {};
715         struct msghdr mh = {
716                 .msg_control = &control,
717                 .msg_controllen = sizeof(control),
718         };
719         struct cmsghdr *cmsg;
720
721         assert(dest);
722         assert(kmsg_socket >= 0);
723
724         u = umask(0000);
725
726         /* We create the kmsg FIFO as /dev/kmsg, but immediately
727          * delete it after bind mounting it to /proc/kmsg. While FIFOs
728          * on the reading side behave very similar to /proc/kmsg,
729          * their writing side behaves differently from /dev/kmsg in
730          * that writing blocks when nothing is reading. In order to
731          * avoid any problems with containers deadlocking due to this
732          * we simply make /dev/kmsg unavailable to the container. */
733         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
734             asprintf(&to, "%s/proc/kmsg", dest) < 0)
735                 return log_oom();
736
737         if (mkfifo(from, 0600) < 0) {
738                 log_error("mkfifo() for /dev/kmsg failed: %m");
739                 return -errno;
740         }
741
742         r = chmod_and_chown(from, 0600, 0, 0);
743         if (r < 0) {
744                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
745                 return r;
746         }
747
748         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
749                 log_error("Bind mount for /proc/kmsg failed: %m");
750                 return -errno;
751         }
752
753         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
754         if (fd < 0) {
755                 log_error("Failed to open fifo: %m");
756                 return -errno;
757         }
758
759         cmsg = CMSG_FIRSTHDR(&mh);
760         cmsg->cmsg_level = SOL_SOCKET;
761         cmsg->cmsg_type = SCM_RIGHTS;
762         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
763         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
764
765         mh.msg_controllen = cmsg->cmsg_len;
766
767         /* Store away the fd in the socket, so that it stays open as
768          * long as we run the child */
769         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
770         close_nointr_nofail(fd);
771
772         if (k < 0) {
773                 log_error("Failed to send FIFO fd: %m");
774                 return -errno;
775         }
776
777         /* And now make the FIFO unavailable as /dev/kmsg... */
778         unlink(from);
779         return 0;
780 }
781
782 static int setup_hostname(void) {
783
784         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
785                 return -errno;
786
787         return 0;
788 }
789
790 static int setup_journal(const char *directory) {
791         sd_id128_t machine_id;
792         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
793         char *id;
794         int r;
795
796         if (arg_link_journal == LINK_NO)
797                 return 0;
798
799         p = strappend(directory, "/etc/machine-id");
800         if (!p)
801                 return log_oom();
802
803         r = read_one_line_file(p, &b);
804         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
805                 return 0;
806         else if (r < 0) {
807                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
808                 return r;
809         }
810
811         id = strstrip(b);
812         if (isempty(id) && arg_link_journal == LINK_AUTO)
813                 return 0;
814
815         /* Verify validity */
816         r = sd_id128_from_string(id, &machine_id);
817         if (r < 0) {
818                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
819                 return r;
820         }
821
822         free(p);
823         p = strappend("/var/log/journal/", id);
824         q = strjoin(directory, "/var/log/journal/", id, NULL);
825         if (!p || !q)
826                 return log_oom();
827
828         if (path_is_mount_point(p, false) > 0) {
829                 if (arg_link_journal != LINK_AUTO) {
830                         log_error("%s: already a mount point, refusing to use for journal", p);
831                         return -EEXIST;
832                 }
833
834                 return 0;
835         }
836
837         if (path_is_mount_point(q, false) > 0) {
838                 if (arg_link_journal != LINK_AUTO) {
839                         log_error("%s: already a mount point, refusing to use for journal", q);
840                         return -EEXIST;
841                 }
842
843                 return 0;
844         }
845
846         r = readlink_and_make_absolute(p, &d);
847         if (r >= 0) {
848                 if ((arg_link_journal == LINK_GUEST ||
849                      arg_link_journal == LINK_AUTO) &&
850                     path_equal(d, q)) {
851
852                         r = mkdir_p(q, 0755);
853                         if (r < 0)
854                                 log_warning("failed to create directory %s: %m", q);
855                         return 0;
856                 }
857
858                 if (unlink(p) < 0) {
859                         log_error("Failed to remove symlink %s: %m", p);
860                         return -errno;
861                 }
862         } else if (r == -EINVAL) {
863
864                 if (arg_link_journal == LINK_GUEST &&
865                     rmdir(p) < 0) {
866
867                         if (errno == ENOTDIR) {
868                                 log_error("%s already exists and is neither a symlink nor a directory", p);
869                                 return r;
870                         } else {
871                                 log_error("Failed to remove %s: %m", p);
872                                 return -errno;
873                         }
874                 }
875         } else if (r != -ENOENT) {
876                 log_error("readlink(%s) failed: %m", p);
877                 return r;
878         }
879
880         if (arg_link_journal == LINK_GUEST) {
881
882                 if (symlink(q, p) < 0) {
883                         log_error("Failed to symlink %s to %s: %m", q, p);
884                         return -errno;
885                 }
886
887                 r = mkdir_p(q, 0755);
888                 if (r < 0)
889                         log_warning("failed to create directory %s: %m", q);
890                 return 0;
891         }
892
893         if (arg_link_journal == LINK_HOST) {
894                 r = mkdir_p(p, 0755);
895                 if (r < 0) {
896                         log_error("Failed to create %s: %m", p);
897                         return r;
898                 }
899
900         } else if (access(p, F_OK) < 0)
901                 return 0;
902
903         if (dir_is_empty(q) == 0) {
904                 log_error("%s not empty.", q);
905                 return -ENOTEMPTY;
906         }
907
908         r = mkdir_p(q, 0755);
909         if (r < 0) {
910                 log_error("Failed to create %s: %m", q);
911                 return r;
912         }
913
914         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
915                 log_error("Failed to bind mount journal from host into guest: %m");
916                 return -errno;
917         }
918
919         return 0;
920 }
921
922 static int drop_capabilities(void) {
923         return capability_bounding_set_drop(~arg_retain, false);
924 }
925
926 static int process_pty(int master, pid_t pid, sigset_t *mask) {
927
928         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
929         size_t in_buffer_full = 0, out_buffer_full = 0;
930         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
931         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
932         int ep = -1, signal_fd = -1, r;
933         bool tried_orderly_shutdown = false;
934
935         assert(master >= 0);
936         assert(pid > 0);
937         assert(mask);
938
939         fd_nonblock(STDIN_FILENO, 1);
940         fd_nonblock(STDOUT_FILENO, 1);
941         fd_nonblock(master, 1);
942
943         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
944         if (signal_fd < 0) {
945                 log_error("signalfd(): %m");
946                 r = -errno;
947                 goto finish;
948         }
949
950         ep = epoll_create1(EPOLL_CLOEXEC);
951         if (ep < 0) {
952                 log_error("Failed to create epoll: %m");
953                 r = -errno;
954                 goto finish;
955         }
956
957         /* We read from STDIN only if this is actually a TTY,
958          * otherwise we assume non-interactivity. */
959         if (isatty(STDIN_FILENO)) {
960                 zero(stdin_ev);
961                 stdin_ev.events = EPOLLIN|EPOLLET;
962                 stdin_ev.data.fd = STDIN_FILENO;
963
964                 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
965                         log_error("Failed to register STDIN in epoll: %m");
966                         r = -errno;
967                         goto finish;
968                 }
969         }
970
971         zero(stdout_ev);
972         stdout_ev.events = EPOLLOUT|EPOLLET;
973         stdout_ev.data.fd = STDOUT_FILENO;
974
975         zero(master_ev);
976         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
977         master_ev.data.fd = master;
978
979         zero(signal_ev);
980         signal_ev.events = EPOLLIN;
981         signal_ev.data.fd = signal_fd;
982
983         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
984                 if (errno != EPERM) {
985                         log_error("Failed to register stdout in epoll: %m");
986                         r = -errno;
987                         goto finish;
988                 }
989                 /* stdout without epoll support. Likely redirected to regular file. */
990                 stdout_writable = true;
991         }
992
993         if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
994             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
995                 log_error("Failed to register fds in epoll: %m");
996                 r = -errno;
997                 goto finish;
998         }
999
1000         for (;;) {
1001                 struct epoll_event ev[16];
1002                 ssize_t k;
1003                 int i, nfds;
1004
1005                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1006                 if (nfds < 0) {
1007
1008                         if (errno == EINTR || errno == EAGAIN)
1009                                 continue;
1010
1011                         log_error("epoll_wait(): %m");
1012                         r = -errno;
1013                         goto finish;
1014                 }
1015
1016                 assert(nfds >= 1);
1017
1018                 for (i = 0; i < nfds; i++) {
1019                         if (ev[i].data.fd == STDIN_FILENO) {
1020
1021                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1022                                         stdin_readable = true;
1023
1024                         } else if (ev[i].data.fd == STDOUT_FILENO) {
1025
1026                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1027                                         stdout_writable = true;
1028
1029                         } else if (ev[i].data.fd == master) {
1030
1031                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1032                                         master_readable = true;
1033
1034                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1035                                         master_writable = true;
1036
1037                         } else if (ev[i].data.fd == signal_fd) {
1038                                 struct signalfd_siginfo sfsi;
1039                                 ssize_t n;
1040
1041                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
1042                                 if (n != sizeof(sfsi)) {
1043
1044                                         if (n >= 0) {
1045                                                 log_error("Failed to read from signalfd: invalid block size");
1046                                                 r = -EIO;
1047                                                 goto finish;
1048                                         }
1049
1050                                         if (errno != EINTR && errno != EAGAIN) {
1051                                                 log_error("Failed to read from signalfd: %m");
1052                                                 r = -errno;
1053                                                 goto finish;
1054                                         }
1055                                 } else {
1056
1057                                         if (sfsi.ssi_signo == SIGWINCH) {
1058                                                 struct winsize ws;
1059
1060                                                 /* The window size changed, let's forward that. */
1061                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1062                                                         ioctl(master, TIOCSWINSZ, &ws);
1063                                         } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1064
1065                                                 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1066
1067                                                 /* This only works for systemd... */
1068                                                 tried_orderly_shutdown = true;
1069                                                 kill(pid, SIGRTMIN+3);
1070
1071                                         } else {
1072                                                 r = 0;
1073                                                 goto finish;
1074                                         }
1075                                 }
1076                         }
1077                 }
1078
1079                 while ((stdin_readable && in_buffer_full <= 0) ||
1080                        (master_writable && in_buffer_full > 0) ||
1081                        (master_readable && out_buffer_full <= 0) ||
1082                        (stdout_writable && out_buffer_full > 0)) {
1083
1084                         if (stdin_readable && in_buffer_full < LINE_MAX) {
1085
1086                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1087                                 if (k < 0) {
1088
1089                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1090                                                 stdin_readable = false;
1091                                         else {
1092                                                 log_error("read(): %m");
1093                                                 r = -errno;
1094                                                 goto finish;
1095                                         }
1096                                 } else
1097                                         in_buffer_full += (size_t) k;
1098                         }
1099
1100                         if (master_writable && in_buffer_full > 0) {
1101
1102                                 k = write(master, in_buffer, in_buffer_full);
1103                                 if (k < 0) {
1104
1105                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1106                                                 master_writable = false;
1107                                         else {
1108                                                 log_error("write(): %m");
1109                                                 r = -errno;
1110                                                 goto finish;
1111                                         }
1112
1113                                 } else {
1114                                         assert(in_buffer_full >= (size_t) k);
1115                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1116                                         in_buffer_full -= k;
1117                                 }
1118                         }
1119
1120                         if (master_readable && out_buffer_full < LINE_MAX) {
1121
1122                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1123                                 if (k < 0) {
1124
1125                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1126                                                 master_readable = false;
1127                                         else {
1128                                                 log_error("read(): %m");
1129                                                 r = -errno;
1130                                                 goto finish;
1131                                         }
1132                                 }  else
1133                                         out_buffer_full += (size_t) k;
1134                         }
1135
1136                         if (stdout_writable && out_buffer_full > 0) {
1137
1138                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1139                                 if (k < 0) {
1140
1141                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1142                                                 stdout_writable = false;
1143                                         else {
1144                                                 log_error("write(): %m");
1145                                                 r = -errno;
1146                                                 goto finish;
1147                                         }
1148
1149                                 } else {
1150                                         assert(out_buffer_full >= (size_t) k);
1151                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1152                                         out_buffer_full -= k;
1153                                 }
1154                         }
1155                 }
1156         }
1157
1158 finish:
1159         if (ep >= 0)
1160                 close_nointr_nofail(ep);
1161
1162         if (signal_fd >= 0)
1163                 close_nointr_nofail(signal_fd);
1164
1165         return r;
1166 }
1167
1168 static int register_machine(void) {
1169         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1170         _cleanup_bus_unref_ sd_bus *bus = NULL;
1171         int r;
1172
1173         r = sd_bus_open_system(&bus);
1174         if (r < 0) {
1175                 log_error("Failed to open system bus: %s", strerror(-r));
1176                 return r;
1177         }
1178
1179         r = sd_bus_call_method(
1180                         bus,
1181                         "org.freedesktop.machine1",
1182                         "/org/freedesktop/machine1",
1183                         "org.freedesktop.machine1.Manager",
1184                         "CreateMachine",
1185                         &error,
1186                         NULL,
1187                         "sayssusa(sv)",
1188                         arg_machine,
1189                         SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1190                         "nspawn",
1191                         "container",
1192                         (uint32_t) 0,
1193                         strempty(arg_directory),
1194                         1, "Slice", "s", strempty(arg_slice));
1195         if (r < 0) {
1196                 log_error("Failed to register machine: %s", error.message ? error.message : strerror(-r));
1197                 return r;
1198         }
1199
1200         return 0;
1201 }
1202
/* Probes for the kernel audit subsystem by trying to open a
 * NETLINK_AUDIT netlink socket. Returns true if the socket could be
 * created (it is closed again right away), false otherwise. */
static bool audit_enabled(void) {
        int probe_fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);

        if (probe_fd < 0)
                return false;

        close_nointr_nofail(probe_fd);
        return true;
}
1213
1214 int main(int argc, char *argv[]) {
1215         pid_t pid = 0;
1216         int r = EXIT_FAILURE, k;
1217         _cleanup_close_ int master = -1;
1218         int n_fd_passed;
1219         const char *console = NULL;
1220         struct termios saved_attr, raw_attr;
1221         sigset_t mask;
1222         bool saved_attr_valid = false;
1223         struct winsize ws;
1224         int kmsg_socket_pair[2] = { -1, -1 };
1225         _cleanup_fdset_free_ FDSet *fds = NULL;
1226
1227         log_parse_environment();
1228         log_open();
1229
1230         k = parse_argv(argc, argv);
1231         if (k < 0)
1232                 goto finish;
1233         else if (k == 0) {
1234                 r = EXIT_SUCCESS;
1235                 goto finish;
1236         }
1237
1238         if (arg_directory) {
1239                 char *p;
1240
1241                 p = path_make_absolute_cwd(arg_directory);
1242                 free(arg_directory);
1243                 arg_directory = p;
1244         } else
1245                 arg_directory = get_current_dir_name();
1246
1247         if (!arg_directory) {
1248                 log_error("Failed to determine path, please use -D.");
1249                 goto finish;
1250         }
1251
1252         path_kill_slashes(arg_directory);
1253
1254         if (!arg_machine) {
1255                 arg_machine = strdup(path_get_file_name(arg_directory));
1256                 if (!arg_machine) {
1257                         log_oom();
1258                         goto finish;
1259                 }
1260
1261                 hostname_cleanup(arg_machine, false);
1262                 if (isempty(arg_machine)) {
1263                         log_error("Failed to determine machine name automatically, please use -M.");
1264                         goto finish;
1265                 }
1266         }
1267
1268         if (geteuid() != 0) {
1269                 log_error("Need to be root.");
1270                 goto finish;
1271         }
1272
1273         if (sd_booted() <= 0) {
1274                 log_error("Not running on a systemd system.");
1275                 goto finish;
1276         }
1277
1278         if (arg_boot && audit_enabled()) {
1279                 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1280                             "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1281                             "line before using systemd-nspawn. Sleeping for 5s...\n");
1282                 sleep(5);
1283         }
1284
1285         if (path_equal(arg_directory, "/")) {
1286                 log_error("Spawning container on root directory not supported.");
1287                 goto finish;
1288         }
1289
1290         if (path_is_os_tree(arg_directory) <= 0) {
1291                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1292                 goto finish;
1293         }
1294
1295         log_close();
1296         n_fd_passed = sd_listen_fds(false);
1297         if (n_fd_passed > 0) {
1298                 k = fdset_new_listen_fds(&fds, false);
1299                 if (k < 0) {
1300                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1301                         goto finish;
1302                 }
1303         }
1304         fdset_close_others(fds);
1305         log_open();
1306
1307         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1308         if (master < 0) {
1309                 log_error("Failed to acquire pseudo tty: %m");
1310                 goto finish;
1311         }
1312
1313         console = ptsname(master);
1314         if (!console) {
1315                 log_error("Failed to determine tty name: %m");
1316                 goto finish;
1317         }
1318
1319         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1320
1321         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1322                 ioctl(master, TIOCSWINSZ, &ws);
1323
1324         if (unlockpt(master) < 0) {
1325                 log_error("Failed to unlock tty: %m");
1326                 goto finish;
1327         }
1328
1329         if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1330                 saved_attr_valid = true;
1331
1332                 raw_attr = saved_attr;
1333                 cfmakeraw(&raw_attr);
1334                 raw_attr.c_lflag &= ~ECHO;
1335         }
1336
1337         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1338                 log_error("Failed to create kmsg socket pair.");
1339                 goto finish;
1340         }
1341
1342         sd_notify(0, "READY=1");
1343
1344         assert_se(sigemptyset(&mask) == 0);
1345         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1346         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1347
1348         for (;;) {
1349                 siginfo_t status;
1350                 int pipefd[2], pipefd2[2];
1351
1352                 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1353                         log_error("pipe2(): %m");
1354                         goto finish;
1355                 }
1356
1357                 if (pipe2(pipefd2, O_NONBLOCK|O_CLOEXEC) < 0) {
1358                         log_error("pipe2(): %m");
1359                         close_pipe(pipefd);
1360                         goto finish;
1361                 }
1362
1363                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1364                 if (pid < 0) {
1365                         if (errno == EINVAL)
1366                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1367                         else
1368                                 log_error("clone() failed: %m");
1369
1370                         goto finish;
1371                 }
1372
1373                 if (pid == 0) {
1374                         /* child */
1375                         const char *home = NULL;
1376                         uid_t uid = (uid_t) -1;
1377                         gid_t gid = (gid_t) -1;
1378                         unsigned n_env = 2;
1379                         const char *envp[] = {
1380                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1381                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1382                                 NULL, /* TERM */
1383                                 NULL, /* HOME */
1384                                 NULL, /* USER */
1385                                 NULL, /* LOGNAME */
1386                                 NULL, /* container_uuid */
1387                                 NULL, /* LISTEN_FDS */
1388                                 NULL, /* LISTEN_PID */
1389                                 NULL
1390                         };
1391
1392                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1393                         if (envp[n_env])
1394                                 n_env ++;
1395
1396                         /* Wait for the parent process to log our PID */
1397                         close_nointr_nofail(pipefd[1]);
1398                         fd_wait_for_event(pipefd[0], POLLHUP, -1);
1399                         close_nointr_nofail(pipefd[0]);
1400
1401                         close_nointr_nofail(master);
1402                         master = -1;
1403
1404                         if (saved_attr_valid) {
1405                                 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1406                                         log_error("Failed to set terminal attributes: %m");
1407                                         goto child_fail;
1408                                 }
1409                         }
1410
1411                         close_nointr(STDIN_FILENO);
1412                         close_nointr(STDOUT_FILENO);
1413                         close_nointr(STDERR_FILENO);
1414
1415                         close_nointr_nofail(kmsg_socket_pair[0]);
1416                         kmsg_socket_pair[0] = -1;
1417
1418                         reset_all_signal_handlers();
1419
1420                         assert_se(sigemptyset(&mask) == 0);
1421                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1422
1423                         k = open_terminal(console, O_RDWR);
1424                         if (k != STDIN_FILENO) {
1425                                 if (k >= 0) {
1426                                         close_nointr_nofail(k);
1427                                         k = -EINVAL;
1428                                 }
1429
1430                                 log_error("Failed to open console: %s", strerror(-k));
1431                                 goto child_fail;
1432                         }
1433
1434                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1435                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1436                                 log_error("Failed to duplicate console: %m");
1437                                 goto child_fail;
1438                         }
1439
1440                         if (setsid() < 0) {
1441                                 log_error("setsid() failed: %m");
1442                                 goto child_fail;
1443                         }
1444
1445                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1446                                 log_error("PR_SET_PDEATHSIG failed: %m");
1447                                 goto child_fail;
1448                         }
1449
1450                         close_pipe(pipefd2);
1451
1452                         r = register_machine();
1453                         if (r < 0)
1454                                 goto finish;
1455
1456                         /* Mark everything as slave, so that we still
1457                          * receive mounts from the real root, but don't
1458                          * propagate mounts to the real root. */
1459                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1460                                 log_error("MS_SLAVE|MS_REC failed: %m");
1461                                 goto child_fail;
1462                         }
1463
1464                         /* Turn directory into bind mount */
1465                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1466                                 log_error("Failed to make bind mount.");
1467                                 goto child_fail;
1468                         }
1469
1470                         if (arg_read_only)
1471                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1472                                         log_error("Failed to make read-only.");
1473                                         goto child_fail;
1474                                 }
1475
1476                         if (mount_all(arg_directory) < 0)
1477                                 goto child_fail;
1478
1479                         if (copy_devnodes(arg_directory) < 0)
1480                                 goto child_fail;
1481
1482                         if (setup_ptmx(arg_directory) < 0)
1483                                 goto child_fail;
1484
1485                         dev_setup(arg_directory);
1486
1487                         if (setup_dev_console(arg_directory, console) < 0)
1488                                 goto child_fail;
1489
1490                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1491                                 goto child_fail;
1492
1493                         close_nointr_nofail(kmsg_socket_pair[1]);
1494                         kmsg_socket_pair[1] = -1;
1495
1496                         if (setup_boot_id(arg_directory) < 0)
1497                                 goto child_fail;
1498
1499                         if (setup_timezone(arg_directory) < 0)
1500                                 goto child_fail;
1501
1502                         if (setup_resolv_conf(arg_directory) < 0)
1503                                 goto child_fail;
1504
1505                         if (setup_journal(arg_directory) < 0)
1506                                 goto child_fail;
1507
1508                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1509                                 goto child_fail;
1510
1511                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1512                                 goto child_fail;
1513
1514                         if (chdir(arg_directory) < 0) {
1515                                 log_error("chdir(%s) failed: %m", arg_directory);
1516                                 goto child_fail;
1517                         }
1518
1519                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1520                                 log_error("mount(MS_MOVE) failed: %m");
1521                                 goto child_fail;
1522                         }
1523
1524                         if (chroot(".") < 0) {
1525                                 log_error("chroot() failed: %m");
1526                                 goto child_fail;
1527                         }
1528
1529                         if (chdir("/") < 0) {
1530                                 log_error("chdir() failed: %m");
1531                                 goto child_fail;
1532                         }
1533
1534                         umask(0022);
1535
1536                         loopback_setup();
1537
1538                         if (drop_capabilities() < 0) {
1539                                 log_error("drop_capabilities() failed: %m");
1540                                 goto child_fail;
1541                         }
1542
1543                         if (arg_user) {
1544
1545                                 /* Note that this resolves user names
1546                                  * inside the container, and hence
1547                                  * accesses the NSS modules from the
1548                                  * container and not the host. This is
1549                                  * a bit weird... */
1550
1551                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1552                                         log_error("get_user_creds() failed: %m");
1553                                         goto child_fail;
1554                                 }
1555
1556                                 if (mkdir_parents_label(home, 0775) < 0) {
1557                                         log_error("mkdir_parents_label() failed: %m");
1558                                         goto child_fail;
1559                                 }
1560
1561                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1562                                         log_error("mkdir_safe_label() failed: %m");
1563                                         goto child_fail;
1564                                 }
1565
1566                                 if (initgroups((const char*)arg_user, gid) < 0) {
1567                                         log_error("initgroups() failed: %m");
1568                                         goto child_fail;
1569                                 }
1570
1571                                 if (setresgid(gid, gid, gid) < 0) {
1572                                         log_error("setregid() failed: %m");
1573                                         goto child_fail;
1574                                 }
1575
1576                                 if (setresuid(uid, uid, uid) < 0) {
1577                                         log_error("setreuid() failed: %m");
1578                                         goto child_fail;
1579                                 }
1580                         } else {
1581                                 /* Reset everything fully to 0, just in case */
1582
1583                                 if (setgroups(0, NULL) < 0) {
1584                                         log_error("setgroups() failed: %m");
1585                                         goto child_fail;
1586                                 }
1587
1588                                 if (setresgid(0, 0, 0) < 0) {
1589                                         log_error("setregid() failed: %m");
1590                                         goto child_fail;
1591                                 }
1592
1593                                 if (setresuid(0, 0, 0) < 0) {
1594                                         log_error("setreuid() failed: %m");
1595                                         goto child_fail;
1596                                 }
1597                         }
1598
1599                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1600                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1601                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1602                                 log_oom();
1603                                 goto child_fail;
1604                         }
1605
1606                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1607                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1608                                         log_oom();
1609                                         goto child_fail;
1610                                 }
1611                         }
1612
1613                         if (fdset_size(fds) > 0) {
1614                                 k = fdset_cloexec(fds, false);
1615                                 if (k < 0) {
1616                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1617                                         goto child_fail;
1618                                 }
1619
1620                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1621                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1622                                         log_oom();
1623                                         goto child_fail;
1624                                 }
1625                         }
1626
1627                         setup_hostname();
1628
1629                         if (arg_boot) {
1630                                 char **a;
1631                                 size_t l;
1632
1633                                 /* Automatically search for the init system */
1634
1635                                 l = 1 + argc - optind;
1636                                 a = newa(char*, l + 1);
1637                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1638
1639                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1640                                 execve(a[0], a, (char**) envp);
1641
1642                                 a[0] = (char*) "/lib/systemd/systemd";
1643                                 execve(a[0], a, (char**) envp);
1644
1645                                 a[0] = (char*) "/sbin/init";
1646                                 execve(a[0], a, (char**) envp);
1647                         } else if (argc > optind)
1648                                 execvpe(argv[optind], argv + optind, (char**) envp);
1649                         else {
1650                                 chdir(home ? home : "/root");
1651                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1652                         }
1653
1654                         log_error("execv() failed: %m");
1655
1656                 child_fail:
1657                         _exit(EXIT_FAILURE);
1658                 }
1659
1660                 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1661                 close_nointr_nofail(pipefd[0]);
1662                 close_nointr_nofail(pipefd[1]);
1663
1664                 /* Wait for the child process to establish cgroup hierarchy */
1665                 close_nointr_nofail(pipefd2[1]);
1666                 fd_wait_for_event(pipefd2[0], POLLHUP, -1);
1667                 close_nointr_nofail(pipefd2[0]);
1668
1669                 fdset_free(fds);
1670                 fds = NULL;
1671
1672                 if (process_pty(master, pid, &mask) < 0)
1673                         goto finish;
1674
1675                 if (saved_attr_valid)
1676                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1677
1678                 k = wait_for_terminate(pid, &status);
1679                 if (k < 0) {
1680                         r = EXIT_FAILURE;
1681                         break;
1682                 }
1683
1684                 if (status.si_code == CLD_EXITED) {
1685                         r = status.si_status;
1686                         if (status.si_status != 0) {
1687                                 log_error("Container failed with error code %i.", status.si_status);
1688                                 break;
1689                         }
1690
1691                         log_debug("Container exited successfully.");
1692                         break;
1693                 } else if (status.si_code == CLD_KILLED &&
1694                            status.si_status == SIGINT) {
1695                         log_info("Container has been shut down.");
1696                         r = 0;
1697                         break;
1698                 } else if (status.si_code == CLD_KILLED &&
1699                            status.si_status == SIGHUP) {
1700                         log_info("Container is being rebooted.");
1701                         continue;
1702                 } else if (status.si_code == CLD_KILLED ||
1703                            status.si_code == CLD_DUMPED) {
1704
1705                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1706                         r = EXIT_FAILURE;
1707                         break;
1708                 } else {
1709                         log_error("Container failed due to unknown reason.");
1710                         r = EXIT_FAILURE;
1711                         break;
1712                 }
1713         }
1714
1715 finish:
1716         if (saved_attr_valid)
1717                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1718
1719         close_pipe(kmsg_socket_pair);
1720
1721         if (pid > 0)
1722                 kill(pid, SIGKILL);
1723
1724         free(arg_directory);
1725         free(arg_machine);
1726
1727         return r;
1728 }