chiark / gitweb /
759f9c1aef046729ced5bfdadac8ab3066dda296
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44 #ifdef HAVE_SELINUX
45 #include <selinux/selinux.h>
46 #endif
47
48 #include "sd-daemon.h"
49 #include "sd-bus.h"
50 #include "sd-id128.h"
51 #include "log.h"
52 #include "util.h"
53 #include "mkdir.h"
54 #include "macro.h"
55 #include "audit.h"
56 #include "missing.h"
57 #include "cgroup-util.h"
58 #include "strv.h"
59 #include "path-util.h"
60 #include "loopback-setup.h"
61 #include "dev-setup.h"
62 #include "fdset.h"
63 #include "build.h"
64 #include "fileio.h"
65 #include "bus-util.h"
66 #include "bus-error.h"
67 #include "ptyfwd.h"
68 #include "bus-kernel.h"
69 #include "env-util.h"
70 #include "def.h"
71
/* Journal linking mode, selected with --link-journal= (or -j). */
enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto; initial value of arg_link_journal */
        LINK_HOST,      /* --link-journal=host */
        LINK_GUEST      /* --link-journal=guest; also what -j selects */
};

typedef enum LinkJournal LinkJournal;
78
79 static char *arg_directory = NULL;
80 static char *arg_user = NULL;
81 static sd_id128_t arg_uuid = {};
82 static char *arg_machine = NULL;
83 static char *arg_selinux_context = NULL;
84 static char *arg_selinux_apifs_context = NULL;
85 static const char *arg_slice = NULL;
86 static bool arg_private_network = false;
87 static bool arg_read_only = false;
88 static bool arg_boot = false;
89 static LinkJournal arg_link_journal = LINK_AUTO;
90 static uint64_t arg_retain =
91         (1ULL << CAP_CHOWN) |
92         (1ULL << CAP_DAC_OVERRIDE) |
93         (1ULL << CAP_DAC_READ_SEARCH) |
94         (1ULL << CAP_FOWNER) |
95         (1ULL << CAP_FSETID) |
96         (1ULL << CAP_IPC_OWNER) |
97         (1ULL << CAP_KILL) |
98         (1ULL << CAP_LEASE) |
99         (1ULL << CAP_LINUX_IMMUTABLE) |
100         (1ULL << CAP_NET_BIND_SERVICE) |
101         (1ULL << CAP_NET_BROADCAST) |
102         (1ULL << CAP_NET_RAW) |
103         (1ULL << CAP_SETGID) |
104         (1ULL << CAP_SETFCAP) |
105         (1ULL << CAP_SETPCAP) |
106         (1ULL << CAP_SETUID) |
107         (1ULL << CAP_SYS_ADMIN) |
108         (1ULL << CAP_SYS_CHROOT) |
109         (1ULL << CAP_SYS_NICE) |
110         (1ULL << CAP_SYS_PTRACE) |
111         (1ULL << CAP_SYS_TTY_CONFIG) |
112         (1ULL << CAP_SYS_RESOURCE) |
113         (1ULL << CAP_SYS_BOOT) |
114         (1ULL << CAP_AUDIT_WRITE) |
115         (1ULL << CAP_AUDIT_CONTROL) |
116         (1ULL << CAP_MKNOD);
117 static char **arg_bind = NULL;
118 static char **arg_bind_ro = NULL;
119 static char **arg_setenv = NULL;
120 static bool arg_quiet = false;
121 static bool arg_share_system = false;
122
123 static int help(void) {
124
125         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
126                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
127                "  -h --help                 Show this help\n"
128                "     --version              Print version string\n"
129                "  -D --directory=NAME       Root directory for the container\n"
130                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
131                "  -u --user=USER            Run the command under specified user or uid\n"
132                "     --uuid=UUID            Set a specific machine UUID for the container\n"
133                "  -M --machine=NAME         Set the machine name for the container\n"
134                "  -S --slice=SLICE          Place the container in the specified slice\n"
135                "  -Z --selinux-context=SECLABEL\n"
136                "                            Set the SELinux security context to be used by\n"
137                "                            processes in the container\n"
138                "  -L --selinux-apifs-context=SECLABEL\n"
139                "                            Set the SELinux security context to be used by\n"
140                "                            API/tmpfs file systems in the container\n"
141                "     --private-network      Disable network in container\n"
142                "     --share-system         Share system namespaces with host\n"
143                "     --read-only            Mount the root directory read-only\n"
144                "     --capability=CAP       In addition to the default, retain specified\n"
145                "                            capability\n"
146                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
147                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
148                "  -j                        Equivalent to --link-journal=host\n"
149                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
150                "                            the container\n"
151                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
152                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
153                "  -q --quiet                Do not show status information\n",
154                program_invocation_short_name);
155
156         return 0;
157 }
158
159 static int parse_argv(int argc, char *argv[]) {
160
161         enum {
162                 ARG_VERSION = 0x100,
163                 ARG_PRIVATE_NETWORK,
164                 ARG_UUID,
165                 ARG_READ_ONLY,
166                 ARG_CAPABILITY,
167                 ARG_DROP_CAPABILITY,
168                 ARG_LINK_JOURNAL,
169                 ARG_BIND,
170                 ARG_BIND_RO,
171                 ARG_SETENV,
172                 ARG_SHARE_SYSTEM
173         };
174
175         static const struct option options[] = {
176                 { "help",                  no_argument,       NULL, 'h'                 },
177                 { "version",               no_argument,       NULL, ARG_VERSION         },
178                 { "directory",             required_argument, NULL, 'D'                 },
179                 { "user",                  required_argument, NULL, 'u'                 },
180                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK },
181                 { "boot",                  no_argument,       NULL, 'b'                 },
182                 { "uuid",                  required_argument, NULL, ARG_UUID            },
183                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY       },
184                 { "capability",            required_argument, NULL, ARG_CAPABILITY      },
185                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY },
186                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL    },
187                 { "bind",                  required_argument, NULL, ARG_BIND            },
188                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO         },
189                 { "machine",               required_argument, NULL, 'M'                 },
190                 { "slice",                 required_argument, NULL, 'S'                 },
191                 { "setenv",                required_argument, NULL, ARG_SETENV          },
192                 { "selinux-context",       required_argument, NULL, 'Z'                 },
193                 { "selinux-apifs-context", required_argument, NULL, 'L'                 },
194                 { "quiet",                 no_argument,       NULL, 'q'                 },
195                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM    },
196                 {}
197         };
198
199         int c, r;
200
201         assert(argc >= 0);
202         assert(argv);
203
204         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
205
206                 switch (c) {
207
208                 case 'h':
209                         return help();
210
211                 case ARG_VERSION:
212                         puts(PACKAGE_STRING);
213                         puts(SYSTEMD_FEATURES);
214                         return 0;
215
216                 case 'D':
217                         free(arg_directory);
218                         arg_directory = canonicalize_file_name(optarg);
219                         if (!arg_directory) {
220                                 log_error("Invalid root directory: %m");
221                                 return -ENOMEM;
222                         }
223
224                         break;
225
226                 case 'u':
227                         free(arg_user);
228                         arg_user = strdup(optarg);
229                         if (!arg_user)
230                                 return log_oom();
231
232                         break;
233
234                 case ARG_PRIVATE_NETWORK:
235                         arg_private_network = true;
236                         break;
237
238                 case 'b':
239                         arg_boot = true;
240                         break;
241
242                 case ARG_UUID:
243                         r = sd_id128_from_string(optarg, &arg_uuid);
244                         if (r < 0) {
245                                 log_error("Invalid UUID: %s", optarg);
246                                 return r;
247                         }
248                         break;
249
250                 case 'S':
251                         arg_slice = strdup(optarg);
252                         if (!arg_slice)
253                                 return log_oom();
254
255                         break;
256
257                 case 'M':
258                         if (!hostname_is_valid(optarg)) {
259                                 log_error("Invalid machine name: %s", optarg);
260                                 return -EINVAL;
261                         }
262
263                         free(arg_machine);
264                         arg_machine = strdup(optarg);
265                         if (!arg_machine)
266                                 return log_oom();
267
268                         break;
269
270                 case 'Z':
271                         arg_selinux_context = optarg;
272                         break;
273
274                 case 'L':
275                         arg_selinux_apifs_context = optarg;
276                         break;
277
278                 case ARG_READ_ONLY:
279                         arg_read_only = true;
280                         break;
281
282                 case ARG_CAPABILITY:
283                 case ARG_DROP_CAPABILITY: {
284                         char *state, *word;
285                         size_t length;
286
287                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
288                                 cap_value_t cap;
289                                 char *t;
290
291                                 t = strndup(word, length);
292                                 if (!t)
293                                         return log_oom();
294
295                                 if (cap_from_name(t, &cap) < 0) {
296                                         log_error("Failed to parse capability %s.", t);
297                                         free(t);
298                                         return -EINVAL;
299                                 }
300
301                                 free(t);
302
303                                 if (c == ARG_CAPABILITY)
304                                         arg_retain |= 1ULL << (uint64_t) cap;
305                                 else
306                                         arg_retain &= ~(1ULL << (uint64_t) cap);
307                         }
308
309                         break;
310                 }
311
312                 case 'j':
313                         arg_link_journal = LINK_GUEST;
314                         break;
315
316                 case ARG_LINK_JOURNAL:
317                         if (streq(optarg, "auto"))
318                                 arg_link_journal = LINK_AUTO;
319                         else if (streq(optarg, "no"))
320                                 arg_link_journal = LINK_NO;
321                         else if (streq(optarg, "guest"))
322                                 arg_link_journal = LINK_GUEST;
323                         else if (streq(optarg, "host"))
324                                 arg_link_journal = LINK_HOST;
325                         else {
326                                 log_error("Failed to parse link journal mode %s", optarg);
327                                 return -EINVAL;
328                         }
329
330                         break;
331
332                 case ARG_BIND:
333                 case ARG_BIND_RO: {
334                         _cleanup_free_ char *a = NULL, *b = NULL;
335                         char *e;
336                         char ***x;
337
338                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
339
340                         e = strchr(optarg, ':');
341                         if (e) {
342                                 a = strndup(optarg, e - optarg);
343                                 b = strdup(e + 1);
344                         } else {
345                                 a = strdup(optarg);
346                                 b = strdup(optarg);
347                         }
348
349                         if (!a || !b)
350                                 return log_oom();
351
352                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
353                                 log_error("Invalid bind mount specification: %s", optarg);
354                                 return -EINVAL;
355                         }
356
357                         r = strv_extend(x, a);
358                         if (r < 0)
359                                 return log_oom();
360
361                         r = strv_extend(x, b);
362                         if (r < 0)
363                                 return log_oom();
364
365                         break;
366                 }
367
368                 case ARG_SETENV: {
369                         char **n;
370
371                         if (!env_assignment_is_valid(optarg)) {
372                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
373                                 return -EINVAL;
374                         }
375
376                         n = strv_env_set(arg_setenv, optarg);
377                         if (!n)
378                                 return log_oom();
379
380                         strv_free(arg_setenv);
381                         arg_setenv = n;
382                         break;
383                 }
384
385                 case 'q':
386                         arg_quiet = true;
387                         break;
388
389                 case ARG_SHARE_SYSTEM:
390                         arg_share_system = true;
391                         break;
392
393                 case '?':
394                         return -EINVAL;
395
396                 default:
397                         assert_not_reached("Unhandled option");
398                 }
399         }
400
401         return 1;
402 }
403
404 static int mount_all(const char *dest) {
405
406         typedef struct MountPoint {
407                 const char *what;
408                 const char *where;
409                 const char *type;
410                 const char *options;
411                 unsigned long flags;
412                 bool fatal;
413         } MountPoint;
414
415         static const MountPoint mount_table[] = {
416                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
417                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
418                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
419                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
420                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
421                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
422                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
423                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
424 #ifdef HAVE_SELINUX
425                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
426                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
427 #endif
428         };
429
430         unsigned k;
431         int r = 0;
432
433         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
434                 _cleanup_free_ char *where = NULL;
435 #ifdef HAVE_SELINUX
436                 _cleanup_free_ char *options = NULL;
437 #endif
438                 const char *o;
439                 int t;
440
441                 where = strjoin(dest, "/", mount_table[k].where, NULL);
442                 if (!where)
443                         return log_oom();
444
445                 t = path_is_mount_point(where, true);
446                 if (t < 0) {
447                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
448
449                         if (r == 0)
450                                 r = t;
451
452                         continue;
453                 }
454
455                 /* Skip this entry if it is not a remount. */
456                 if (mount_table[k].what && t > 0)
457                         continue;
458
459                 mkdir_p(where, 0755);
460
461 #ifdef HAVE_SELINUX
462                 if (arg_selinux_apifs_context &&
463                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
464                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
465                         if (!options)
466                                 return log_oom();
467
468                         o = options;
469                 } else
470 #endif
471                         o = mount_table[k].options;
472
473
474                 if (mount(mount_table[k].what,
475                           where,
476                           mount_table[k].type,
477                           mount_table[k].flags,
478                           o) < 0 &&
479                     mount_table[k].fatal) {
480
481                         log_error("mount(%s) failed: %m", where);
482
483                         if (r == 0)
484                                 r = -errno;
485                 }
486         }
487
488         return r;
489 }
490
491 static int mount_binds(const char *dest, char **l, unsigned long flags) {
492         char **x, **y;
493
494         STRV_FOREACH_PAIR(x, y, l) {
495                 char *where;
496                 struct stat source_st, dest_st;
497                 int r;
498
499                 if (stat(*x, &source_st) < 0) {
500                         log_error("failed to stat %s: %m", *x);
501                         return -errno;
502                 }
503
504                 where = strappenda(dest, *y);
505                 r = stat(where, &dest_st);
506                 if (r == 0) {
507                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
508                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
509                                                 *x, where);
510                                 return -EINVAL;
511                         }
512                 } else if (errno == ENOENT) {
513                         r = mkdir_parents_label(where, 0755);
514                         if (r < 0) {
515                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
516                                 return r;
517                         }
518                 } else {
519                         log_error("Failed to bind mount %s: %s", *x, strerror(errno));
520                         return -errno;
521                 }
522                 /* Create the mount point, but be conservative -- refuse to create block
523                 * and char devices. */
524                 if (S_ISDIR(source_st.st_mode))
525                         mkdir_label(where, 0755);
526                 else if (S_ISFIFO(source_st.st_mode))
527                         mkfifo(where, 0644);
528                 else if (S_ISSOCK(source_st.st_mode))
529                         mknod(where, 0644 | S_IFSOCK, 0);
530                 else if (S_ISREG(source_st.st_mode))
531                         touch(where);
532                 else {
533                         log_error("Refusing to create mountpoint for file: %s", *x);
534                         return -ENOTSUP;
535                 }
536
537                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
538                         log_error("mount(%s) failed: %m", where);
539                         return -errno;
540                 }
541
542                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
543                         log_error("mount(%s) failed: %m", where);
544                         return -errno;
545                 }
546         }
547
548         return 0;
549 }
550
551 static int setup_timezone(const char *dest) {
552         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
553         char *z, *y;
554         int r;
555
556         assert(dest);
557
558         /* Fix the timezone, if possible */
559         r = readlink_malloc("/etc/localtime", &p);
560         if (r < 0) {
561                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
562                 return 0;
563         }
564
565         z = path_startswith(p, "../usr/share/zoneinfo/");
566         if (!z)
567                 z = path_startswith(p, "/usr/share/zoneinfo/");
568         if (!z) {
569                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
570                 return 0;
571         }
572
573         where = strappend(dest, "/etc/localtime");
574         if (!where)
575                 return log_oom();
576
577         r = readlink_malloc(where, &q);
578         if (r >= 0) {
579                 y = path_startswith(q, "../usr/share/zoneinfo/");
580                 if (!y)
581                         y = path_startswith(q, "/usr/share/zoneinfo/");
582
583
584                 /* Already pointing to the right place? Then do nothing .. */
585                 if (y && streq(y, z))
586                         return 0;
587         }
588
589         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
590         if (!check)
591                 return log_oom();
592
593         if (access(check, F_OK) < 0) {
594                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
595                 return 0;
596         }
597
598         what = strappend("../usr/share/zoneinfo/", z);
599         if (!what)
600                 return log_oom();
601
602         unlink(where);
603         if (symlink(what, where) < 0) {
604                 log_error("Failed to correct timezone of container: %m");
605                 return 0;
606         }
607
608         return 0;
609 }
610
611 static int setup_resolv_conf(const char *dest) {
612         char _cleanup_free_ *where = NULL;
613
614         assert(dest);
615
616         if (arg_private_network)
617                 return 0;
618
619         /* Fix resolv.conf, if possible */
620         where = strappend(dest, "/etc/resolv.conf");
621         if (!where)
622                 return log_oom();
623
624         /* We don't really care for the results of this really. If it
625          * fails, it fails, but meh... */
626         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
627
628         return 0;
629 }
630
631 static int setup_boot_id(const char *dest) {
632         _cleanup_free_ char *from = NULL, *to = NULL;
633         sd_id128_t rnd;
634         char as_uuid[37];
635         int r;
636
637         assert(dest);
638
639         /* Generate a new randomized boot ID, so that each boot-up of
640          * the container gets a new one */
641
642         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
643         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
644         if (!from || !to)
645                 return log_oom();
646
647         r = sd_id128_randomize(&rnd);
648         if (r < 0) {
649                 log_error("Failed to generate random boot id: %s", strerror(-r));
650                 return r;
651         }
652
653         snprintf(as_uuid, sizeof(as_uuid),
654                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
655                  SD_ID128_FORMAT_VAL(rnd));
656         char_array_0(as_uuid);
657
658         r = write_string_file(from, as_uuid);
659         if (r < 0) {
660                 log_error("Failed to write boot id: %s", strerror(-r));
661                 return r;
662         }
663
664         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
665                 log_error("Failed to bind mount boot id: %m");
666                 r = -errno;
667         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
668                 log_warning("Failed to make boot id read-only: %m");
669
670         unlink(from);
671         return r;
672 }
673
674 static int copy_devnodes(const char *dest) {
675
676         static const char devnodes[] =
677                 "null\0"
678                 "zero\0"
679                 "full\0"
680                 "random\0"
681                 "urandom\0"
682                 "tty\0";
683
684         const char *d;
685         int r = 0;
686         _cleanup_umask_ mode_t u;
687
688         assert(dest);
689
690         u = umask(0000);
691
692         NULSTR_FOREACH(d, devnodes) {
693                 _cleanup_free_ char *from = NULL, *to = NULL;
694                 struct stat st;
695
696                 from = strappend("/dev/", d);
697                 to = strjoin(dest, "/dev/", d, NULL);
698                 if (!from || !to)
699                         return log_oom();
700
701                 if (stat(from, &st) < 0) {
702
703                         if (errno != ENOENT) {
704                                 log_error("Failed to stat %s: %m", from);
705                                 return -errno;
706                         }
707
708                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
709
710                         log_error("%s is not a char or block device, cannot copy", from);
711                         return -EIO;
712
713                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
714
715                         log_error("mknod(%s) failed: %m", dest);
716                         return  -errno;
717                 }
718         }
719
720         return r;
721 }
722
723 static int setup_ptmx(const char *dest) {
724         _cleanup_free_ char *p = NULL;
725
726         p = strappend(dest, "/dev/ptmx");
727         if (!p)
728                 return log_oom();
729
730         if (symlink("pts/ptmx", p) < 0) {
731                 log_error("Failed to create /dev/ptmx symlink: %m");
732                 return -errno;
733         }
734
735         return 0;
736 }
737
738 static int setup_dev_console(const char *dest, const char *console) {
739         struct stat st;
740         _cleanup_free_ char *to = NULL;
741         int r;
742         _cleanup_umask_ mode_t u;
743
744         assert(dest);
745         assert(console);
746
747         u = umask(0000);
748
749         if (stat(console, &st) < 0) {
750                 log_error("Failed to stat %s: %m", console);
751                 return -errno;
752
753         } else if (!S_ISCHR(st.st_mode)) {
754                 log_error("/dev/console is not a char device");
755                 return -EIO;
756         }
757
758         r = chmod_and_chown(console, 0600, 0, 0);
759         if (r < 0) {
760                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
761                 return r;
762         }
763
764         if (asprintf(&to, "%s/dev/console", dest) < 0)
765                 return log_oom();
766
767         /* We need to bind mount the right tty to /dev/console since
768          * ptys can only exist on pts file systems. To have something
769          * to bind mount things on we create a device node first, that
770          * has the right major/minor (note that the major minor
771          * doesn't actually matter here, since we mount it over
772          * anyway). */
773
774         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
775                 log_error("mknod() for /dev/console failed: %m");
776                 return -errno;
777         }
778
779         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
780                 log_error("Bind mount for /dev/console failed: %m");
781                 return -errno;
782         }
783
784         return 0;
785 }
786
787 static int setup_kmsg(const char *dest, int kmsg_socket) {
788         _cleanup_free_ char *from = NULL, *to = NULL;
789         int r, fd, k;
790         _cleanup_umask_ mode_t u;
791         union {
792                 struct cmsghdr cmsghdr;
793                 uint8_t buf[CMSG_SPACE(sizeof(int))];
794         } control = {};
795         struct msghdr mh = {
796                 .msg_control = &control,
797                 .msg_controllen = sizeof(control),
798         };
799         struct cmsghdr *cmsg;
800
801         assert(dest);
802         assert(kmsg_socket >= 0);
803
804         u = umask(0000);
805
806         /* We create the kmsg FIFO as /dev/kmsg, but immediately
807          * delete it after bind mounting it to /proc/kmsg. While FIFOs
808          * on the reading side behave very similar to /proc/kmsg,
809          * their writing side behaves differently from /dev/kmsg in
810          * that writing blocks when nothing is reading. In order to
811          * avoid any problems with containers deadlocking due to this
812          * we simply make /dev/kmsg unavailable to the container. */
813         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
814             asprintf(&to, "%s/proc/kmsg", dest) < 0)
815                 return log_oom();
816
817         if (mkfifo(from, 0600) < 0) {
818                 log_error("mkfifo() for /dev/kmsg failed: %m");
819                 return -errno;
820         }
821
822         r = chmod_and_chown(from, 0600, 0, 0);
823         if (r < 0) {
824                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
825                 return r;
826         }
827
828         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
829                 log_error("Bind mount for /proc/kmsg failed: %m");
830                 return -errno;
831         }
832
833         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
834         if (fd < 0) {
835                 log_error("Failed to open fifo: %m");
836                 return -errno;
837         }
838
839         cmsg = CMSG_FIRSTHDR(&mh);
840         cmsg->cmsg_level = SOL_SOCKET;
841         cmsg->cmsg_type = SCM_RIGHTS;
842         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
843         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
844
845         mh.msg_controllen = cmsg->cmsg_len;
846
847         /* Store away the fd in the socket, so that it stays open as
848          * long as we run the child */
849         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
850         close_nointr_nofail(fd);
851
852         if (k < 0) {
853                 log_error("Failed to send FIFO fd: %m");
854                 return -errno;
855         }
856
857         /* And now make the FIFO unavailable as /dev/kmsg... */
858         unlink(from);
859         return 0;
860 }
861
862 static int setup_hostname(void) {
863
864         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
865                 return -errno;
866
867         return 0;
868 }
869
870 static int setup_journal(const char *directory) {
871         sd_id128_t machine_id, this_id;
872         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
873         char *id;
874         int r;
875
876         p = strappend(directory, "/etc/machine-id");
877         if (!p)
878                 return log_oom();
879
880         r = read_one_line_file(p, &b);
881         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
882                 return 0;
883         else if (r < 0) {
884                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
885                 return r;
886         }
887
888         id = strstrip(b);
889         if (isempty(id) && arg_link_journal == LINK_AUTO)
890                 return 0;
891
892         /* Verify validity */
893         r = sd_id128_from_string(id, &machine_id);
894         if (r < 0) {
895                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
896                 return r;
897         }
898
899         r = sd_id128_get_machine(&this_id);
900         if (r < 0) {
901                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
902                 return r;
903         }
904
905         if (sd_id128_equal(machine_id, this_id)) {
906                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
907                          "Host and machine ids are equal (%s): refusing to link journals", id);
908                 if (arg_link_journal == LINK_AUTO)
909                         return 0;
910                 return
911                         -EEXIST;
912         }
913
914         if (arg_link_journal == LINK_NO)
915                 return 0;
916
917         free(p);
918         p = strappend("/var/log/journal/", id);
919         q = strjoin(directory, "/var/log/journal/", id, NULL);
920         if (!p || !q)
921                 return log_oom();
922
923         if (path_is_mount_point(p, false) > 0) {
924                 if (arg_link_journal != LINK_AUTO) {
925                         log_error("%s: already a mount point, refusing to use for journal", p);
926                         return -EEXIST;
927                 }
928
929                 return 0;
930         }
931
932         if (path_is_mount_point(q, false) > 0) {
933                 if (arg_link_journal != LINK_AUTO) {
934                         log_error("%s: already a mount point, refusing to use for journal", q);
935                         return -EEXIST;
936                 }
937
938                 return 0;
939         }
940
941         r = readlink_and_make_absolute(p, &d);
942         if (r >= 0) {
943                 if ((arg_link_journal == LINK_GUEST ||
944                      arg_link_journal == LINK_AUTO) &&
945                     path_equal(d, q)) {
946
947                         r = mkdir_p(q, 0755);
948                         if (r < 0)
949                                 log_warning("failed to create directory %s: %m", q);
950                         return 0;
951                 }
952
953                 if (unlink(p) < 0) {
954                         log_error("Failed to remove symlink %s: %m", p);
955                         return -errno;
956                 }
957         } else if (r == -EINVAL) {
958
959                 if (arg_link_journal == LINK_GUEST &&
960                     rmdir(p) < 0) {
961
962                         if (errno == ENOTDIR) {
963                                 log_error("%s already exists and is neither a symlink nor a directory", p);
964                                 return r;
965                         } else {
966                                 log_error("Failed to remove %s: %m", p);
967                                 return -errno;
968                         }
969                 }
970         } else if (r != -ENOENT) {
971                 log_error("readlink(%s) failed: %m", p);
972                 return r;
973         }
974
975         if (arg_link_journal == LINK_GUEST) {
976
977                 if (symlink(q, p) < 0) {
978                         log_error("Failed to symlink %s to %s: %m", q, p);
979                         return -errno;
980                 }
981
982                 r = mkdir_p(q, 0755);
983                 if (r < 0)
984                         log_warning("failed to create directory %s: %m", q);
985                 return 0;
986         }
987
988         if (arg_link_journal == LINK_HOST) {
989                 r = mkdir_p(p, 0755);
990                 if (r < 0) {
991                         log_error("Failed to create %s: %m", p);
992                         return r;
993                 }
994
995         } else if (access(p, F_OK) < 0)
996                 return 0;
997
998         if (dir_is_empty(q) == 0) {
999                 log_error("%s not empty.", q);
1000                 return -ENOTEMPTY;
1001         }
1002
1003         r = mkdir_p(q, 0755);
1004         if (r < 0) {
1005                 log_error("Failed to create %s: %m", q);
1006                 return r;
1007         }
1008
1009         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1010                 log_error("Failed to bind mount journal from host into guest: %m");
1011                 return -errno;
1012         }
1013
1014         return 0;
1015 }
1016
/* Makes the kdbus domain at "path" available inside the container
 * tree by bind mounting it onto a freshly created <dest>/dev/kdbus.
 * Does nothing if path is NULL.
 *
 * Returns 0 on success (or if there was nothing to do), -errno
 * on failure. */
static int setup_kdbus(const char *dest, const char *path) {
        const char *mount_point;

        if (path == NULL)
                return 0;

        mount_point = strappenda(dest, "/dev/kdbus");

        if (mkdir(mount_point, 0755) < 0) {
                log_error("Failed to create kdbus path: %m");
                return -errno;
        }

        if (mount(path, mount_point, "bind", MS_BIND, NULL) < 0) {
                log_error("Failed to mount kdbus domain path: %m");
                return -errno;
        }

        return 0;
}
1036
1037 static int drop_capabilities(void) {
1038         return capability_bounding_set_drop(~arg_retain, false);
1039 }
1040
1041 static int register_machine(pid_t pid) {
1042         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1043         _cleanup_bus_unref_ sd_bus *bus = NULL;
1044         int r;
1045
1046         r = sd_bus_default_system(&bus);
1047         if (r < 0) {
1048                 log_error("Failed to open system bus: %s", strerror(-r));
1049                 return r;
1050         }
1051
1052         r = sd_bus_call_method(
1053                         bus,
1054                         "org.freedesktop.machine1",
1055                         "/org/freedesktop/machine1",
1056                         "org.freedesktop.machine1.Manager",
1057                         "CreateMachine",
1058                         &error,
1059                         NULL,
1060                         "sayssusa(sv)",
1061                         arg_machine,
1062                         SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1063                         "nspawn",
1064                         "container",
1065                         (uint32_t) pid,
1066                         strempty(arg_directory),
1067                         !isempty(arg_slice), "Slice", "s", arg_slice);
1068         if (r < 0) {
1069                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1070                 return r;
1071         }
1072
1073         return 0;
1074 }
1075
1076 static int terminate_machine(pid_t pid) {
1077         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1078         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1079         _cleanup_bus_unref_ sd_bus *bus = NULL;
1080         const char *path;
1081         int r;
1082
1083         r = sd_bus_default_system(&bus);
1084         if (r < 0) {
1085                 log_error("Failed to open system bus: %s", strerror(-r));
1086                 return r;
1087         }
1088
1089         r = sd_bus_call_method(
1090                         bus,
1091                         "org.freedesktop.machine1",
1092                         "/org/freedesktop/machine1",
1093                         "org.freedesktop.machine1.Manager",
1094                         "GetMachineByPID",
1095                         &error,
1096                         &reply,
1097                         "u",
1098                         (uint32_t) pid);
1099         if (r < 0) {
1100                 /* Note that the machine might already have been
1101                  * cleaned up automatically, hence don't consider it a
1102                  * failure if we cannot get the machine object. */
1103                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1104                 return 0;
1105         }
1106
1107         r = sd_bus_message_read(reply, "o", &path);
1108         if (r < 0)
1109                 return bus_log_parse_error(r);
1110
1111         r = sd_bus_call_method(
1112                         bus,
1113                         "org.freedesktop.machine1",
1114                         path,
1115                         "org.freedesktop.machine1.Machine",
1116                         "Terminate",
1117                         &error,
1118                         NULL,
1119                         NULL);
1120         if (r < 0) {
1121                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1122                 return 0;
1123         }
1124
1125         return 0;
1126 }
1127
/* Probes for the audit subsystem by trying to create a
 * NETLINK_AUDIT socket. Returns true if the socket could be
 * created (it is closed again right away), false otherwise. */
static bool audit_enabled(void) {
        int probe;

        probe = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
        if (probe < 0)
                return false;

        close_nointr_nofail(probe);
        return true;
}
1138
1139 int main(int argc, char *argv[]) {
1140         pid_t pid = 0;
1141         int r = EXIT_FAILURE, k;
1142         _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1143         int n_fd_passed;
1144         const char *console = NULL;
1145         sigset_t mask;
1146         _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1147         _cleanup_fdset_free_ FDSet *fds = NULL;
1148         _cleanup_free_ char *kdbus_domain = NULL;
1149         const char *ns;
1150
1151         log_parse_environment();
1152         log_open();
1153
1154         k = parse_argv(argc, argv);
1155         if (k < 0)
1156                 goto finish;
1157         else if (k == 0) {
1158                 r = EXIT_SUCCESS;
1159                 goto finish;
1160         }
1161
1162         if (arg_directory) {
1163                 char *p;
1164
1165                 p = path_make_absolute_cwd(arg_directory);
1166                 free(arg_directory);
1167                 arg_directory = p;
1168         } else
1169                 arg_directory = get_current_dir_name();
1170
1171         if (!arg_directory) {
1172                 log_error("Failed to determine path, please use -D.");
1173                 goto finish;
1174         }
1175
1176         path_kill_slashes(arg_directory);
1177
1178         if (!arg_machine) {
1179                 arg_machine = strdup(basename(arg_directory));
1180                 if (!arg_machine) {
1181                         log_oom();
1182                         goto finish;
1183                 }
1184
1185                 hostname_cleanup(arg_machine, false);
1186                 if (isempty(arg_machine)) {
1187                         log_error("Failed to determine machine name automatically, please use -M.");
1188                         goto finish;
1189                 }
1190         }
1191
1192         if (geteuid() != 0) {
1193                 log_error("Need to be root.");
1194                 goto finish;
1195         }
1196
1197         if (sd_booted() <= 0) {
1198                 log_error("Not running on a systemd system.");
1199                 goto finish;
1200         }
1201
1202         if (arg_boot && audit_enabled()) {
1203                 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1204                             "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1205                             "line before using systemd-nspawn. Sleeping for 5s...\n");
1206                 sleep(5);
1207         }
1208
1209         if (path_equal(arg_directory, "/")) {
1210                 log_error("Spawning container on root directory not supported.");
1211                 goto finish;
1212         }
1213
1214         if (arg_boot && path_is_os_tree(arg_directory) <= 0) {
1215                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1216                 goto finish;
1217         }
1218
1219         log_close();
1220         n_fd_passed = sd_listen_fds(false);
1221         if (n_fd_passed > 0) {
1222                 k = fdset_new_listen_fds(&fds, false);
1223                 if (k < 0) {
1224                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1225                         goto finish;
1226                 }
1227         }
1228         fdset_close_others(fds);
1229         log_open();
1230
1231         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1232         if (master < 0) {
1233                 log_error("Failed to acquire pseudo tty: %m");
1234                 goto finish;
1235         }
1236
1237         console = ptsname(master);
1238         if (!console) {
1239                 log_error("Failed to determine tty name: %m");
1240                 goto finish;
1241         }
1242
1243         if (!arg_quiet)
1244                 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1245
1246         if (unlockpt(master) < 0) {
1247                 log_error("Failed to unlock tty: %m");
1248                 goto finish;
1249         }
1250
1251         ns = strappenda("machine-", arg_machine);
1252         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
1253         if (r < 0)
1254                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1255         else
1256                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1257
1258         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1259                 log_error("Failed to create kmsg socket pair: %m");
1260                 goto finish;
1261         }
1262
1263         sd_notify(0, "READY=1");
1264
1265         assert_se(sigemptyset(&mask) == 0);
1266         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1267         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1268
1269         for (;;) {
1270                 siginfo_t status;
1271
1272                 sync_fd = eventfd(0, EFD_CLOEXEC);
1273                 if (sync_fd < 0) {
1274                         log_error("Failed to create event fd: %m");
1275                         goto finish;
1276                 }
1277
1278                 pid = syscall(__NR_clone,
1279                               SIGCHLD|CLONE_NEWNS|
1280                               (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
1281                               (arg_private_network ? CLONE_NEWNET : 0), NULL);
1282                 if (pid < 0) {
1283                         if (errno == EINVAL)
1284                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1285                         else
1286                                 log_error("clone() failed: %m");
1287
1288                         goto finish;
1289                 }
1290
1291                 if (pid == 0) {
1292                         /* child */
1293                         const char *home = NULL;
1294                         uid_t uid = (uid_t) -1;
1295                         gid_t gid = (gid_t) -1;
1296                         unsigned n_env = 2;
1297                         const char *envp[] = {
1298                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
1299                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1300                                 NULL, /* TERM */
1301                                 NULL, /* HOME */
1302                                 NULL, /* USER */
1303                                 NULL, /* LOGNAME */
1304                                 NULL, /* container_uuid */
1305                                 NULL, /* LISTEN_FDS */
1306                                 NULL, /* LISTEN_PID */
1307                                 NULL
1308                         };
1309                         char **env_use;
1310                         eventfd_t x;
1311
1312                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1313                         if (envp[n_env])
1314                                 n_env ++;
1315
1316                         close_nointr_nofail(master);
1317                         master = -1;
1318
1319                         close_nointr(STDIN_FILENO);
1320                         close_nointr(STDOUT_FILENO);
1321                         close_nointr(STDERR_FILENO);
1322
1323                         close_nointr_nofail(kmsg_socket_pair[0]);
1324                         kmsg_socket_pair[0] = -1;
1325
1326                         reset_all_signal_handlers();
1327
1328                         assert_se(sigemptyset(&mask) == 0);
1329                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1330
1331                         k = open_terminal(console, O_RDWR);
1332                         if (k != STDIN_FILENO) {
1333                                 if (k >= 0) {
1334                                         close_nointr_nofail(k);
1335                                         k = -EINVAL;
1336                                 }
1337
1338                                 log_error("Failed to open console: %s", strerror(-k));
1339                                 goto child_fail;
1340                         }
1341
1342                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1343                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1344                                 log_error("Failed to duplicate console: %m");
1345                                 goto child_fail;
1346                         }
1347
1348                         if (setsid() < 0) {
1349                                 log_error("setsid() failed: %m");
1350                                 goto child_fail;
1351                         }
1352
1353                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1354                                 log_error("PR_SET_PDEATHSIG failed: %m");
1355                                 goto child_fail;
1356                         }
1357
1358                         /* Mark everything as slave, so that we still
1359                          * receive mounts from the real root, but don't
1360                          * propagate mounts to the real root. */
1361                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1362                                 log_error("MS_SLAVE|MS_REC failed: %m");
1363                                 goto child_fail;
1364                         }
1365
1366                         /* Turn directory into bind mount */
1367                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1368                                 log_error("Failed to make bind mount.");
1369                                 goto child_fail;
1370                         }
1371
1372                         if (arg_read_only)
1373                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1374                                         log_error("Failed to make read-only.");
1375                                         goto child_fail;
1376                                 }
1377
1378                         if (mount_all(arg_directory) < 0)
1379                                 goto child_fail;
1380
1381                         if (copy_devnodes(arg_directory) < 0)
1382                                 goto child_fail;
1383
1384                         if (setup_ptmx(arg_directory) < 0)
1385                                 goto child_fail;
1386
1387                         dev_setup(arg_directory);
1388
1389                         if (setup_dev_console(arg_directory, console) < 0)
1390                                 goto child_fail;
1391
1392                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1393                                 goto child_fail;
1394
1395                         close_nointr_nofail(kmsg_socket_pair[1]);
1396                         kmsg_socket_pair[1] = -1;
1397
1398                         if (setup_boot_id(arg_directory) < 0)
1399                                 goto child_fail;
1400
1401                         if (setup_timezone(arg_directory) < 0)
1402                                 goto child_fail;
1403
1404                         if (setup_resolv_conf(arg_directory) < 0)
1405                                 goto child_fail;
1406
1407                         if (setup_journal(arg_directory) < 0)
1408                                 goto child_fail;
1409
1410                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1411                                 goto child_fail;
1412
1413                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1414                                 goto child_fail;
1415
1416                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
1417                                 goto child_fail;
1418
1419                         if (chdir(arg_directory) < 0) {
1420                                 log_error("chdir(%s) failed: %m", arg_directory);
1421                                 goto child_fail;
1422                         }
1423
1424                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1425                                 log_error("mount(MS_MOVE) failed: %m");
1426                                 goto child_fail;
1427                         }
1428
1429                         if (chroot(".") < 0) {
1430                                 log_error("chroot() failed: %m");
1431                                 goto child_fail;
1432                         }
1433
1434                         if (chdir("/") < 0) {
1435                                 log_error("chdir() failed: %m");
1436                                 goto child_fail;
1437                         }
1438
1439                         umask(0022);
1440
1441                         loopback_setup();
1442
1443                         if (drop_capabilities() < 0) {
1444                                 log_error("drop_capabilities() failed: %m");
1445                                 goto child_fail;
1446                         }
1447
1448                         if (arg_user) {
1449
1450                                 /* Note that this resolves user names
1451                                  * inside the container, and hence
1452                                  * accesses the NSS modules from the
1453                                  * container and not the host. This is
1454                                  * a bit weird... */
1455
1456                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1457                                         log_error("get_user_creds() failed: %m");
1458                                         goto child_fail;
1459                                 }
1460
1461                                 if (mkdir_parents_label(home, 0775) < 0) {
1462                                         log_error("mkdir_parents_label() failed: %m");
1463                                         goto child_fail;
1464                                 }
1465
1466                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1467                                         log_error("mkdir_safe_label() failed: %m");
1468                                         goto child_fail;
1469                                 }
1470
1471                                 if (initgroups((const char*)arg_user, gid) < 0) {
1472                                         log_error("initgroups() failed: %m");
1473                                         goto child_fail;
1474                                 }
1475
1476                                 if (setresgid(gid, gid, gid) < 0) {
1477                                         log_error("setregid() failed: %m");
1478                                         goto child_fail;
1479                                 }
1480
1481                                 if (setresuid(uid, uid, uid) < 0) {
1482                                         log_error("setreuid() failed: %m");
1483                                         goto child_fail;
1484                                 }
1485                         } else {
1486                                 /* Reset everything fully to 0, just in case */
1487
1488                                 if (setgroups(0, NULL) < 0) {
1489                                         log_error("setgroups() failed: %m");
1490                                         goto child_fail;
1491                                 }
1492
1493                                 if (setresgid(0, 0, 0) < 0) {
1494                                         log_error("setregid() failed: %m");
1495                                         goto child_fail;
1496                                 }
1497
1498                                 if (setresuid(0, 0, 0) < 0) {
1499                                         log_error("setreuid() failed: %m");
1500                                         goto child_fail;
1501                                 }
1502                         }
1503
1504                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1505                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1506                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1507                                 log_oom();
1508                                 goto child_fail;
1509                         }
1510
1511                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1512                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1513                                         log_oom();
1514                                         goto child_fail;
1515                                 }
1516                         }
1517
1518                         if (fdset_size(fds) > 0) {
1519                                 k = fdset_cloexec(fds, false);
1520                                 if (k < 0) {
1521                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1522                                         goto child_fail;
1523                                 }
1524
1525                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1526                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1527                                         log_oom();
1528                                         goto child_fail;
1529                                 }
1530                         }
1531
1532                         setup_hostname();
1533
1534                         eventfd_read(sync_fd, &x);
1535                         close_nointr_nofail(sync_fd);
1536                         sync_fd = -1;
1537
1538                         if (!strv_isempty(arg_setenv)) {
1539                                 char **n;
1540
1541                                 n = strv_env_merge(2, envp, arg_setenv);
1542                                 if (!n) {
1543                                         log_oom();
1544                                         goto child_fail;
1545                                 }
1546
1547                                 env_use = n;
1548                         } else
1549                                 env_use = (char**) envp;
1550
1551 #ifdef HAVE_SELINUX
1552                         if (arg_selinux_context)
1553                                 if (setexeccon(arg_selinux_context) < 0)
1554                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
1555 #endif
1556                         if (arg_boot) {
1557                                 char **a;
1558                                 size_t l;
1559
1560                                 /* Automatically search for the init system */
1561
1562                                 l = 1 + argc - optind;
1563                                 a = newa(char*, l + 1);
1564                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1565
1566                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1567                                 execve(a[0], a, env_use);
1568
1569                                 a[0] = (char*) "/lib/systemd/systemd";
1570                                 execve(a[0], a, env_use);
1571
1572                                 a[0] = (char*) "/sbin/init";
1573                                 execve(a[0], a, env_use);
1574                         } else if (argc > optind)
1575                                 execvpe(argv[optind], argv + optind, env_use);
1576                         else {
1577                                 chdir(home ? home : "/root");
1578                                 execle("/bin/bash", "-bash", NULL, env_use);
1579                         }
1580
1581                         log_error("execv() failed: %m");
1582
1583                 child_fail:
1584                         _exit(EXIT_FAILURE);
1585                 }
1586
1587                 fdset_free(fds);
1588                 fds = NULL;
1589
1590                 r = register_machine(pid);
1591                 if (r < 0)
1592                         goto finish;
1593
1594                 eventfd_write(sync_fd, 1);
1595                 close_nointr_nofail(sync_fd);
1596                 sync_fd = -1;
1597
1598                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1599                 if (k < 0) {
1600                         r = EXIT_FAILURE;
1601                         break;
1602                 }
1603
1604                 if (!arg_quiet)
1605                         putc('\n', stdout);
1606
1607                 /* Kill if it is not dead yet anyway */
1608                 terminate_machine(pid);
1609
1610                 /* Redundant, but better safe than sorry */
1611                 kill(pid, SIGKILL);
1612
1613                 k = wait_for_terminate(pid, &status);
1614                 pid = 0;
1615
1616                 if (k < 0) {
1617                         r = EXIT_FAILURE;
1618                         break;
1619                 }
1620
1621                 if (status.si_code == CLD_EXITED) {
1622                         r = status.si_status;
1623                         if (status.si_status != 0) {
1624                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1625                                 break;
1626                         }
1627
1628                         if (!arg_quiet)
1629                                 log_debug("Container %s exited successfully.", arg_machine);
1630                         break;
1631                 } else if (status.si_code == CLD_KILLED &&
1632                            status.si_status == SIGINT) {
1633
1634                         if (!arg_quiet)
1635                                 log_info("Container %s has been shut down.", arg_machine);
1636                         r = 0;
1637                         break;
1638                 } else if (status.si_code == CLD_KILLED &&
1639                            status.si_status == SIGHUP) {
1640
1641                         if (!arg_quiet)
1642                                 log_info("Container %s is being rebooted.", arg_machine);
1643                         continue;
1644                 } else if (status.si_code == CLD_KILLED ||
1645                            status.si_code == CLD_DUMPED) {
1646
1647                         log_error("Container %s terminated by signal %s.", arg_machine,  signal_to_string(status.si_status));
1648                         r = EXIT_FAILURE;
1649                         break;
1650                 } else {
1651                         log_error("Container %s failed due to unknown reason.", arg_machine);
1652                         r = EXIT_FAILURE;
1653                         break;
1654                 }
1655         }
1656
1657 finish:
1658         if (pid > 0)
1659                 kill(pid, SIGKILL);
1660
1661         free(arg_directory);
1662         free(arg_machine);
1663         free(arg_setenv);
1664
1665         return r;
1666 }