chiark / gitweb /
646c6c02f387063e03e91c664ada0abe6331b27d
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44 #ifdef HAVE_SELINUX
45 #include <selinux/selinux.h>
46 #endif
47
48 #include "sd-daemon.h"
49 #include "sd-bus.h"
50 #include "sd-id128.h"
51 #include "log.h"
52 #include "util.h"
53 #include "mkdir.h"
54 #include "macro.h"
55 #include "audit.h"
56 #include "missing.h"
57 #include "cgroup-util.h"
58 #include "strv.h"
59 #include "path-util.h"
60 #include "loopback-setup.h"
61 #include "dev-setup.h"
62 #include "fdset.h"
63 #include "build.h"
64 #include "fileio.h"
65 #include "bus-util.h"
66 #include "bus-error.h"
67 #include "ptyfwd.h"
68 #include "bus-kernel.h"
69 #include "env-util.h"
70 #include "def.h"
71
/* Journal linking mode, selected with --link-journal= (or -j) on the
 * command line. The enumerators map 1:1 to the mode strings accepted by
 * parse_argv(): "no", "auto", "host" and "guest". */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
78
79 static char *arg_directory = NULL;
80 static char *arg_user = NULL;
81 static sd_id128_t arg_uuid = {};
82 static char *arg_machine = NULL;
83 static char *arg_selinux_context = NULL;
84 static char *arg_selinux_apifs_context = NULL;
85 static const char *arg_slice = NULL;
86 static bool arg_private_network = false;
87 static bool arg_read_only = false;
88 static bool arg_boot = false;
89 static LinkJournal arg_link_journal = LINK_AUTO;
90 static uint64_t arg_retain =
91         (1ULL << CAP_CHOWN) |
92         (1ULL << CAP_DAC_OVERRIDE) |
93         (1ULL << CAP_DAC_READ_SEARCH) |
94         (1ULL << CAP_FOWNER) |
95         (1ULL << CAP_FSETID) |
96         (1ULL << CAP_IPC_OWNER) |
97         (1ULL << CAP_KILL) |
98         (1ULL << CAP_LEASE) |
99         (1ULL << CAP_LINUX_IMMUTABLE) |
100         (1ULL << CAP_NET_BIND_SERVICE) |
101         (1ULL << CAP_NET_BROADCAST) |
102         (1ULL << CAP_NET_RAW) |
103         (1ULL << CAP_SETGID) |
104         (1ULL << CAP_SETFCAP) |
105         (1ULL << CAP_SETPCAP) |
106         (1ULL << CAP_SETUID) |
107         (1ULL << CAP_SYS_ADMIN) |
108         (1ULL << CAP_SYS_CHROOT) |
109         (1ULL << CAP_SYS_NICE) |
110         (1ULL << CAP_SYS_PTRACE) |
111         (1ULL << CAP_SYS_TTY_CONFIG) |
112         (1ULL << CAP_SYS_RESOURCE) |
113         (1ULL << CAP_SYS_BOOT) |
114         (1ULL << CAP_AUDIT_WRITE) |
115         (1ULL << CAP_AUDIT_CONTROL) |
116         (1ULL << CAP_MKNOD);
117 static char **arg_bind = NULL;
118 static char **arg_bind_ro = NULL;
119 static char **arg_setenv = NULL;
120 static bool arg_quiet = false;
121
122 static int help(void) {
123
124         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
125                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
126                "  -h --help                 Show this help\n"
127                "     --version              Print version string\n"
128                "  -D --directory=NAME       Root directory for the container\n"
129                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
130                "  -u --user=USER            Run the command under specified user or uid\n"
131                "     --uuid=UUID            Set a specific machine UUID for the container\n"
132                "  -M --machine=NAME         Set the machine name for the container\n"
133                "  -S --slice=SLICE          Place the container in the specified slice\n"
134                "  -Z --selinux-context=SECLABEL\n"
135                "                            Set the SELinux security context to be used by\n"
136                "                            processes in the container\n"
137                "  -L --selinux-apifs-context=SECLABEL\n"
138                "                            Set the SELinux security context to be used by\n"
139                "                            API/tmpfs file systems in the container\n"
140                "     --private-network      Disable network in container\n"
141                "     --read-only            Mount the root directory read-only\n"
142                "     --capability=CAP       In addition to the default, retain specified\n"
143                "                            capability\n"
144                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
145                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
146                "  -j                        Equivalent to --link-journal=host\n"
147                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
148                "                            the container\n"
149                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
150                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
151                "  -q --quiet                Do not show status information\n",
152                program_invocation_short_name);
153
154         return 0;
155 }
156
157 static int parse_argv(int argc, char *argv[]) {
158
159         enum {
160                 ARG_VERSION = 0x100,
161                 ARG_PRIVATE_NETWORK,
162                 ARG_UUID,
163                 ARG_READ_ONLY,
164                 ARG_CAPABILITY,
165                 ARG_DROP_CAPABILITY,
166                 ARG_LINK_JOURNAL,
167                 ARG_BIND,
168                 ARG_BIND_RO,
169                 ARG_SETENV,
170         };
171
172         static const struct option options[] = {
173                 { "help",                  no_argument,       NULL, 'h'                 },
174                 { "version",               no_argument,       NULL, ARG_VERSION         },
175                 { "directory",             required_argument, NULL, 'D'                 },
176                 { "user",                  required_argument, NULL, 'u'                 },
177                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK },
178                 { "boot",                  no_argument,       NULL, 'b'                 },
179                 { "uuid",                  required_argument, NULL, ARG_UUID            },
180                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY       },
181                 { "capability",            required_argument, NULL, ARG_CAPABILITY      },
182                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY },
183                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL    },
184                 { "bind",                  required_argument, NULL, ARG_BIND            },
185                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO         },
186                 { "machine",               required_argument, NULL, 'M'                 },
187                 { "slice",                 required_argument, NULL, 'S'                 },
188                 { "setenv",                required_argument, NULL, ARG_SETENV          },
189                 { "selinux-context",       required_argument, NULL, 'Z'                 },
190                 { "selinux-apifs-context", required_argument, NULL, 'L'                 },
191                 { "quiet",                 no_argument,       NULL, 'q'                 },
192                 {}
193         };
194
195         int c, r;
196
197         assert(argc >= 0);
198         assert(argv);
199
200         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
201
202                 switch (c) {
203
204                 case 'h':
205                         return help();
206
207                 case ARG_VERSION:
208                         puts(PACKAGE_STRING);
209                         puts(SYSTEMD_FEATURES);
210                         return 0;
211
212                 case 'D':
213                         free(arg_directory);
214                         arg_directory = canonicalize_file_name(optarg);
215                         if (!arg_directory) {
216                                 log_error("Invalid root directory: %m");
217                                 return -ENOMEM;
218                         }
219
220                         break;
221
222                 case 'u':
223                         free(arg_user);
224                         arg_user = strdup(optarg);
225                         if (!arg_user)
226                                 return log_oom();
227
228                         break;
229
230                 case ARG_PRIVATE_NETWORK:
231                         arg_private_network = true;
232                         break;
233
234                 case 'b':
235                         arg_boot = true;
236                         break;
237
238                 case ARG_UUID:
239                         r = sd_id128_from_string(optarg, &arg_uuid);
240                         if (r < 0) {
241                                 log_error("Invalid UUID: %s", optarg);
242                                 return r;
243                         }
244                         break;
245
246                 case 'S':
247                         arg_slice = strdup(optarg);
248                         if (!arg_slice)
249                                 return log_oom();
250
251                         break;
252
253                 case 'M':
254                         if (!hostname_is_valid(optarg)) {
255                                 log_error("Invalid machine name: %s", optarg);
256                                 return -EINVAL;
257                         }
258
259                         free(arg_machine);
260                         arg_machine = strdup(optarg);
261                         if (!arg_machine)
262                                 return log_oom();
263
264                         break;
265
266                 case 'Z':
267                         arg_selinux_context = optarg;
268                         break;
269
270                 case 'L':
271                         arg_selinux_apifs_context = optarg;
272                         break;
273
274                 case ARG_READ_ONLY:
275                         arg_read_only = true;
276                         break;
277
278                 case ARG_CAPABILITY:
279                 case ARG_DROP_CAPABILITY: {
280                         char *state, *word;
281                         size_t length;
282
283                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
284                                 cap_value_t cap;
285                                 char *t;
286
287                                 t = strndup(word, length);
288                                 if (!t)
289                                         return log_oom();
290
291                                 if (cap_from_name(t, &cap) < 0) {
292                                         log_error("Failed to parse capability %s.", t);
293                                         free(t);
294                                         return -EINVAL;
295                                 }
296
297                                 free(t);
298
299                                 if (c == ARG_CAPABILITY)
300                                         arg_retain |= 1ULL << (uint64_t) cap;
301                                 else
302                                         arg_retain &= ~(1ULL << (uint64_t) cap);
303                         }
304
305                         break;
306                 }
307
308                 case 'j':
309                         arg_link_journal = LINK_GUEST;
310                         break;
311
312                 case ARG_LINK_JOURNAL:
313                         if (streq(optarg, "auto"))
314                                 arg_link_journal = LINK_AUTO;
315                         else if (streq(optarg, "no"))
316                                 arg_link_journal = LINK_NO;
317                         else if (streq(optarg, "guest"))
318                                 arg_link_journal = LINK_GUEST;
319                         else if (streq(optarg, "host"))
320                                 arg_link_journal = LINK_HOST;
321                         else {
322                                 log_error("Failed to parse link journal mode %s", optarg);
323                                 return -EINVAL;
324                         }
325
326                         break;
327
328                 case ARG_BIND:
329                 case ARG_BIND_RO: {
330                         _cleanup_free_ char *a = NULL, *b = NULL;
331                         char *e;
332                         char ***x;
333
334                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
335
336                         e = strchr(optarg, ':');
337                         if (e) {
338                                 a = strndup(optarg, e - optarg);
339                                 b = strdup(e + 1);
340                         } else {
341                                 a = strdup(optarg);
342                                 b = strdup(optarg);
343                         }
344
345                         if (!a || !b)
346                                 return log_oom();
347
348                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
349                                 log_error("Invalid bind mount specification: %s", optarg);
350                                 return -EINVAL;
351                         }
352
353                         r = strv_extend(x, a);
354                         if (r < 0)
355                                 return log_oom();
356
357                         r = strv_extend(x, b);
358                         if (r < 0)
359                                 return log_oom();
360
361                         break;
362                 }
363
364                 case ARG_SETENV: {
365                         char **n;
366
367                         if (!env_assignment_is_valid(optarg)) {
368                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
369                                 return -EINVAL;
370                         }
371
372                         n = strv_env_set(arg_setenv, optarg);
373                         if (!n)
374                                 return log_oom();
375
376                         strv_free(arg_setenv);
377                         arg_setenv = n;
378                         break;
379                 }
380
381                 case 'q':
382                         arg_quiet = true;
383                         break;
384
385                 case '?':
386                         return -EINVAL;
387
388                 default:
389                         assert_not_reached("Unhandled option");
390                 }
391         }
392
393         return 1;
394 }
395
396 static int mount_all(const char *dest) {
397
398         typedef struct MountPoint {
399                 const char *what;
400                 const char *where;
401                 const char *type;
402                 const char *options;
403                 unsigned long flags;
404                 bool fatal;
405         } MountPoint;
406
407         static const MountPoint mount_table[] = {
408                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
409                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
410                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
411                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
412                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
413                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
414                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
415                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
416 #ifdef HAVE_SELINUX
417                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
418                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
419 #endif
420         };
421
422         unsigned k;
423         int r = 0;
424
425         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
426                 _cleanup_free_ char *where = NULL;
427 #ifdef HAVE_SELINUX
428                 _cleanup_free_ char *options = NULL;
429 #endif
430                 const char *o;
431                 int t;
432
433                 where = strjoin(dest, "/", mount_table[k].where, NULL);
434                 if (!where)
435                         return log_oom();
436
437                 t = path_is_mount_point(where, true);
438                 if (t < 0) {
439                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
440
441                         if (r == 0)
442                                 r = t;
443
444                         continue;
445                 }
446
447                 /* Skip this entry if it is not a remount. */
448                 if (mount_table[k].what && t > 0)
449                         continue;
450
451                 mkdir_p(where, 0755);
452
453 #ifdef HAVE_SELINUX
454                 if (arg_selinux_apifs_context &&
455                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
456                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
457                         if (!options)
458                                 return log_oom();
459
460                         o = options;
461                 } else
462 #endif
463                         o = mount_table[k].options;
464
465
466                 if (mount(mount_table[k].what,
467                           where,
468                           mount_table[k].type,
469                           mount_table[k].flags,
470                           o) < 0 &&
471                     mount_table[k].fatal) {
472
473                         log_error("mount(%s) failed: %m", where);
474
475                         if (r == 0)
476                                 r = -errno;
477                 }
478         }
479
480         return r;
481 }
482
483 static int mount_binds(const char *dest, char **l, unsigned long flags) {
484         char **x, **y;
485
486         STRV_FOREACH_PAIR(x, y, l) {
487                 char *where;
488                 struct stat source_st, dest_st;
489                 int r;
490
491                 if (stat(*x, &source_st) < 0) {
492                         log_error("failed to stat %s: %m", *x);
493                         return -errno;
494                 }
495
496                 where = strappenda(dest, *y);
497                 r = stat(where, &dest_st);
498                 if (r == 0) {
499                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
500                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
501                                                 *x, where);
502                                 return -EINVAL;
503                         }
504                 } else if (errno == ENOENT) {
505                         r = mkdir_parents_label(where, 0755);
506                         if (r < 0) {
507                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
508                                 return r;
509                         }
510                 } else {
511                         log_error("Failed to bind mount %s: %s", *x, strerror(errno));
512                         return -errno;
513                 }
514                 /* Create the mount point, but be conservative -- refuse to create block
515                 * and char devices. */
516                 if (S_ISDIR(source_st.st_mode))
517                         mkdir_label(where, 0755);
518                 else if (S_ISFIFO(source_st.st_mode))
519                         mkfifo(where, 0644);
520                 else if (S_ISSOCK(source_st.st_mode))
521                         mknod(where, 0644 | S_IFSOCK, 0);
522                 else if (S_ISREG(source_st.st_mode))
523                         touch(where);
524                 else {
525                         log_error("Refusing to create mountpoint for file: %s", *x);
526                         return -ENOTSUP;
527                 }
528
529                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
530                         log_error("mount(%s) failed: %m", where);
531                         return -errno;
532                 }
533
534                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
535                         log_error("mount(%s) failed: %m", where);
536                         return -errno;
537                 }
538         }
539
540         return 0;
541 }
542
543 static int setup_timezone(const char *dest) {
544         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
545         char *z, *y;
546         int r;
547
548         assert(dest);
549
550         /* Fix the timezone, if possible */
551         r = readlink_malloc("/etc/localtime", &p);
552         if (r < 0) {
553                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
554                 return 0;
555         }
556
557         z = path_startswith(p, "../usr/share/zoneinfo/");
558         if (!z)
559                 z = path_startswith(p, "/usr/share/zoneinfo/");
560         if (!z) {
561                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
562                 return 0;
563         }
564
565         where = strappend(dest, "/etc/localtime");
566         if (!where)
567                 return log_oom();
568
569         r = readlink_malloc(where, &q);
570         if (r >= 0) {
571                 y = path_startswith(q, "../usr/share/zoneinfo/");
572                 if (!y)
573                         y = path_startswith(q, "/usr/share/zoneinfo/");
574
575
576                 /* Already pointing to the right place? Then do nothing .. */
577                 if (y && streq(y, z))
578                         return 0;
579         }
580
581         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
582         if (!check)
583                 return log_oom();
584
585         if (access(check, F_OK) < 0) {
586                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
587                 return 0;
588         }
589
590         what = strappend("../usr/share/zoneinfo/", z);
591         if (!what)
592                 return log_oom();
593
594         unlink(where);
595         if (symlink(what, where) < 0) {
596                 log_error("Failed to correct timezone of container: %m");
597                 return 0;
598         }
599
600         return 0;
601 }
602
603 static int setup_resolv_conf(const char *dest) {
604         char _cleanup_free_ *where = NULL;
605
606         assert(dest);
607
608         if (arg_private_network)
609                 return 0;
610
611         /* Fix resolv.conf, if possible */
612         where = strappend(dest, "/etc/resolv.conf");
613         if (!where)
614                 return log_oom();
615
616         /* We don't really care for the results of this really. If it
617          * fails, it fails, but meh... */
618         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
619
620         return 0;
621 }
622
623 static int setup_boot_id(const char *dest) {
624         _cleanup_free_ char *from = NULL, *to = NULL;
625         sd_id128_t rnd;
626         char as_uuid[37];
627         int r;
628
629         assert(dest);
630
631         /* Generate a new randomized boot ID, so that each boot-up of
632          * the container gets a new one */
633
634         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
635         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
636         if (!from || !to)
637                 return log_oom();
638
639         r = sd_id128_randomize(&rnd);
640         if (r < 0) {
641                 log_error("Failed to generate random boot id: %s", strerror(-r));
642                 return r;
643         }
644
645         snprintf(as_uuid, sizeof(as_uuid),
646                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
647                  SD_ID128_FORMAT_VAL(rnd));
648         char_array_0(as_uuid);
649
650         r = write_string_file(from, as_uuid);
651         if (r < 0) {
652                 log_error("Failed to write boot id: %s", strerror(-r));
653                 return r;
654         }
655
656         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
657                 log_error("Failed to bind mount boot id: %m");
658                 r = -errno;
659         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
660                 log_warning("Failed to make boot id read-only: %m");
661
662         unlink(from);
663         return r;
664 }
665
666 static int copy_devnodes(const char *dest) {
667
668         static const char devnodes[] =
669                 "null\0"
670                 "zero\0"
671                 "full\0"
672                 "random\0"
673                 "urandom\0"
674                 "tty\0";
675
676         const char *d;
677         int r = 0;
678         _cleanup_umask_ mode_t u;
679
680         assert(dest);
681
682         u = umask(0000);
683
684         NULSTR_FOREACH(d, devnodes) {
685                 _cleanup_free_ char *from = NULL, *to = NULL;
686                 struct stat st;
687
688                 from = strappend("/dev/", d);
689                 to = strjoin(dest, "/dev/", d, NULL);
690                 if (!from || !to)
691                         return log_oom();
692
693                 if (stat(from, &st) < 0) {
694
695                         if (errno != ENOENT) {
696                                 log_error("Failed to stat %s: %m", from);
697                                 return -errno;
698                         }
699
700                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
701
702                         log_error("%s is not a char or block device, cannot copy", from);
703                         return -EIO;
704
705                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
706
707                         log_error("mknod(%s) failed: %m", dest);
708                         return  -errno;
709                 }
710         }
711
712         return r;
713 }
714
715 static int setup_ptmx(const char *dest) {
716         _cleanup_free_ char *p = NULL;
717
718         p = strappend(dest, "/dev/ptmx");
719         if (!p)
720                 return log_oom();
721
722         if (symlink("pts/ptmx", p) < 0) {
723                 log_error("Failed to create /dev/ptmx symlink: %m");
724                 return -errno;
725         }
726
727         return 0;
728 }
729
730 static int setup_dev_console(const char *dest, const char *console) {
731         struct stat st;
732         _cleanup_free_ char *to = NULL;
733         int r;
734         _cleanup_umask_ mode_t u;
735
736         assert(dest);
737         assert(console);
738
739         u = umask(0000);
740
741         if (stat(console, &st) < 0) {
742                 log_error("Failed to stat %s: %m", console);
743                 return -errno;
744
745         } else if (!S_ISCHR(st.st_mode)) {
746                 log_error("/dev/console is not a char device");
747                 return -EIO;
748         }
749
750         r = chmod_and_chown(console, 0600, 0, 0);
751         if (r < 0) {
752                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
753                 return r;
754         }
755
756         if (asprintf(&to, "%s/dev/console", dest) < 0)
757                 return log_oom();
758
759         /* We need to bind mount the right tty to /dev/console since
760          * ptys can only exist on pts file systems. To have something
761          * to bind mount things on we create a device node first, that
762          * has the right major/minor (note that the major minor
763          * doesn't actually matter here, since we mount it over
764          * anyway). */
765
766         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
767                 log_error("mknod() for /dev/console failed: %m");
768                 return -errno;
769         }
770
771         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
772                 log_error("Bind mount for /dev/console failed: %m");
773                 return -errno;
774         }
775
776         return 0;
777 }
778
779 static int setup_kmsg(const char *dest, int kmsg_socket) {
780         _cleanup_free_ char *from = NULL, *to = NULL;
781         int r, fd, k;
782         _cleanup_umask_ mode_t u;
783         union {
784                 struct cmsghdr cmsghdr;
785                 uint8_t buf[CMSG_SPACE(sizeof(int))];
786         } control = {};
787         struct msghdr mh = {
788                 .msg_control = &control,
789                 .msg_controllen = sizeof(control),
790         };
791         struct cmsghdr *cmsg;
792
793         assert(dest);
794         assert(kmsg_socket >= 0);
795
796         u = umask(0000);
797
798         /* We create the kmsg FIFO as /dev/kmsg, but immediately
799          * delete it after bind mounting it to /proc/kmsg. While FIFOs
800          * on the reading side behave very similar to /proc/kmsg,
801          * their writing side behaves differently from /dev/kmsg in
802          * that writing blocks when nothing is reading. In order to
803          * avoid any problems with containers deadlocking due to this
804          * we simply make /dev/kmsg unavailable to the container. */
805         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
806             asprintf(&to, "%s/proc/kmsg", dest) < 0)
807                 return log_oom();
808
809         if (mkfifo(from, 0600) < 0) {
810                 log_error("mkfifo() for /dev/kmsg failed: %m");
811                 return -errno;
812         }
813
814         r = chmod_and_chown(from, 0600, 0, 0);
815         if (r < 0) {
816                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
817                 return r;
818         }
819
820         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
821                 log_error("Bind mount for /proc/kmsg failed: %m");
822                 return -errno;
823         }
824
825         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
826         if (fd < 0) {
827                 log_error("Failed to open fifo: %m");
828                 return -errno;
829         }
830
831         cmsg = CMSG_FIRSTHDR(&mh);
832         cmsg->cmsg_level = SOL_SOCKET;
833         cmsg->cmsg_type = SCM_RIGHTS;
834         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
835         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
836
837         mh.msg_controllen = cmsg->cmsg_len;
838
839         /* Store away the fd in the socket, so that it stays open as
840          * long as we run the child */
841         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
842         close_nointr_nofail(fd);
843
844         if (k < 0) {
845                 log_error("Failed to send FIFO fd: %m");
846                 return -errno;
847         }
848
849         /* And now make the FIFO unavailable as /dev/kmsg... */
850         unlink(from);
851         return 0;
852 }
853
854 static int setup_hostname(void) {
855
856         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
857                 return -errno;
858
859         return 0;
860 }
861
862 static int setup_journal(const char *directory) {
863         sd_id128_t machine_id, this_id;
864         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
865         char *id;
866         int r;
867
868         p = strappend(directory, "/etc/machine-id");
869         if (!p)
870                 return log_oom();
871
872         r = read_one_line_file(p, &b);
873         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
874                 return 0;
875         else if (r < 0) {
876                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
877                 return r;
878         }
879
880         id = strstrip(b);
881         if (isempty(id) && arg_link_journal == LINK_AUTO)
882                 return 0;
883
884         /* Verify validity */
885         r = sd_id128_from_string(id, &machine_id);
886         if (r < 0) {
887                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
888                 return r;
889         }
890
891         r = sd_id128_get_machine(&this_id);
892         if (r < 0) {
893                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
894                 return r;
895         }
896
897         if (sd_id128_equal(machine_id, this_id)) {
898                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
899                          "Host and machine ids are equal (%s): refusing to link journals", id);
900                 if (arg_link_journal == LINK_AUTO)
901                         return 0;
902                 return
903                         -EEXIST;
904         }
905
906         if (arg_link_journal == LINK_NO)
907                 return 0;
908
909         free(p);
910         p = strappend("/var/log/journal/", id);
911         q = strjoin(directory, "/var/log/journal/", id, NULL);
912         if (!p || !q)
913                 return log_oom();
914
915         if (path_is_mount_point(p, false) > 0) {
916                 if (arg_link_journal != LINK_AUTO) {
917                         log_error("%s: already a mount point, refusing to use for journal", p);
918                         return -EEXIST;
919                 }
920
921                 return 0;
922         }
923
924         if (path_is_mount_point(q, false) > 0) {
925                 if (arg_link_journal != LINK_AUTO) {
926                         log_error("%s: already a mount point, refusing to use for journal", q);
927                         return -EEXIST;
928                 }
929
930                 return 0;
931         }
932
933         r = readlink_and_make_absolute(p, &d);
934         if (r >= 0) {
935                 if ((arg_link_journal == LINK_GUEST ||
936                      arg_link_journal == LINK_AUTO) &&
937                     path_equal(d, q)) {
938
939                         r = mkdir_p(q, 0755);
940                         if (r < 0)
941                                 log_warning("failed to create directory %s: %m", q);
942                         return 0;
943                 }
944
945                 if (unlink(p) < 0) {
946                         log_error("Failed to remove symlink %s: %m", p);
947                         return -errno;
948                 }
949         } else if (r == -EINVAL) {
950
951                 if (arg_link_journal == LINK_GUEST &&
952                     rmdir(p) < 0) {
953
954                         if (errno == ENOTDIR) {
955                                 log_error("%s already exists and is neither a symlink nor a directory", p);
956                                 return r;
957                         } else {
958                                 log_error("Failed to remove %s: %m", p);
959                                 return -errno;
960                         }
961                 }
962         } else if (r != -ENOENT) {
963                 log_error("readlink(%s) failed: %m", p);
964                 return r;
965         }
966
967         if (arg_link_journal == LINK_GUEST) {
968
969                 if (symlink(q, p) < 0) {
970                         log_error("Failed to symlink %s to %s: %m", q, p);
971                         return -errno;
972                 }
973
974                 r = mkdir_p(q, 0755);
975                 if (r < 0)
976                         log_warning("failed to create directory %s: %m", q);
977                 return 0;
978         }
979
980         if (arg_link_journal == LINK_HOST) {
981                 r = mkdir_p(p, 0755);
982                 if (r < 0) {
983                         log_error("Failed to create %s: %m", p);
984                         return r;
985                 }
986
987         } else if (access(p, F_OK) < 0)
988                 return 0;
989
990         if (dir_is_empty(q) == 0) {
991                 log_error("%s not empty.", q);
992                 return -ENOTEMPTY;
993         }
994
995         r = mkdir_p(q, 0755);
996         if (r < 0) {
997                 log_error("Failed to create %s: %m", q);
998                 return r;
999         }
1000
1001         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1002                 log_error("Failed to bind mount journal from host into guest: %m");
1003                 return -errno;
1004         }
1005
1006         return 0;
1007 }
1008
1009 static int setup_kdbus(const char *dest, const char *path) {
1010         const char *p;
1011
1012         if (!path)
1013                 return 0;
1014
1015         p = strappenda(dest, "/dev/kdbus");
1016         if (mkdir(p, 0755) < 0) {
1017                 log_error("Failed to create kdbus path: %m");
1018                 return  -errno;
1019         }
1020
1021         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1022                 log_error("Failed to mount kdbus domain path: %m");
1023                 return -errno;
1024         }
1025
1026         return 0;
1027 }
1028
1029 static int drop_capabilities(void) {
1030         return capability_bounding_set_drop(~arg_retain, false);
1031 }
1032
1033 static int register_machine(pid_t pid) {
1034         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1035         _cleanup_bus_unref_ sd_bus *bus = NULL;
1036         int r;
1037
1038         r = sd_bus_default_system(&bus);
1039         if (r < 0) {
1040                 log_error("Failed to open system bus: %s", strerror(-r));
1041                 return r;
1042         }
1043
1044         r = sd_bus_call_method(
1045                         bus,
1046                         "org.freedesktop.machine1",
1047                         "/org/freedesktop/machine1",
1048                         "org.freedesktop.machine1.Manager",
1049                         "CreateMachine",
1050                         &error,
1051                         NULL,
1052                         "sayssusa(sv)",
1053                         arg_machine,
1054                         SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1055                         "nspawn",
1056                         "container",
1057                         (uint32_t) pid,
1058                         strempty(arg_directory),
1059                         !isempty(arg_slice), "Slice", "s", arg_slice);
1060         if (r < 0) {
1061                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1062                 return r;
1063         }
1064
1065         return 0;
1066 }
1067
1068 static int terminate_machine(pid_t pid) {
1069         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1070         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1071         _cleanup_bus_unref_ sd_bus *bus = NULL;
1072         const char *path;
1073         int r;
1074
1075         r = sd_bus_default_system(&bus);
1076         if (r < 0) {
1077                 log_error("Failed to open system bus: %s", strerror(-r));
1078                 return r;
1079         }
1080
1081         r = sd_bus_call_method(
1082                         bus,
1083                         "org.freedesktop.machine1",
1084                         "/org/freedesktop/machine1",
1085                         "org.freedesktop.machine1.Manager",
1086                         "GetMachineByPID",
1087                         &error,
1088                         &reply,
1089                         "u",
1090                         (uint32_t) pid);
1091         if (r < 0) {
1092                 /* Note that the machine might already have been
1093                  * cleaned up automatically, hence don't consider it a
1094                  * failure if we cannot get the machine object. */
1095                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1096                 return 0;
1097         }
1098
1099         r = sd_bus_message_read(reply, "o", &path);
1100         if (r < 0)
1101                 return bus_log_parse_error(r);
1102
1103         r = sd_bus_call_method(
1104                         bus,
1105                         "org.freedesktop.machine1",
1106                         path,
1107                         "org.freedesktop.machine1.Machine",
1108                         "Terminate",
1109                         &error,
1110                         NULL,
1111                         NULL);
1112         if (r < 0) {
1113                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1114                 return 0;
1115         }
1116
1117         return 0;
1118 }
1119
/* Probes for the audit subsystem by trying to open a NETLINK_AUDIT
 * socket. Returns true if the socket could be created (it is closed
 * again right away), false otherwise. */
static bool audit_enabled(void) {
        int probe;

        probe = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
        if (probe < 0)
                return false;

        close_nointr_nofail(probe);
        return true;
}
1130
1131 int main(int argc, char *argv[]) {
1132         pid_t pid = 0;
1133         int r = EXIT_FAILURE, k;
1134         _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1135         int n_fd_passed;
1136         const char *console = NULL;
1137         sigset_t mask;
1138         _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1139         _cleanup_fdset_free_ FDSet *fds = NULL;
1140         _cleanup_free_ char *kdbus_domain = NULL;
1141         const char *ns;
1142
1143         log_parse_environment();
1144         log_open();
1145
1146         k = parse_argv(argc, argv);
1147         if (k < 0)
1148                 goto finish;
1149         else if (k == 0) {
1150                 r = EXIT_SUCCESS;
1151                 goto finish;
1152         }
1153
1154         if (arg_directory) {
1155                 char *p;
1156
1157                 p = path_make_absolute_cwd(arg_directory);
1158                 free(arg_directory);
1159                 arg_directory = p;
1160         } else
1161                 arg_directory = get_current_dir_name();
1162
1163         if (!arg_directory) {
1164                 log_error("Failed to determine path, please use -D.");
1165                 goto finish;
1166         }
1167
1168         path_kill_slashes(arg_directory);
1169
1170         if (!arg_machine) {
1171                 arg_machine = strdup(basename(arg_directory));
1172                 if (!arg_machine) {
1173                         log_oom();
1174                         goto finish;
1175                 }
1176
1177                 hostname_cleanup(arg_machine, false);
1178                 if (isempty(arg_machine)) {
1179                         log_error("Failed to determine machine name automatically, please use -M.");
1180                         goto finish;
1181                 }
1182         }
1183
1184         if (geteuid() != 0) {
1185                 log_error("Need to be root.");
1186                 goto finish;
1187         }
1188
1189         if (sd_booted() <= 0) {
1190                 log_error("Not running on a systemd system.");
1191                 goto finish;
1192         }
1193
1194         if (arg_boot && audit_enabled()) {
1195                 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1196                             "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1197                             "line before using systemd-nspawn. Sleeping for 5s...\n");
1198                 sleep(5);
1199         }
1200
1201         if (path_equal(arg_directory, "/")) {
1202                 log_error("Spawning container on root directory not supported.");
1203                 goto finish;
1204         }
1205
1206         if (arg_boot && path_is_os_tree(arg_directory) <= 0) {
1207                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1208                 goto finish;
1209         }
1210
1211         log_close();
1212         n_fd_passed = sd_listen_fds(false);
1213         if (n_fd_passed > 0) {
1214                 k = fdset_new_listen_fds(&fds, false);
1215                 if (k < 0) {
1216                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1217                         goto finish;
1218                 }
1219         }
1220         fdset_close_others(fds);
1221         log_open();
1222
1223         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1224         if (master < 0) {
1225                 log_error("Failed to acquire pseudo tty: %m");
1226                 goto finish;
1227         }
1228
1229         console = ptsname(master);
1230         if (!console) {
1231                 log_error("Failed to determine tty name: %m");
1232                 goto finish;
1233         }
1234
1235         if (!arg_quiet)
1236                 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1237
1238         if (unlockpt(master) < 0) {
1239                 log_error("Failed to unlock tty: %m");
1240                 goto finish;
1241         }
1242
1243         ns = strappenda("machine-", arg_machine);
1244         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
1245         if (r < 0)
1246                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1247         else
1248                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1249
1250         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1251                 log_error("Failed to create kmsg socket pair: %m");
1252                 goto finish;
1253         }
1254
1255         sd_notify(0, "READY=1");
1256
1257         assert_se(sigemptyset(&mask) == 0);
1258         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1259         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1260
1261         for (;;) {
1262                 siginfo_t status;
1263
1264                 sync_fd = eventfd(0, EFD_CLOEXEC);
1265                 if (sync_fd < 0) {
1266                         log_error("Failed to create event fd: %m");
1267                         goto finish;
1268                 }
1269
1270                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1271                 if (pid < 0) {
1272                         if (errno == EINVAL)
1273                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1274                         else
1275                                 log_error("clone() failed: %m");
1276
1277                         goto finish;
1278                 }
1279
1280                 if (pid == 0) {
1281                         /* child */
1282                         const char *home = NULL;
1283                         uid_t uid = (uid_t) -1;
1284                         gid_t gid = (gid_t) -1;
1285                         unsigned n_env = 2;
1286                         const char *envp[] = {
1287                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
1288                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1289                                 NULL, /* TERM */
1290                                 NULL, /* HOME */
1291                                 NULL, /* USER */
1292                                 NULL, /* LOGNAME */
1293                                 NULL, /* container_uuid */
1294                                 NULL, /* LISTEN_FDS */
1295                                 NULL, /* LISTEN_PID */
1296                                 NULL
1297                         };
1298                         char **env_use;
1299                         eventfd_t x;
1300
1301                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1302                         if (envp[n_env])
1303                                 n_env ++;
1304
1305                         close_nointr_nofail(master);
1306                         master = -1;
1307
1308                         close_nointr(STDIN_FILENO);
1309                         close_nointr(STDOUT_FILENO);
1310                         close_nointr(STDERR_FILENO);
1311
1312                         close_nointr_nofail(kmsg_socket_pair[0]);
1313                         kmsg_socket_pair[0] = -1;
1314
1315                         reset_all_signal_handlers();
1316
1317                         assert_se(sigemptyset(&mask) == 0);
1318                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1319
1320                         k = open_terminal(console, O_RDWR);
1321                         if (k != STDIN_FILENO) {
1322                                 if (k >= 0) {
1323                                         close_nointr_nofail(k);
1324                                         k = -EINVAL;
1325                                 }
1326
1327                                 log_error("Failed to open console: %s", strerror(-k));
1328                                 goto child_fail;
1329                         }
1330
1331                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1332                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1333                                 log_error("Failed to duplicate console: %m");
1334                                 goto child_fail;
1335                         }
1336
1337                         if (setsid() < 0) {
1338                                 log_error("setsid() failed: %m");
1339                                 goto child_fail;
1340                         }
1341
1342                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1343                                 log_error("PR_SET_PDEATHSIG failed: %m");
1344                                 goto child_fail;
1345                         }
1346
1347                         /* Mark everything as slave, so that we still
1348                          * receive mounts from the real root, but don't
1349                          * propagate mounts to the real root. */
1350                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1351                                 log_error("MS_SLAVE|MS_REC failed: %m");
1352                                 goto child_fail;
1353                         }
1354
1355                         /* Turn directory into bind mount */
1356                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1357                                 log_error("Failed to make bind mount.");
1358                                 goto child_fail;
1359                         }
1360
1361                         if (arg_read_only)
1362                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1363                                         log_error("Failed to make read-only.");
1364                                         goto child_fail;
1365                                 }
1366
1367                         if (mount_all(arg_directory) < 0)
1368                                 goto child_fail;
1369
1370                         if (copy_devnodes(arg_directory) < 0)
1371                                 goto child_fail;
1372
1373                         if (setup_ptmx(arg_directory) < 0)
1374                                 goto child_fail;
1375
1376                         dev_setup(arg_directory);
1377
1378                         if (setup_dev_console(arg_directory, console) < 0)
1379                                 goto child_fail;
1380
1381                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1382                                 goto child_fail;
1383
1384                         close_nointr_nofail(kmsg_socket_pair[1]);
1385                         kmsg_socket_pair[1] = -1;
1386
1387                         if (setup_boot_id(arg_directory) < 0)
1388                                 goto child_fail;
1389
1390                         if (setup_timezone(arg_directory) < 0)
1391                                 goto child_fail;
1392
1393                         if (setup_resolv_conf(arg_directory) < 0)
1394                                 goto child_fail;
1395
1396                         if (setup_journal(arg_directory) < 0)
1397                                 goto child_fail;
1398
1399                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1400                                 goto child_fail;
1401
1402                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1403                                 goto child_fail;
1404
1405                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
1406                                 goto child_fail;
1407
1408                         if (chdir(arg_directory) < 0) {
1409                                 log_error("chdir(%s) failed: %m", arg_directory);
1410                                 goto child_fail;
1411                         }
1412
1413                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1414                                 log_error("mount(MS_MOVE) failed: %m");
1415                                 goto child_fail;
1416                         }
1417
1418                         if (chroot(".") < 0) {
1419                                 log_error("chroot() failed: %m");
1420                                 goto child_fail;
1421                         }
1422
1423                         if (chdir("/") < 0) {
1424                                 log_error("chdir() failed: %m");
1425                                 goto child_fail;
1426                         }
1427
1428                         umask(0022);
1429
1430                         loopback_setup();
1431
1432                         if (drop_capabilities() < 0) {
1433                                 log_error("drop_capabilities() failed: %m");
1434                                 goto child_fail;
1435                         }
1436
1437                         if (arg_user) {
1438
1439                                 /* Note that this resolves user names
1440                                  * inside the container, and hence
1441                                  * accesses the NSS modules from the
1442                                  * container and not the host. This is
1443                                  * a bit weird... */
1444
1445                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1446                                         log_error("get_user_creds() failed: %m");
1447                                         goto child_fail;
1448                                 }
1449
1450                                 if (mkdir_parents_label(home, 0775) < 0) {
1451                                         log_error("mkdir_parents_label() failed: %m");
1452                                         goto child_fail;
1453                                 }
1454
1455                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1456                                         log_error("mkdir_safe_label() failed: %m");
1457                                         goto child_fail;
1458                                 }
1459
1460                                 if (initgroups((const char*)arg_user, gid) < 0) {
1461                                         log_error("initgroups() failed: %m");
1462                                         goto child_fail;
1463                                 }
1464
1465                                 if (setresgid(gid, gid, gid) < 0) {
1466                                         log_error("setregid() failed: %m");
1467                                         goto child_fail;
1468                                 }
1469
1470                                 if (setresuid(uid, uid, uid) < 0) {
1471                                         log_error("setreuid() failed: %m");
1472                                         goto child_fail;
1473                                 }
1474                         } else {
1475                                 /* Reset everything fully to 0, just in case */
1476
1477                                 if (setgroups(0, NULL) < 0) {
1478                                         log_error("setgroups() failed: %m");
1479                                         goto child_fail;
1480                                 }
1481
1482                                 if (setresgid(0, 0, 0) < 0) {
1483                                         log_error("setregid() failed: %m");
1484                                         goto child_fail;
1485                                 }
1486
1487                                 if (setresuid(0, 0, 0) < 0) {
1488                                         log_error("setreuid() failed: %m");
1489                                         goto child_fail;
1490                                 }
1491                         }
1492
1493                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1494                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1495                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1496                                 log_oom();
1497                                 goto child_fail;
1498                         }
1499
1500                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1501                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1502                                         log_oom();
1503                                         goto child_fail;
1504                                 }
1505                         }
1506
1507                         if (fdset_size(fds) > 0) {
1508                                 k = fdset_cloexec(fds, false);
1509                                 if (k < 0) {
1510                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1511                                         goto child_fail;
1512                                 }
1513
1514                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1515                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1516                                         log_oom();
1517                                         goto child_fail;
1518                                 }
1519                         }
1520
1521                         setup_hostname();
1522
1523                         eventfd_read(sync_fd, &x);
1524                         close_nointr_nofail(sync_fd);
1525                         sync_fd = -1;
1526
1527                         if (!strv_isempty(arg_setenv)) {
1528                                 char **n;
1529
1530                                 n = strv_env_merge(2, envp, arg_setenv);
1531                                 if (!n) {
1532                                         log_oom();
1533                                         goto child_fail;
1534                                 }
1535
1536                                 env_use = n;
1537                         } else
1538                                 env_use = (char**) envp;
1539
1540 #ifdef HAVE_SELINUX
1541                         if (arg_selinux_context)
1542                                 if (setexeccon(arg_selinux_context) < 0)
1543                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
1544 #endif
1545                         if (arg_boot) {
1546                                 char **a;
1547                                 size_t l;
1548
1549                                 /* Automatically search for the init system */
1550
1551                                 l = 1 + argc - optind;
1552                                 a = newa(char*, l + 1);
1553                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1554
1555                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1556                                 execve(a[0], a, env_use);
1557
1558                                 a[0] = (char*) "/lib/systemd/systemd";
1559                                 execve(a[0], a, env_use);
1560
1561                                 a[0] = (char*) "/sbin/init";
1562                                 execve(a[0], a, env_use);
1563                         } else if (argc > optind)
1564                                 execvpe(argv[optind], argv + optind, env_use);
1565                         else {
1566                                 chdir(home ? home : "/root");
1567                                 execle("/bin/bash", "-bash", NULL, env_use);
1568                         }
1569
1570                         log_error("execv() failed: %m");
1571
1572                 child_fail:
1573                         _exit(EXIT_FAILURE);
1574                 }
1575
1576                 fdset_free(fds);
1577                 fds = NULL;
1578
1579                 r = register_machine(pid);
1580                 if (r < 0)
1581                         goto finish;
1582
1583                 eventfd_write(sync_fd, 1);
1584                 close_nointr_nofail(sync_fd);
1585                 sync_fd = -1;
1586
1587                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1588                 if (k < 0) {
1589                         r = EXIT_FAILURE;
1590                         break;
1591                 }
1592
1593                 if (!arg_quiet)
1594                         putc('\n', stdout);
1595
1596                 /* Kill if it is not dead yet anyway */
1597                 terminate_machine(pid);
1598
1599                 /* Redundant, but better safe than sorry */
1600                 kill(pid, SIGKILL);
1601
1602                 k = wait_for_terminate(pid, &status);
1603                 pid = 0;
1604
1605                 if (k < 0) {
1606                         r = EXIT_FAILURE;
1607                         break;
1608                 }
1609
1610                 if (status.si_code == CLD_EXITED) {
1611                         r = status.si_status;
1612                         if (status.si_status != 0) {
1613                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1614                                 break;
1615                         }
1616
1617                         if (!arg_quiet)
1618                                 log_debug("Container %s exited successfully.", arg_machine);
1619                         break;
1620                 } else if (status.si_code == CLD_KILLED &&
1621                            status.si_status == SIGINT) {
1622
1623                         if (!arg_quiet)
1624                                 log_info("Container %s has been shut down.", arg_machine);
1625                         r = 0;
1626                         break;
1627                 } else if (status.si_code == CLD_KILLED &&
1628                            status.si_status == SIGHUP) {
1629
1630                         if (!arg_quiet)
1631                                 log_info("Container %s is being rebooted.", arg_machine);
1632                         continue;
1633                 } else if (status.si_code == CLD_KILLED ||
1634                            status.si_code == CLD_DUMPED) {
1635
1636                         log_error("Container %s terminated by signal %s.", arg_machine,  signal_to_string(status.si_status));
1637                         r = EXIT_FAILURE;
1638                         break;
1639                 } else {
1640                         log_error("Container %s failed due to unknown reason.", arg_machine);
1641                         r = EXIT_FAILURE;
1642                         break;
1643                 }
1644         }
1645
1646 finish:
1647         if (pid > 0)
1648                 kill(pid, SIGKILL);
1649
1650         free(arg_directory);
1651         free(arg_machine);
1652         free(arg_setenv);
1653
1654         return r;
1655 }