chiark / gitweb /
160b50b3ed83c8042c025da6bd46246ab11d7800
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <linux/rtnetlink.h>
44 #include <sys/eventfd.h>
45 #include <net/if.h>
46
47 #ifdef HAVE_SELINUX
48 #include <selinux/selinux.h>
49 #endif
50
51 #include "sd-daemon.h"
52 #include "sd-bus.h"
53 #include "sd-id128.h"
54 #include "sd-rtnl.h"
55 #include "log.h"
56 #include "util.h"
57 #include "mkdir.h"
58 #include "macro.h"
59 #include "audit.h"
60 #include "missing.h"
61 #include "cgroup-util.h"
62 #include "strv.h"
63 #include "path-util.h"
64 #include "loopback-setup.h"
65 #include "dev-setup.h"
66 #include "fdset.h"
67 #include "build.h"
68 #include "fileio.h"
69 #include "bus-util.h"
70 #include "bus-error.h"
71 #include "ptyfwd.h"
72 #include "bus-kernel.h"
73 #include "env-util.h"
74 #include "def.h"
75 #include "rtnl-util.h"
76
/* Modes accepted by --link-journal= ("no", "auto", "host", "guest");
 * -j selects one of them as a shortcut. */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3,
} LinkJournal;
83
84 static char *arg_directory = NULL;
85 static char *arg_user = NULL;
86 static sd_id128_t arg_uuid = {};
87 static char *arg_machine = NULL;
88 static char *arg_selinux_context = NULL;
89 static char *arg_selinux_apifs_context = NULL;
90 static const char *arg_slice = NULL;
91 static bool arg_private_network = false;
92 static bool arg_read_only = false;
93 static bool arg_boot = false;
94 static LinkJournal arg_link_journal = LINK_AUTO;
95 static uint64_t arg_retain =
96         (1ULL << CAP_CHOWN) |
97         (1ULL << CAP_DAC_OVERRIDE) |
98         (1ULL << CAP_DAC_READ_SEARCH) |
99         (1ULL << CAP_FOWNER) |
100         (1ULL << CAP_FSETID) |
101         (1ULL << CAP_IPC_OWNER) |
102         (1ULL << CAP_KILL) |
103         (1ULL << CAP_LEASE) |
104         (1ULL << CAP_LINUX_IMMUTABLE) |
105         (1ULL << CAP_NET_BIND_SERVICE) |
106         (1ULL << CAP_NET_BROADCAST) |
107         (1ULL << CAP_NET_RAW) |
108         (1ULL << CAP_SETGID) |
109         (1ULL << CAP_SETFCAP) |
110         (1ULL << CAP_SETPCAP) |
111         (1ULL << CAP_SETUID) |
112         (1ULL << CAP_SYS_ADMIN) |
113         (1ULL << CAP_SYS_CHROOT) |
114         (1ULL << CAP_SYS_NICE) |
115         (1ULL << CAP_SYS_PTRACE) |
116         (1ULL << CAP_SYS_TTY_CONFIG) |
117         (1ULL << CAP_SYS_RESOURCE) |
118         (1ULL << CAP_SYS_BOOT) |
119         (1ULL << CAP_AUDIT_WRITE) |
120         (1ULL << CAP_AUDIT_CONTROL) |
121         (1ULL << CAP_MKNOD);
122 static char **arg_bind = NULL;
123 static char **arg_bind_ro = NULL;
124 static char **arg_setenv = NULL;
125 static bool arg_quiet = false;
126 static bool arg_share_system = false;
127 static bool arg_register = true;
128 static bool arg_keep_unit = false;
129 static char **arg_network_interfaces = NULL;
130
131 static int help(void) {
132
133         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
134                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
135                "  -h --help                 Show this help\n"
136                "     --version              Print version string\n"
137                "  -D --directory=NAME       Root directory for the container\n"
138                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
139                "  -u --user=USER            Run the command under specified user or uid\n"
140                "     --uuid=UUID            Set a specific machine UUID for the container\n"
141                "  -M --machine=NAME         Set the machine name for the container\n"
142                "  -S --slice=SLICE          Place the container in the specified slice\n"
143                "  -Z --selinux-context=SECLABEL\n"
144                "                            Set the SELinux security context to be used by\n"
145                "                            processes in the container\n"
146                "  -L --selinux-apifs-context=SECLABEL\n"
147                "                            Set the SELinux security context to be used by\n"
148                "                            API/tmpfs file systems in the container\n"
149                "     --private-network      Disable network in container\n"
150                "     --network-interface=INTERFACE\n"
151                "                            Assign an existing network interface to the container\n"
152                "     --share-system         Share system namespaces with host\n"
153                "     --read-only            Mount the root directory read-only\n"
154                "     --capability=CAP       In addition to the default, retain specified\n"
155                "                            capability\n"
156                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
157                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
158                "  -j                        Equivalent to --link-journal=host\n"
159                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
160                "                            the container\n"
161                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
162                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
163                "     --register=BOOLEAN     Register container as machine\n"
164                "     --keep-unit            Do not register a scope for the machine, reuse\n"
165                "                            the service unit nspawn is running in\n"
166                "  -q --quiet                Do not show status information\n",
167                program_invocation_short_name);
168
169         return 0;
170 }
171
172 static int parse_argv(int argc, char *argv[]) {
173
174         enum {
175                 ARG_VERSION = 0x100,
176                 ARG_PRIVATE_NETWORK,
177                 ARG_UUID,
178                 ARG_READ_ONLY,
179                 ARG_CAPABILITY,
180                 ARG_DROP_CAPABILITY,
181                 ARG_LINK_JOURNAL,
182                 ARG_BIND,
183                 ARG_BIND_RO,
184                 ARG_SETENV,
185                 ARG_SHARE_SYSTEM,
186                 ARG_REGISTER,
187                 ARG_KEEP_UNIT,
188                 ARG_NETWORK_INTERFACE
189         };
190
191         static const struct option options[] = {
192                 { "help",                  no_argument,       NULL, 'h'                   },
193                 { "version",               no_argument,       NULL, ARG_VERSION           },
194                 { "directory",             required_argument, NULL, 'D'                   },
195                 { "user",                  required_argument, NULL, 'u'                   },
196                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
197                 { "boot",                  no_argument,       NULL, 'b'                   },
198                 { "uuid",                  required_argument, NULL, ARG_UUID              },
199                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
200                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
201                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
202                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
203                 { "bind",                  required_argument, NULL, ARG_BIND              },
204                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
205                 { "machine",               required_argument, NULL, 'M'                   },
206                 { "slice",                 required_argument, NULL, 'S'                   },
207                 { "setenv",                required_argument, NULL, ARG_SETENV            },
208                 { "selinux-context",       required_argument, NULL, 'Z'                   },
209                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
210                 { "quiet",                 no_argument,       NULL, 'q'                   },
211                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
212                 { "register",              required_argument, NULL, ARG_REGISTER          },
213                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
214                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
215                 {}
216         };
217
218         int c, r;
219
220         assert(argc >= 0);
221         assert(argv);
222
223         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
224
225                 switch (c) {
226
227                 case 'h':
228                         return help();
229
230                 case ARG_VERSION:
231                         puts(PACKAGE_STRING);
232                         puts(SYSTEMD_FEATURES);
233                         return 0;
234
235                 case 'D':
236                         free(arg_directory);
237                         arg_directory = canonicalize_file_name(optarg);
238                         if (!arg_directory) {
239                                 log_error("Invalid root directory: %m");
240                                 return -ENOMEM;
241                         }
242
243                         break;
244
245                 case 'u':
246                         free(arg_user);
247                         arg_user = strdup(optarg);
248                         if (!arg_user)
249                                 return log_oom();
250
251                         break;
252
253                 case ARG_NETWORK_INTERFACE:
254                         if (strv_push(&arg_network_interfaces, optarg) < 0)
255                                 return log_oom();
256
257                         /* fall through */
258
259                 case ARG_PRIVATE_NETWORK:
260                         arg_private_network = true;
261                         break;
262
263                 case 'b':
264                         arg_boot = true;
265                         break;
266
267                 case ARG_UUID:
268                         r = sd_id128_from_string(optarg, &arg_uuid);
269                         if (r < 0) {
270                                 log_error("Invalid UUID: %s", optarg);
271                                 return r;
272                         }
273                         break;
274
275                 case 'S':
276                         arg_slice = strdup(optarg);
277                         if (!arg_slice)
278                                 return log_oom();
279
280                         break;
281
282                 case 'M':
283                         if (isempty(optarg)) {
284                                 free(arg_machine);
285                                 arg_machine = NULL;
286                         } else {
287
288                                 if (!hostname_is_valid(optarg)) {
289                                         log_error("Invalid machine name: %s", optarg);
290                                         return -EINVAL;
291                                 }
292
293                                 free(arg_machine);
294                                 arg_machine = strdup(optarg);
295                                 if (!arg_machine)
296                                         return log_oom();
297
298                                 break;
299                         }
300
301                 case 'Z':
302                         arg_selinux_context = optarg;
303                         break;
304
305                 case 'L':
306                         arg_selinux_apifs_context = optarg;
307                         break;
308
309                 case ARG_READ_ONLY:
310                         arg_read_only = true;
311                         break;
312
313                 case ARG_CAPABILITY:
314                 case ARG_DROP_CAPABILITY: {
315                         char *state, *word;
316                         size_t length;
317
318                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
319                                 _cleanup_free_ char *t;
320                                 cap_value_t cap;
321
322                                 t = strndup(word, length);
323                                 if (!t)
324                                         return log_oom();
325
326                                 if (streq(t, "all")) {
327                                         if (c == ARG_CAPABILITY)
328                                                 arg_retain = (uint64_t) -1;
329                                         else
330                                                 arg_retain = 0;
331                                 } else {
332                                         if (cap_from_name(t, &cap) < 0) {
333                                                 log_error("Failed to parse capability %s.", t);
334                                                 return -EINVAL;
335                                         }
336
337                                         if (c == ARG_CAPABILITY)
338                                                 arg_retain |= 1ULL << (uint64_t) cap;
339                                         else
340                                                 arg_retain &= ~(1ULL << (uint64_t) cap);
341                                 }
342                         }
343
344                         break;
345                 }
346
347                 case 'j':
348                         arg_link_journal = LINK_GUEST;
349                         break;
350
351                 case ARG_LINK_JOURNAL:
352                         if (streq(optarg, "auto"))
353                                 arg_link_journal = LINK_AUTO;
354                         else if (streq(optarg, "no"))
355                                 arg_link_journal = LINK_NO;
356                         else if (streq(optarg, "guest"))
357                                 arg_link_journal = LINK_GUEST;
358                         else if (streq(optarg, "host"))
359                                 arg_link_journal = LINK_HOST;
360                         else {
361                                 log_error("Failed to parse link journal mode %s", optarg);
362                                 return -EINVAL;
363                         }
364
365                         break;
366
367                 case ARG_BIND:
368                 case ARG_BIND_RO: {
369                         _cleanup_free_ char *a = NULL, *b = NULL;
370                         char *e;
371                         char ***x;
372
373                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
374
375                         e = strchr(optarg, ':');
376                         if (e) {
377                                 a = strndup(optarg, e - optarg);
378                                 b = strdup(e + 1);
379                         } else {
380                                 a = strdup(optarg);
381                                 b = strdup(optarg);
382                         }
383
384                         if (!a || !b)
385                                 return log_oom();
386
387                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
388                                 log_error("Invalid bind mount specification: %s", optarg);
389                                 return -EINVAL;
390                         }
391
392                         r = strv_extend(x, a);
393                         if (r < 0)
394                                 return log_oom();
395
396                         r = strv_extend(x, b);
397                         if (r < 0)
398                                 return log_oom();
399
400                         break;
401                 }
402
403                 case ARG_SETENV: {
404                         char **n;
405
406                         if (!env_assignment_is_valid(optarg)) {
407                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
408                                 return -EINVAL;
409                         }
410
411                         n = strv_env_set(arg_setenv, optarg);
412                         if (!n)
413                                 return log_oom();
414
415                         strv_free(arg_setenv);
416                         arg_setenv = n;
417                         break;
418                 }
419
420                 case 'q':
421                         arg_quiet = true;
422                         break;
423
424                 case ARG_SHARE_SYSTEM:
425                         arg_share_system = true;
426                         break;
427
428                 case ARG_REGISTER:
429                         r = parse_boolean(optarg);
430                         if (r < 0) {
431                                 log_error("Failed to parse --register= argument: %s", optarg);
432                                 return r;
433                         }
434
435                         arg_register = r;
436                         break;
437
438                 case ARG_KEEP_UNIT:
439                         arg_keep_unit = true;
440                         break;
441
442                 case '?':
443                         return -EINVAL;
444
445                 default:
446                         assert_not_reached("Unhandled option");
447                 }
448         }
449
450         if (arg_share_system)
451                 arg_register = false;
452
453         if (arg_boot && arg_share_system) {
454                 log_error("--boot and --share-system may not be combined.");
455                 return -EINVAL;
456         }
457
458         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
459                 log_error("--keep-unit may not be used when invoked from a user session.");
460                 return -EINVAL;
461         }
462
463         return 1;
464 }
465
466 static int mount_all(const char *dest) {
467
468         typedef struct MountPoint {
469                 const char *what;
470                 const char *where;
471                 const char *type;
472                 const char *options;
473                 unsigned long flags;
474                 bool fatal;
475         } MountPoint;
476
477         static const MountPoint mount_table[] = {
478                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
479                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
480                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
481                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
482                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
483                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
484                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
485                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
486 #ifdef HAVE_SELINUX
487                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
488                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
489 #endif
490         };
491
492         unsigned k;
493         int r = 0;
494
495         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
496                 _cleanup_free_ char *where = NULL;
497 #ifdef HAVE_SELINUX
498                 _cleanup_free_ char *options = NULL;
499 #endif
500                 const char *o;
501                 int t;
502
503                 where = strjoin(dest, "/", mount_table[k].where, NULL);
504                 if (!where)
505                         return log_oom();
506
507                 t = path_is_mount_point(where, true);
508                 if (t < 0) {
509                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
510
511                         if (r == 0)
512                                 r = t;
513
514                         continue;
515                 }
516
517                 /* Skip this entry if it is not a remount. */
518                 if (mount_table[k].what && t > 0)
519                         continue;
520
521                 mkdir_p(where, 0755);
522
523 #ifdef HAVE_SELINUX
524                 if (arg_selinux_apifs_context &&
525                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
526                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
527                         if (!options)
528                                 return log_oom();
529
530                         o = options;
531                 } else
532 #endif
533                         o = mount_table[k].options;
534
535
536                 if (mount(mount_table[k].what,
537                           where,
538                           mount_table[k].type,
539                           mount_table[k].flags,
540                           o) < 0 &&
541                     mount_table[k].fatal) {
542
543                         log_error("mount(%s) failed: %m", where);
544
545                         if (r == 0)
546                                 r = -errno;
547                 }
548         }
549
550         return r;
551 }
552
553 static int mount_binds(const char *dest, char **l, unsigned long flags) {
554         char **x, **y;
555
556         STRV_FOREACH_PAIR(x, y, l) {
557                 char *where;
558                 struct stat source_st, dest_st;
559                 int r;
560
561                 if (stat(*x, &source_st) < 0) {
562                         log_error("failed to stat %s: %m", *x);
563                         return -errno;
564                 }
565
566                 where = strappenda(dest, *y);
567                 r = stat(where, &dest_st);
568                 if (r == 0) {
569                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
570                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
571                                                 *x, where);
572                                 return -EINVAL;
573                         }
574                 } else if (errno == ENOENT) {
575                         r = mkdir_parents_label(where, 0755);
576                         if (r < 0) {
577                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
578                                 return r;
579                         }
580                 } else {
581                         log_error("Failed to bind mount %s: %s", *x, strerror(errno));
582                         return -errno;
583                 }
584                 /* Create the mount point, but be conservative -- refuse to create block
585                 * and char devices. */
586                 if (S_ISDIR(source_st.st_mode))
587                         mkdir_label(where, 0755);
588                 else if (S_ISFIFO(source_st.st_mode))
589                         mkfifo(where, 0644);
590                 else if (S_ISSOCK(source_st.st_mode))
591                         mknod(where, 0644 | S_IFSOCK, 0);
592                 else if (S_ISREG(source_st.st_mode))
593                         touch(where);
594                 else {
595                         log_error("Refusing to create mountpoint for file: %s", *x);
596                         return -ENOTSUP;
597                 }
598
599                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
600                         log_error("mount(%s) failed: %m", where);
601                         return -errno;
602                 }
603
604                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
605                         log_error("mount(%s) failed: %m", where);
606                         return -errno;
607                 }
608         }
609
610         return 0;
611 }
612
613 static int setup_timezone(const char *dest) {
614         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
615         char *z, *y;
616         int r;
617
618         assert(dest);
619
620         /* Fix the timezone, if possible */
621         r = readlink_malloc("/etc/localtime", &p);
622         if (r < 0) {
623                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
624                 return 0;
625         }
626
627         z = path_startswith(p, "../usr/share/zoneinfo/");
628         if (!z)
629                 z = path_startswith(p, "/usr/share/zoneinfo/");
630         if (!z) {
631                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
632                 return 0;
633         }
634
635         where = strappend(dest, "/etc/localtime");
636         if (!where)
637                 return log_oom();
638
639         r = readlink_malloc(where, &q);
640         if (r >= 0) {
641                 y = path_startswith(q, "../usr/share/zoneinfo/");
642                 if (!y)
643                         y = path_startswith(q, "/usr/share/zoneinfo/");
644
645
646                 /* Already pointing to the right place? Then do nothing .. */
647                 if (y && streq(y, z))
648                         return 0;
649         }
650
651         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
652         if (!check)
653                 return log_oom();
654
655         if (access(check, F_OK) < 0) {
656                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
657                 return 0;
658         }
659
660         what = strappend("../usr/share/zoneinfo/", z);
661         if (!what)
662                 return log_oom();
663
664         unlink(where);
665         if (symlink(what, where) < 0) {
666                 log_error("Failed to correct timezone of container: %m");
667                 return 0;
668         }
669
670         return 0;
671 }
672
673 static int setup_resolv_conf(const char *dest) {
674         char _cleanup_free_ *where = NULL;
675
676         assert(dest);
677
678         if (arg_private_network)
679                 return 0;
680
681         /* Fix resolv.conf, if possible */
682         where = strappend(dest, "/etc/resolv.conf");
683         if (!where)
684                 return log_oom();
685
686         /* We don't really care for the results of this really. If it
687          * fails, it fails, but meh... */
688         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
689
690         return 0;
691 }
692
693 static int setup_boot_id(const char *dest) {
694         _cleanup_free_ char *from = NULL, *to = NULL;
695         sd_id128_t rnd;
696         char as_uuid[37];
697         int r;
698
699         assert(dest);
700
701         if (arg_share_system)
702                 return 0;
703
704         /* Generate a new randomized boot ID, so that each boot-up of
705          * the container gets a new one */
706
707         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
708         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
709         if (!from || !to)
710                 return log_oom();
711
712         r = sd_id128_randomize(&rnd);
713         if (r < 0) {
714                 log_error("Failed to generate random boot id: %s", strerror(-r));
715                 return r;
716         }
717
718         snprintf(as_uuid, sizeof(as_uuid),
719                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
720                  SD_ID128_FORMAT_VAL(rnd));
721         char_array_0(as_uuid);
722
723         r = write_string_file(from, as_uuid);
724         if (r < 0) {
725                 log_error("Failed to write boot id: %s", strerror(-r));
726                 return r;
727         }
728
729         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
730                 log_error("Failed to bind mount boot id: %m");
731                 r = -errno;
732         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
733                 log_warning("Failed to make boot id read-only: %m");
734
735         unlink(from);
736         return r;
737 }
738
739 static int copy_devnodes(const char *dest) {
740
741         static const char devnodes[] =
742                 "null\0"
743                 "zero\0"
744                 "full\0"
745                 "random\0"
746                 "urandom\0"
747                 "tty\0";
748
749         const char *d;
750         int r = 0;
751         _cleanup_umask_ mode_t u;
752
753         assert(dest);
754
755         u = umask(0000);
756
757         NULSTR_FOREACH(d, devnodes) {
758                 _cleanup_free_ char *from = NULL, *to = NULL;
759                 struct stat st;
760
761                 from = strappend("/dev/", d);
762                 to = strjoin(dest, "/dev/", d, NULL);
763                 if (!from || !to)
764                         return log_oom();
765
766                 if (stat(from, &st) < 0) {
767
768                         if (errno != ENOENT) {
769                                 log_error("Failed to stat %s: %m", from);
770                                 return -errno;
771                         }
772
773                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
774
775                         log_error("%s is not a char or block device, cannot copy", from);
776                         return -EIO;
777
778                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
779
780                         log_error("mknod(%s) failed: %m", dest);
781                         return  -errno;
782                 }
783         }
784
785         return r;
786 }
787
788 static int setup_ptmx(const char *dest) {
789         _cleanup_free_ char *p = NULL;
790
791         p = strappend(dest, "/dev/ptmx");
792         if (!p)
793                 return log_oom();
794
795         if (symlink("pts/ptmx", p) < 0) {
796                 log_error("Failed to create /dev/ptmx symlink: %m");
797                 return -errno;
798         }
799
800         return 0;
801 }
802
803 static int setup_dev_console(const char *dest, const char *console) {
804         struct stat st;
805         _cleanup_free_ char *to = NULL;
806         int r;
807         _cleanup_umask_ mode_t u;
808
809         assert(dest);
810         assert(console);
811
812         u = umask(0000);
813
814         if (stat(console, &st) < 0) {
815                 log_error("Failed to stat %s: %m", console);
816                 return -errno;
817
818         } else if (!S_ISCHR(st.st_mode)) {
819                 log_error("/dev/console is not a char device");
820                 return -EIO;
821         }
822
823         r = chmod_and_chown(console, 0600, 0, 0);
824         if (r < 0) {
825                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
826                 return r;
827         }
828
829         if (asprintf(&to, "%s/dev/console", dest) < 0)
830                 return log_oom();
831
832         /* We need to bind mount the right tty to /dev/console since
833          * ptys can only exist on pts file systems. To have something
834          * to bind mount things on we create a device node first, that
835          * has the right major/minor (note that the major minor
836          * doesn't actually matter here, since we mount it over
837          * anyway). */
838
839         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
840                 log_error("mknod() for /dev/console failed: %m");
841                 return -errno;
842         }
843
844         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
845                 log_error("Bind mount for /dev/console failed: %m");
846                 return -errno;
847         }
848
849         return 0;
850 }
851
852 static int setup_kmsg(const char *dest, int kmsg_socket) {
853         _cleanup_free_ char *from = NULL, *to = NULL;
854         int r, fd, k;
855         _cleanup_umask_ mode_t u;
856         union {
857                 struct cmsghdr cmsghdr;
858                 uint8_t buf[CMSG_SPACE(sizeof(int))];
859         } control = {};
860         struct msghdr mh = {
861                 .msg_control = &control,
862                 .msg_controllen = sizeof(control),
863         };
864         struct cmsghdr *cmsg;
865
866         assert(dest);
867         assert(kmsg_socket >= 0);
868
869         u = umask(0000);
870
871         /* We create the kmsg FIFO as /dev/kmsg, but immediately
872          * delete it after bind mounting it to /proc/kmsg. While FIFOs
873          * on the reading side behave very similar to /proc/kmsg,
874          * their writing side behaves differently from /dev/kmsg in
875          * that writing blocks when nothing is reading. In order to
876          * avoid any problems with containers deadlocking due to this
877          * we simply make /dev/kmsg unavailable to the container. */
878         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
879             asprintf(&to, "%s/proc/kmsg", dest) < 0)
880                 return log_oom();
881
882         if (mkfifo(from, 0600) < 0) {
883                 log_error("mkfifo() for /dev/kmsg failed: %m");
884                 return -errno;
885         }
886
887         r = chmod_and_chown(from, 0600, 0, 0);
888         if (r < 0) {
889                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
890                 return r;
891         }
892
893         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
894                 log_error("Bind mount for /proc/kmsg failed: %m");
895                 return -errno;
896         }
897
898         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
899         if (fd < 0) {
900                 log_error("Failed to open fifo: %m");
901                 return -errno;
902         }
903
904         cmsg = CMSG_FIRSTHDR(&mh);
905         cmsg->cmsg_level = SOL_SOCKET;
906         cmsg->cmsg_type = SCM_RIGHTS;
907         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
908         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
909
910         mh.msg_controllen = cmsg->cmsg_len;
911
912         /* Store away the fd in the socket, so that it stays open as
913          * long as we run the child */
914         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
915         close_nointr_nofail(fd);
916
917         if (k < 0) {
918                 log_error("Failed to send FIFO fd: %m");
919                 return -errno;
920         }
921
922         /* And now make the FIFO unavailable as /dev/kmsg... */
923         unlink(from);
924         return 0;
925 }
926
927 static int setup_hostname(void) {
928
929         if (arg_share_system)
930                 return 0;
931
932         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
933                 return -errno;
934
935         return 0;
936 }
937
938 static int setup_journal(const char *directory) {
939         sd_id128_t machine_id, this_id;
940         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
941         char *id;
942         int r;
943
944         p = strappend(directory, "/etc/machine-id");
945         if (!p)
946                 return log_oom();
947
948         r = read_one_line_file(p, &b);
949         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
950                 return 0;
951         else if (r < 0) {
952                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
953                 return r;
954         }
955
956         id = strstrip(b);
957         if (isempty(id) && arg_link_journal == LINK_AUTO)
958                 return 0;
959
960         /* Verify validity */
961         r = sd_id128_from_string(id, &machine_id);
962         if (r < 0) {
963                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
964                 return r;
965         }
966
967         r = sd_id128_get_machine(&this_id);
968         if (r < 0) {
969                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
970                 return r;
971         }
972
973         if (sd_id128_equal(machine_id, this_id)) {
974                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
975                          "Host and machine ids are equal (%s): refusing to link journals", id);
976                 if (arg_link_journal == LINK_AUTO)
977                         return 0;
978                 return
979                         -EEXIST;
980         }
981
982         if (arg_link_journal == LINK_NO)
983                 return 0;
984
985         free(p);
986         p = strappend("/var/log/journal/", id);
987         q = strjoin(directory, "/var/log/journal/", id, NULL);
988         if (!p || !q)
989                 return log_oom();
990
991         if (path_is_mount_point(p, false) > 0) {
992                 if (arg_link_journal != LINK_AUTO) {
993                         log_error("%s: already a mount point, refusing to use for journal", p);
994                         return -EEXIST;
995                 }
996
997                 return 0;
998         }
999
1000         if (path_is_mount_point(q, false) > 0) {
1001                 if (arg_link_journal != LINK_AUTO) {
1002                         log_error("%s: already a mount point, refusing to use for journal", q);
1003                         return -EEXIST;
1004                 }
1005
1006                 return 0;
1007         }
1008
1009         r = readlink_and_make_absolute(p, &d);
1010         if (r >= 0) {
1011                 if ((arg_link_journal == LINK_GUEST ||
1012                      arg_link_journal == LINK_AUTO) &&
1013                     path_equal(d, q)) {
1014
1015                         r = mkdir_p(q, 0755);
1016                         if (r < 0)
1017                                 log_warning("failed to create directory %s: %m", q);
1018                         return 0;
1019                 }
1020
1021                 if (unlink(p) < 0) {
1022                         log_error("Failed to remove symlink %s: %m", p);
1023                         return -errno;
1024                 }
1025         } else if (r == -EINVAL) {
1026
1027                 if (arg_link_journal == LINK_GUEST &&
1028                     rmdir(p) < 0) {
1029
1030                         if (errno == ENOTDIR) {
1031                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1032                                 return r;
1033                         } else {
1034                                 log_error("Failed to remove %s: %m", p);
1035                                 return -errno;
1036                         }
1037                 }
1038         } else if (r != -ENOENT) {
1039                 log_error("readlink(%s) failed: %m", p);
1040                 return r;
1041         }
1042
1043         if (arg_link_journal == LINK_GUEST) {
1044
1045                 if (symlink(q, p) < 0) {
1046                         log_error("Failed to symlink %s to %s: %m", q, p);
1047                         return -errno;
1048                 }
1049
1050                 r = mkdir_p(q, 0755);
1051                 if (r < 0)
1052                         log_warning("failed to create directory %s: %m", q);
1053                 return 0;
1054         }
1055
1056         if (arg_link_journal == LINK_HOST) {
1057                 r = mkdir_p(p, 0755);
1058                 if (r < 0) {
1059                         log_error("Failed to create %s: %m", p);
1060                         return r;
1061                 }
1062
1063         } else if (access(p, F_OK) < 0)
1064                 return 0;
1065
1066         if (dir_is_empty(q) == 0) {
1067                 log_error("%s not empty.", q);
1068                 return -ENOTEMPTY;
1069         }
1070
1071         r = mkdir_p(q, 0755);
1072         if (r < 0) {
1073                 log_error("Failed to create %s: %m", q);
1074                 return r;
1075         }
1076
1077         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1078                 log_error("Failed to bind mount journal from host into guest: %m");
1079                 return -errno;
1080         }
1081
1082         return 0;
1083 }
1084
/* Bind mounts the kdbus domain directory `path` to <dest>/dev/kdbus.
 *
 * Nothing is done when `path` is NULL. Otherwise the directory
 * <dest>/dev/kdbus is created with mode 0755 and `path` is bind
 * mounted onto it.
 *
 * Returns 0 on success or when skipped, -errno on failure. */
static int setup_kdbus(const char *dest, const char *path) {
        const char *mount_point;

        if (path == NULL)
                return 0;

        mount_point = strappenda(dest, "/dev/kdbus");

        if (mkdir(mount_point, 0755) < 0) {
                log_error("Failed to create kdbus path: %m");
                return -errno;
        }

        if (mount(path, mount_point, "bind", MS_BIND, NULL) < 0) {
                log_error("Failed to mount kdbus domain path: %m");
                return -errno;
        }

        return 0;
}
1104
1105 static int drop_capabilities(void) {
1106         return capability_bounding_set_drop(~arg_retain, false);
1107 }
1108
1109 static int register_machine(pid_t pid) {
1110         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1111         _cleanup_bus_unref_ sd_bus *bus = NULL;
1112         int r;
1113
1114         if (!arg_register)
1115                 return 0;
1116
1117         r = sd_bus_default_system(&bus);
1118         if (r < 0) {
1119                 log_error("Failed to open system bus: %s", strerror(-r));
1120                 return r;
1121         }
1122
1123         if (arg_keep_unit) {
1124                 r = sd_bus_call_method(
1125                                 bus,
1126                                 "org.freedesktop.machine1",
1127                                 "/org/freedesktop/machine1",
1128                                 "org.freedesktop.machine1.Manager",
1129                                 "RegisterMachine",
1130                                 &error,
1131                                 NULL,
1132                                 "sayssus",
1133                                 arg_machine,
1134                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1135                                 "nspawn",
1136                                 "container",
1137                                 (uint32_t) pid,
1138                                 strempty(arg_directory));
1139         } else {
1140                 r = sd_bus_call_method(
1141                                 bus,
1142                                 "org.freedesktop.machine1",
1143                                 "/org/freedesktop/machine1",
1144                                 "org.freedesktop.machine1.Manager",
1145                                 "CreateMachine",
1146                                 &error,
1147                                 NULL,
1148                                 "sayssusa(sv)",
1149                                 arg_machine,
1150                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1151                                 "nspawn",
1152                                 "container",
1153                                 (uint32_t) pid,
1154                                 strempty(arg_directory),
1155                                 !isempty(arg_slice), "Slice", "s", arg_slice);
1156         }
1157
1158         if (r < 0) {
1159                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1160                 return r;
1161         }
1162
1163         return 0;
1164 }
1165
1166 static int terminate_machine(pid_t pid) {
1167         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1168         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1169         _cleanup_bus_unref_ sd_bus *bus = NULL;
1170         const char *path;
1171         int r;
1172
1173         if (!arg_register)
1174                 return 0;
1175
1176         r = sd_bus_default_system(&bus);
1177         if (r < 0) {
1178                 log_error("Failed to open system bus: %s", strerror(-r));
1179                 return r;
1180         }
1181
1182         r = sd_bus_call_method(
1183                         bus,
1184                         "org.freedesktop.machine1",
1185                         "/org/freedesktop/machine1",
1186                         "org.freedesktop.machine1.Manager",
1187                         "GetMachineByPID",
1188                         &error,
1189                         &reply,
1190                         "u",
1191                         (uint32_t) pid);
1192         if (r < 0) {
1193                 /* Note that the machine might already have been
1194                  * cleaned up automatically, hence don't consider it a
1195                  * failure if we cannot get the machine object. */
1196                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1197                 return 0;
1198         }
1199
1200         r = sd_bus_message_read(reply, "o", &path);
1201         if (r < 0)
1202                 return bus_log_parse_error(r);
1203
1204         r = sd_bus_call_method(
1205                         bus,
1206                         "org.freedesktop.machine1",
1207                         path,
1208                         "org.freedesktop.machine1.Machine",
1209                         "Terminate",
1210                         &error,
1211                         NULL,
1212                         NULL);
1213         if (r < 0) {
1214                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1215                 return 0;
1216         }
1217
1218         return 0;
1219 }
1220
1221 static int reset_audit_loginuid(void) {
1222         _cleanup_free_ char *p = NULL;
1223         int r;
1224
1225         if (arg_share_system)
1226                 return 0;
1227
1228         r = read_one_line_file("/proc/self/loginuid", &p);
1229         if (r == -EEXIST)
1230                 return 0;
1231         if (r < 0) {
1232                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1233                 return r;
1234         }
1235
1236         /* Already reset? */
1237         if (streq(p, "4294967295"))
1238                 return 0;
1239
1240         r = write_string_file("/proc/self/loginuid", "4294967295");
1241         if (r < 0) {
1242                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1243                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1244                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1245                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1246                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1247
1248                 sleep(5);
1249         }
1250
1251         return 0;
1252 }
1253
1254 static int move_network_interfaces(pid_t pid) {
1255         _cleanup_sd_rtnl_unref_ sd_rtnl *rtnl = NULL;
1256         char **i;
1257         int r;
1258
1259         if (!arg_private_network)
1260                 return 0;
1261
1262         if (strv_isempty(arg_network_interfaces))
1263                 return 0;
1264
1265         r = sd_rtnl_open(NETLINK_ROUTE, &rtnl);
1266         if (r < 0) {
1267                 log_error("Failed to connect to netlink: %s", strerror(-r));
1268                 return r;
1269         }
1270
1271         STRV_FOREACH(i, arg_network_interfaces) {
1272                 _cleanup_sd_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1273                 unsigned ifi;
1274
1275                 ifi = if_nametoindex(*i);
1276                 if (ifi == 0) {
1277                         log_error("Failed to resolve interface %s: %m", *i);
1278                         return -errno;
1279                 }
1280
1281                 r = sd_rtnl_message_link_new(RTM_NEWLINK, ifi, &m);
1282                 if (r < 0) {
1283                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1284                         return r;
1285                 }
1286
1287                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1288                 if (r < 0) {
1289                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1290                         return r;
1291                 }
1292
1293                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1294                 if (r < 0) {
1295                         log_error("Failed to move interface to namespace: %s", strerror(-r));
1296                         return r;
1297                 }
1298         }
1299
1300         return 0;
1301 }
1302
1303 int main(int argc, char *argv[]) {
1304         pid_t pid = 0;
1305         int r = EXIT_FAILURE, k;
1306         _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1307         int n_fd_passed;
1308         const char *console = NULL;
1309         sigset_t mask;
1310         _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1311         _cleanup_fdset_free_ FDSet *fds = NULL;
1312         _cleanup_free_ char *kdbus_domain = NULL;
1313
1314         log_parse_environment();
1315         log_open();
1316
1317         k = parse_argv(argc, argv);
1318         if (k < 0)
1319                 goto finish;
1320         else if (k == 0) {
1321                 r = EXIT_SUCCESS;
1322                 goto finish;
1323         }
1324
1325         if (arg_directory) {
1326                 char *p;
1327
1328                 p = path_make_absolute_cwd(arg_directory);
1329                 free(arg_directory);
1330                 arg_directory = p;
1331         } else
1332                 arg_directory = get_current_dir_name();
1333
1334         if (!arg_directory) {
1335                 log_error("Failed to determine path, please use -D.");
1336                 goto finish;
1337         }
1338
1339         path_kill_slashes(arg_directory);
1340
1341         if (!arg_machine) {
1342                 arg_machine = strdup(basename(arg_directory));
1343                 if (!arg_machine) {
1344                         log_oom();
1345                         goto finish;
1346                 }
1347
1348                 hostname_cleanup(arg_machine, false);
1349                 if (isempty(arg_machine)) {
1350                         log_error("Failed to determine machine name automatically, please use -M.");
1351                         goto finish;
1352                 }
1353         }
1354
1355         if (geteuid() != 0) {
1356                 log_error("Need to be root.");
1357                 goto finish;
1358         }
1359
1360         if (sd_booted() <= 0) {
1361                 log_error("Not running on a systemd system.");
1362                 goto finish;
1363         }
1364
1365         if (path_equal(arg_directory, "/")) {
1366                 log_error("Spawning container on root directory not supported.");
1367                 goto finish;
1368         }
1369
1370         if (arg_boot && path_is_os_tree(arg_directory) <= 0) {
1371                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1372                 goto finish;
1373         }
1374
1375         log_close();
1376         n_fd_passed = sd_listen_fds(false);
1377         if (n_fd_passed > 0) {
1378                 k = fdset_new_listen_fds(&fds, false);
1379                 if (k < 0) {
1380                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1381                         goto finish;
1382                 }
1383         }
1384         fdset_close_others(fds);
1385         log_open();
1386
1387         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1388         if (master < 0) {
1389                 log_error("Failed to acquire pseudo tty: %m");
1390                 goto finish;
1391         }
1392
1393         console = ptsname(master);
1394         if (!console) {
1395                 log_error("Failed to determine tty name: %m");
1396                 goto finish;
1397         }
1398
1399         if (!arg_quiet)
1400                 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1401
1402         if (unlockpt(master) < 0) {
1403                 log_error("Failed to unlock tty: %m");
1404                 goto finish;
1405         }
1406
1407
1408         if (access("/dev/kdbus/control", F_OK) >= 0) {
1409
1410                 if (arg_share_system) {
1411                         kdbus_domain = strdup("/dev/kdbus");
1412                         if (!kdbus_domain) {
1413                                 log_oom();
1414                                 goto finish;
1415                         }
1416                 } else {
1417                         const char *ns;
1418
1419                         ns = strappenda("machine-", arg_machine);
1420                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
1421                         if (r < 0)
1422                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1423                         else
1424                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1425                 }
1426         }
1427
1428         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1429                 log_error("Failed to create kmsg socket pair: %m");
1430                 goto finish;
1431         }
1432
1433         sd_notify(0, "READY=1");
1434
1435         assert_se(sigemptyset(&mask) == 0);
1436         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1437         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1438
1439         for (;;) {
1440                 siginfo_t status;
1441
1442                 sync_fd = eventfd(0, EFD_CLOEXEC);
1443                 if (sync_fd < 0) {
1444                         log_error("Failed to create event fd: %m");
1445                         goto finish;
1446                 }
1447
1448                 pid = syscall(__NR_clone,
1449                               SIGCHLD|CLONE_NEWNS|
1450                               (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
1451                               (arg_private_network ? CLONE_NEWNET : 0), NULL);
1452                 if (pid < 0) {
1453                         if (errno == EINVAL)
1454                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1455                         else
1456                                 log_error("clone() failed: %m");
1457
1458                         goto finish;
1459                 }
1460
1461                 if (pid == 0) {
1462                         /* child */
1463                         const char *home = NULL;
1464                         uid_t uid = (uid_t) -1;
1465                         gid_t gid = (gid_t) -1;
1466                         unsigned n_env = 2;
1467                         const char *envp[] = {
1468                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
1469                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1470                                 NULL, /* TERM */
1471                                 NULL, /* HOME */
1472                                 NULL, /* USER */
1473                                 NULL, /* LOGNAME */
1474                                 NULL, /* container_uuid */
1475                                 NULL, /* LISTEN_FDS */
1476                                 NULL, /* LISTEN_PID */
1477                                 NULL
1478                         };
1479                         char **env_use;
1480                         eventfd_t x;
1481
1482                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1483                         if (envp[n_env])
1484                                 n_env ++;
1485
1486                         close_nointr_nofail(master);
1487                         master = -1;
1488
1489                         close_nointr(STDIN_FILENO);
1490                         close_nointr(STDOUT_FILENO);
1491                         close_nointr(STDERR_FILENO);
1492
1493                         close_nointr_nofail(kmsg_socket_pair[0]);
1494                         kmsg_socket_pair[0] = -1;
1495
1496                         reset_all_signal_handlers();
1497
1498                         assert_se(sigemptyset(&mask) == 0);
1499                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1500
1501                         k = open_terminal(console, O_RDWR);
1502                         if (k != STDIN_FILENO) {
1503                                 if (k >= 0) {
1504                                         close_nointr_nofail(k);
1505                                         k = -EINVAL;
1506                                 }
1507
1508                                 log_error("Failed to open console: %s", strerror(-k));
1509                                 goto child_fail;
1510                         }
1511
1512                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1513                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1514                                 log_error("Failed to duplicate console: %m");
1515                                 goto child_fail;
1516                         }
1517
1518                         if (setsid() < 0) {
1519                                 log_error("setsid() failed: %m");
1520                                 goto child_fail;
1521                         }
1522
1523                         if (reset_audit_loginuid() < 0)
1524                                 goto child_fail;
1525
1526                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1527                                 log_error("PR_SET_PDEATHSIG failed: %m");
1528                                 goto child_fail;
1529                         }
1530
1531                         /* Mark everything as slave, so that we still
1532                          * receive mounts from the real root, but don't
1533                          * propagate mounts to the real root. */
1534                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1535                                 log_error("MS_SLAVE|MS_REC failed: %m");
1536                                 goto child_fail;
1537                         }
1538
1539                         /* Turn directory into bind mount */
1540                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1541                                 log_error("Failed to make bind mount.");
1542                                 goto child_fail;
1543                         }
1544
1545                         if (arg_read_only)
1546                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1547                                         log_error("Failed to make read-only.");
1548                                         goto child_fail;
1549                                 }
1550
1551                         if (mount_all(arg_directory) < 0)
1552                                 goto child_fail;
1553
1554                         if (copy_devnodes(arg_directory) < 0)
1555                                 goto child_fail;
1556
1557                         if (setup_ptmx(arg_directory) < 0)
1558                                 goto child_fail;
1559
1560                         dev_setup(arg_directory);
1561
1562                         if (setup_dev_console(arg_directory, console) < 0)
1563                                 goto child_fail;
1564
1565                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1566                                 goto child_fail;
1567
1568                         close_nointr_nofail(kmsg_socket_pair[1]);
1569                         kmsg_socket_pair[1] = -1;
1570
1571                         if (setup_boot_id(arg_directory) < 0)
1572                                 goto child_fail;
1573
1574                         if (setup_timezone(arg_directory) < 0)
1575                                 goto child_fail;
1576
1577                         if (setup_resolv_conf(arg_directory) < 0)
1578                                 goto child_fail;
1579
1580                         if (setup_journal(arg_directory) < 0)
1581                                 goto child_fail;
1582
1583                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1584                                 goto child_fail;
1585
1586                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1587                                 goto child_fail;
1588
1589                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
1590                                 goto child_fail;
1591
1592                         if (chdir(arg_directory) < 0) {
1593                                 log_error("chdir(%s) failed: %m", arg_directory);
1594                                 goto child_fail;
1595                         }
1596
1597                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1598                                 log_error("mount(MS_MOVE) failed: %m");
1599                                 goto child_fail;
1600                         }
1601
1602                         if (chroot(".") < 0) {
1603                                 log_error("chroot() failed: %m");
1604                                 goto child_fail;
1605                         }
1606
1607                         if (chdir("/") < 0) {
1608                                 log_error("chdir() failed: %m");
1609                                 goto child_fail;
1610                         }
1611
1612                         umask(0022);
1613
1614                         if (arg_private_network)
1615                                 loopback_setup();
1616
1617                         if (drop_capabilities() < 0) {
1618                                 log_error("drop_capabilities() failed: %m");
1619                                 goto child_fail;
1620                         }
1621
1622                         if (arg_user) {
1623
1624                                 /* Note that this resolves user names
1625                                  * inside the container, and hence
1626                                  * accesses the NSS modules from the
1627                                  * container and not the host. This is
1628                                  * a bit weird... */
1629
1630                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1631                                         log_error("get_user_creds() failed: %m");
1632                                         goto child_fail;
1633                                 }
1634
1635                                 if (mkdir_parents_label(home, 0775) < 0) {
1636                                         log_error("mkdir_parents_label() failed: %m");
1637                                         goto child_fail;
1638                                 }
1639
1640                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1641                                         log_error("mkdir_safe_label() failed: %m");
1642                                         goto child_fail;
1643                                 }
1644
1645                                 if (initgroups((const char*)arg_user, gid) < 0) {
1646                                         log_error("initgroups() failed: %m");
1647                                         goto child_fail;
1648                                 }
1649
1650                                 if (setresgid(gid, gid, gid) < 0) {
1651                                         log_error("setregid() failed: %m");
1652                                         goto child_fail;
1653                                 }
1654
1655                                 if (setresuid(uid, uid, uid) < 0) {
1656                                         log_error("setreuid() failed: %m");
1657                                         goto child_fail;
1658                                 }
1659                         } else {
1660                                 /* Reset everything fully to 0, just in case */
1661
1662                                 if (setgroups(0, NULL) < 0) {
1663                                         log_error("setgroups() failed: %m");
1664                                         goto child_fail;
1665                                 }
1666
1667                                 if (setresgid(0, 0, 0) < 0) {
1668                                         log_error("setregid() failed: %m");
1669                                         goto child_fail;
1670                                 }
1671
1672                                 if (setresuid(0, 0, 0) < 0) {
1673                                         log_error("setreuid() failed: %m");
1674                                         goto child_fail;
1675                                 }
1676                         }
1677
1678                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1679                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1680                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1681                                 log_oom();
1682                                 goto child_fail;
1683                         }
1684
1685                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1686                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1687                                         log_oom();
1688                                         goto child_fail;
1689                                 }
1690                         }
1691
1692                         if (fdset_size(fds) > 0) {
1693                                 k = fdset_cloexec(fds, false);
1694                                 if (k < 0) {
1695                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1696                                         goto child_fail;
1697                                 }
1698
1699                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1700                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1701                                         log_oom();
1702                                         goto child_fail;
1703                                 }
1704                         }
1705
1706                         setup_hostname();
1707
1708                         eventfd_read(sync_fd, &x);
1709                         close_nointr_nofail(sync_fd);
1710                         sync_fd = -1;
1711
1712                         if (!strv_isempty(arg_setenv)) {
1713                                 char **n;
1714
1715                                 n = strv_env_merge(2, envp, arg_setenv);
1716                                 if (!n) {
1717                                         log_oom();
1718                                         goto child_fail;
1719                                 }
1720
1721                                 env_use = n;
1722                         } else
1723                                 env_use = (char**) envp;
1724
1725 #ifdef HAVE_SELINUX
1726                         if (arg_selinux_context)
1727                                 if (setexeccon(arg_selinux_context) < 0)
1728                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
1729 #endif
1730                         if (arg_boot) {
1731                                 char **a;
1732                                 size_t l;
1733
1734                                 /* Automatically search for the init system */
1735
1736                                 l = 1 + argc - optind;
1737                                 a = newa(char*, l + 1);
1738                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1739
1740                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1741                                 execve(a[0], a, env_use);
1742
1743                                 a[0] = (char*) "/lib/systemd/systemd";
1744                                 execve(a[0], a, env_use);
1745
1746                                 a[0] = (char*) "/sbin/init";
1747                                 execve(a[0], a, env_use);
1748                         } else if (argc > optind)
1749                                 execvpe(argv[optind], argv + optind, env_use);
1750                         else {
1751                                 chdir(home ? home : "/root");
1752                                 execle("/bin/bash", "-bash", NULL, env_use);
1753                         }
1754
1755                         log_error("execv() failed: %m");
1756
1757                 child_fail:
1758                         _exit(EXIT_FAILURE);
1759                 }
1760
1761                 fdset_free(fds);
1762                 fds = NULL;
1763
1764                 r = register_machine(pid);
1765                 if (r < 0)
1766                         goto finish;
1767
1768                 r = move_network_interfaces(pid);
1769                 if (r < 0)
1770                         goto finish;
1771
1772                 eventfd_write(sync_fd, 1);
1773                 close_nointr_nofail(sync_fd);
1774                 sync_fd = -1;
1775
1776                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1777                 if (k < 0) {
1778                         r = EXIT_FAILURE;
1779                         break;
1780                 }
1781
1782                 if (!arg_quiet)
1783                         putc('\n', stdout);
1784
1785                 /* Kill if it is not dead yet anyway */
1786                 terminate_machine(pid);
1787
1788                 /* Redundant, but better safe than sorry */
1789                 kill(pid, SIGKILL);
1790
1791                 k = wait_for_terminate(pid, &status);
1792                 pid = 0;
1793
1794                 if (k < 0) {
1795                         r = EXIT_FAILURE;
1796                         break;
1797                 }
1798
1799                 if (status.si_code == CLD_EXITED) {
1800                         r = status.si_status;
1801                         if (status.si_status != 0) {
1802                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1803                                 break;
1804                         }
1805
1806                         if (!arg_quiet)
1807                                 log_debug("Container %s exited successfully.", arg_machine);
1808                         break;
1809                 } else if (status.si_code == CLD_KILLED &&
1810                            status.si_status == SIGINT) {
1811
1812                         if (!arg_quiet)
1813                                 log_info("Container %s has been shut down.", arg_machine);
1814                         r = 0;
1815                         break;
1816                 } else if (status.si_code == CLD_KILLED &&
1817                            status.si_status == SIGHUP) {
1818
1819                         if (!arg_quiet)
1820                                 log_info("Container %s is being rebooted.", arg_machine);
1821                         continue;
1822                 } else if (status.si_code == CLD_KILLED ||
1823                            status.si_code == CLD_DUMPED) {
1824
1825                         log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1826                         r = EXIT_FAILURE;
1827                         break;
1828                 } else {
1829                         log_error("Container %s failed due to unknown reason.", arg_machine);
1830                         r = EXIT_FAILURE;
1831                         break;
1832                 }
1833         }
1834
1835 finish:
1836         if (pid > 0)
1837                 kill(pid, SIGKILL);
1838
1839         free(arg_directory);
1840         free(arg_machine);
1841         free(arg_setenv);
1842         free(arg_network_interfaces);
1843
1844         return r;
1845 }